def forward(self, x, targets=None, img_dim=None):
    """Decode head activations that carry an (im, re) angle pair; optionally compute the loss.

    Args:
        x: Raw activations. They are viewed as
            (batch, num_anchors, num_classes + 7, grid, grid), where
            grid is ``x.size(2)``.
        targets: Ground truth handed to ``build_targets``. ``None`` skips the
            loss computation.
        img_dim: Stored on ``self.img_dim``.

    Returns:
        ``(output, loss)``. ``output`` concatenates, per anchor cell, the four
        box values multiplied by ``self.stride``, the unscaled im/re pair, the
        objectness score and the class scores. ``loss`` is ``0`` when
        ``targets`` is ``None`` or when a ``RuntimeError`` is raised while the
        loss is being computed.
    """
    # Legacy tensor constructor matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
    on_cuda = x.is_cuda

    self.img_dim = img_dim
    batch = x.size(0)
    grid_size = x.size(2)

    # Move the per-cell attribute axis to the end.
    prediction = x.view(
        batch, self.num_anchors, self.num_classes + 7, grid_size, grid_size
    )
    prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()

    centre_x = torch.sigmoid(prediction[..., 0])
    centre_y = torch.sigmoid(prediction[..., 1])
    raw_w = prediction[..., 2]
    raw_h = prediction[..., 3]
    im = prediction[..., 4]  # imaginary part of the angle
    re = prediction[..., 5]  # real part of the angle
    pred_conf = torch.sigmoid(prediction[..., 6])
    pred_cls = torch.sigmoid(prediction[..., 7:])

    # Cached offsets are only valid for the grid size they were built for.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=on_cuda)

    # Box centre/size use detached values; im/re are copied as-is.
    pred_boxes = FloatTensor(prediction[..., :6].shape)
    pred_boxes[..., 0] = centre_x.data + self.grid_x
    pred_boxes[..., 1] = centre_y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(raw_w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(raw_h.data) * self.anchor_h
    pred_boxes[..., 4] = im
    pred_boxes[..., 5] = re

    # Only the four box values are multiplied by the stride.
    pieces = (
        pred_boxes[..., :4].view(batch, -1, 4) * self.stride,
        pred_boxes[..., 4:].view(batch, -1, 2),
        pred_conf.view(batch, -1, 1),
        pred_cls.view(batch, -1, self.num_classes),
    )
    output = torch.cat(pieces, -1)

    if targets is None:
        return output, 0

    # Original author's note: build_targets can fail when its IoU tensor is
    # empty; in that case the error is printed and (output, 0) is returned.
    try:
        (
            iou_scores, class_mask, obj_mask, noobj_mask,
            tx, ty, tw, th, tim, tre, tcls, tconf,
        ) = build_targets(
            pred_boxes=pred_boxes,
            pred_cls=pred_cls,
            target=targets,
            anchors=self.scaled_anchors,
            ignore_thres=self.ignore_thres,
        )

        # Regression terms are evaluated only where obj_mask is set.
        loss_x = self.mse_loss(centre_x[obj_mask], tx[obj_mask])
        loss_y = self.mse_loss(centre_y[obj_mask], ty[obj_mask])
        loss_w = self.mse_loss(raw_w[obj_mask], tw[obj_mask])
        loss_h = self.mse_loss(raw_h[obj_mask], th[obj_mask])
        loss_im = self.mse_loss(im[obj_mask], tim[obj_mask])
        loss_re = self.mse_loss(re[obj_mask], tre[obj_mask])
        loss_angle = loss_im + loss_re

        # Objectness uses both masks, each with its own weight.
        loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
        loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
        loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
        loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
        total_loss = loss_x + loss_y + loss_w + loss_h + loss_angle + loss_conf + loss_cls

        # Metrics
        cls_acc = 100 * class_mask[obj_mask].mean()
        conf_obj = pred_conf[obj_mask].mean()
        conf_noobj = pred_conf[noobj_mask].mean()
        conf50 = (pred_conf > 0.5).float()
        iou50 = (iou_scores > 0.5).float()
        iou75 = (iou_scores > 0.75).float()
        detected_mask = conf50 * class_mask * tconf
        hits50 = torch.sum(iou50 * detected_mask)
        precision = hits50 / (conf50.sum() + 1e-16)
        recall50 = hits50 / (obj_mask.sum() + 1e-16)
        recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

        scalars = (
            ("loss", total_loss),
            ("x", loss_x),
            ("y", loss_y),
            ("w", loss_w),
            ("h", loss_h),
            ("im", loss_im),
            ("re", loss_re),
            ("conf", loss_conf),
            ("cls", loss_cls),
            ("cls_acc", cls_acc),
            ("recall50", recall50),
            ("recall75", recall75),
            ("precision", precision),
            ("conf_obj", conf_obj),
            ("conf_noobj", conf_noobj),
        )
        metrics = {key: to_cpu(value).item() for key, value in scalars}
        metrics["grid_size"] = grid_size
        self.metrics = metrics

        return output, total_loss
    except RuntimeError as err:
        print(err)
        return output, 0
def forward(self, x, targets=None):
    """Decode head activations; return losses when targets are given, else predictions.

    Args:
        x: Activations before the YOLO layer. They are viewed as
            (bs, num_anchors, bbox_attrs, g_dim, g_dim), where g_dim is
            ``x.size(2)``.
        targets: Ground truth handed to ``build_targets``. ``None`` selects
            the inference path.

    Returns:
        With targets: ``(loss, loss_x, loss_y, loss_w, loss_h, loss_conf,
        loss_cls, recall)``, where only ``loss`` is a tensor and the
        per-term values come from ``.item()``.
        Without targets: ``output.data`` with boxes multiplied by the stride,
        followed by objectness and class scores.
    """
    bs = x.size(0)
    g_dim = x.size(2)
    stride = self.img_dim / g_dim
    on_gpu = x.is_cuda

    # Legacy tensor constructors matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if on_gpu else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if on_gpu else torch.LongTensor

    prediction = x.view(bs, self.num_anchors, self.bbox_attrs, g_dim, g_dim)
    prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()

    # Offsets within a cell / raw log-size terms / scores.
    ctr_x = torch.sigmoid(prediction[..., 0])
    ctr_y = torch.sigmoid(prediction[..., 1])
    raw_w = prediction[..., 2]
    raw_h = prediction[..., 3]
    conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # Per-cell column / row indices, tiled for every (sample, anchor) pair.
    cell_idx = torch.linspace(0, g_dim - 1, g_dim).repeat(g_dim, 1)
    tiles = bs * self.num_anchors
    grid_x = cell_idx.repeat(tiles, 1, 1).view(ctr_x.shape).type(FloatTensor)
    grid_y = cell_idx.t().repeat(tiles, 1, 1).view(ctr_y.shape).type(FloatTensor)

    # Anchors expressed in grid units, broadcast to the prediction layout.
    scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
    cells = g_dim * g_dim
    anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
    anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
    anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, cells).view(raw_w.shape)
    anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, cells).view(raw_h.shape)

    # Boxes in grid units, built from detached values.
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = ctr_x.data + grid_x
    pred_boxes[..., 1] = ctr_y.data + grid_y
    pred_boxes[..., 2] = torch.exp(raw_w.data) * anchor_w
    pred_boxes[..., 3] = torch.exp(raw_h.data) * anchor_h

    if targets is None:
        # Inference: boxes are multiplied by the stride before being returned.
        output = torch.cat(
            (
                pred_boxes.view(bs, -1, 4) * stride,
                conf.view(bs, -1, 1),
                pred_cls.view(bs, -1, self.num_classes),
            ),
            -1,
        )
        return output.data

    # ---- Training ----
    if on_gpu:
        self.mse_loss = self.mse_loss.cuda()
        self.bce_loss = self.bce_loss.cuda()

    scaled_all_anchors = [
        (a_w / stride, a_h / stride) for a_w, a_h in self.all_anchors
    ]
    (nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls) = build_targets(
        pred_boxes.cpu().data,
        targets.cpu().data,
        scaled_anchors,
        scaled_all_anchors,
        self.num_anchors,
        self.num_classes,
        g_dim,
        self.ignore_thres,
        self.img_dim,
    )
    recall = float(nCorrect / nGT) if nGT else 1

    # Masks as float tensors: localisation, per-class, and negative-confidence.
    mask = Variable(mask.type(FloatTensor))
    cls_mask = Variable(
        mask.unsqueeze(-1).repeat(1, 1, 1, 1, self.num_classes).type(FloatTensor)
    )
    conf_mask = Variable(conf_mask.type(FloatTensor))

    # Original author's note: positives are far fewer than negatives, so the
    # loss is normalised per sample: 1/num_pos for loc/cls, and
    # 1/(num_pos + num_neg) for confidence. Ignored boxes add no loss.
    balanced = False
    num_positive_box = torch.sum(mask.view(bs, -1), -1).view(bs, 1, 1, 1) + 1e-16
    num_negative_box = torch.sum(conf_mask.view(bs, -1), -1).view(bs, 1, 1, 1) + 1e-16

    # Targets moved to the working tensor type, without gradients.
    tx, ty, tw, th, tconf, tcls = (
        Variable(t.type(FloatTensor), requires_grad=False)
        for t in (tx, ty, tw, th, tconf, tcls)
    )

    # Localisation losses. The elementwise indexing below only works if
    # self.mse_loss is unreduced -- TODO confirm.
    positives = mask == 1
    loss_x = torch.sum((1 / num_positive_box * self.mse_loss(ctr_x, tx))[positives]) / bs
    loss_y = torch.sum((1 / num_positive_box * self.mse_loss(ctr_y, ty))[positives]) / bs
    loss_w = torch.sum((1 / num_positive_box * self.mse_loss(raw_w, tw))[positives]) / bs
    loss_h = torch.sum((1 / num_positive_box * self.mse_loss(raw_h, th))[positives]) / bs

    loc_weight = 1  # placeholder; the original notes ``self.alpha`` here
    loss_x *= loc_weight
    loss_y *= loc_weight
    loss_w *= loc_weight
    loss_h *= loc_weight

    # Classification loss
    num_cls_each_box = (
        torch.zeros(bs, self.num_anchors, g_dim, g_dim).type(FloatTensor) + 1e-16
    )
    if balanced:
        num_ref = torch.sum(tcls.reshape(bs, -1, self.num_classes), 1)
        for bs_ind in range(bs):
            for cls_ind in range(self.num_classes):
                boxes_ = tcls[bs_ind][..., cls_ind] == 1
                num_cls_each_box[bs_ind][boxes_] = num_ref[bs_ind, cls_ind]
        num_cls_each_box = num_cls_each_box.unsqueeze(-1)
        weighted = 1 / num_cls_each_box * self.bce_loss(pred_cls, tcls)
        loss_cls = torch.sum(weighted[cls_mask == 1]) / (bs * self.num_classes)
    elif cls_mask.max().item() == 0.:
        # No positive cell at all: sum over the empty selection.
        loss_cls = torch.sum(self.bce_loss(pred_cls, tcls)[cls_mask == 1])
    else:
        loss_cls = torch.mean(self.bce_loss(pred_cls, tcls)[cls_mask == 1])

    # Confidence loss
    if balanced:
        conf_balance = [num_positive_box, num_negative_box]
    else:
        conf_balance = [
            num_positive_box + num_negative_box,
            num_positive_box + num_negative_box,
        ]
    loss_conf_all = self.bce_loss(conf, tconf)
    loss_conf_pos = torch.sum((1 / conf_balance[0] * loss_conf_all)[positives])
    loss_conf_neg = torch.sum((1 / conf_balance[1] * loss_conf_all)[conf_mask == 1])
    loss_conf = (loss_conf_pos + loss_conf_neg) / bs

    loss = loss_x + loss_y + loss_w + loss_h + loss_cls + loss_conf
    return (
        loss,
        loss_x.item(),
        loss_y.item(),
        loss_w.item(),
        loss_h.item(),
        loss_conf.item(),
        loss_cls.item(),
        recall,
    )
def forward(self, x, targets=None, img_dim=None):
    """Decode raw YOLO head activations into boxes and optionally compute the loss.

    Args:
        x: Raw activations. They are viewed as
            (batch, num_anchors, num_classes + 5, grid, grid), where
            grid is ``x.size(2)``.
        targets: Ground truth handed to ``build_targets``. ``None`` skips the
            loss computation.
        img_dim: Stored on ``self.img_dim``.

    Returns:
        ``(output, loss)``. ``output`` holds, per anchor cell, the box values
        multiplied by ``self.stride``, the objectness score and the class
        scores. ``loss`` is ``0`` when ``targets`` is ``None``. When no cell
        is marked as an object, ``loss`` is only the weighted no-object
        confidence term and ``self.metrics`` is left untouched.
    """
    # Legacy tensor constructor matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # Move the per-cell attribute axis to the end.
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs
    x = torch.sigmoid(prediction[..., 0])  # Center x
    y = torch.sigmoid(prediction[..., 1])  # Center y
    w = prediction[..., 2]  # Width
    h = prediction[..., 3]  # Height
    pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors (detached values only)
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # build_targets returns no confidence target in this variant. Derive it
    # here, BEFORE the early return below, which previously read ``tconf``
    # while it was still unbound.
    tconf = obj_mask.float()

    if not obj_mask.any():
        # No positive cell: only the no-object confidence term is defined.
        total_loss = self.noobj_scale * self.bce_loss(
            pred_conf[noobj_mask], tconf[noobj_mask]
        )
        return output, total_loss

    # Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, cls_targets=None):
    """Run the module list and return the confidence/class head output.

    Args:
        x: Network input, passed through ``self.module_list`` in order.
        cls_targets: Ground truth handed to ``build_targets``. ``None``
            disables the loss computation.

    Returns:
        The output of the ``cls_conv`` layer whose ``out`` field is 1 when
        ``cls_targets`` is ``None``; otherwise ``(loss_conf_cls, output)``.
    """
    layer_outputs = []
    # Index of the layer that feeds the cls_conv head when its "from" is non-zero.
    backbone_ind = -1

    if cls_targets is not None:
        tar_conf, tar_cls, obj_mask, no_obj_mask = build_targets(cls_targets)

    for module_def, module in zip(self.module_defs, self.module_list):
        kind = module_def["type"]

        if kind in ("resnet", "convolutional", "maxpool"):
            x = module(x)
        elif kind == "shortcut":
            # Residual sum of the previous output and the referenced layer.
            source = int(module_def["from"])
            x = layer_outputs[-1] + layer_outputs[source]
        elif kind == "cls_conv":
            source = int(module_def["from"])
            if source != 0:
                x = layer_outputs[backbone_ind]
            x = module(x)

            out = int(module_def["out"])
            if out == 1:
                # x is appended below, so this is the index it will occupy.
                conf_cls_output_ind = len(layer_outputs)

            if out and cls_targets is not None:
                # Channels last: channel 0 is confidence, the rest are classes.
                pred_conf_cls = x.permute(0, 2, 3, 1)
                pred_conf = torch.sigmoid(pred_conf_cls[:, :, :, 0])
                pred_cls = torch.sigmoid(pred_conf_cls[:, :, :, 1:])

                # Confidence loss, weighted separately for object / no-object cells.
                loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tar_conf[obj_mask])
                loss_conf_noobj = self.bce_loss(
                    pred_conf[no_obj_mask], tar_conf[no_obj_mask]
                )
                loss_conf = (
                    self.obj_scale * loss_conf_obj
                    + self.noobj_scale * loss_conf_noobj
                )

                # Class loss only where an object is present.
                loss_cls = self.bce_loss(pred_cls[obj_mask], tar_cls[obj_mask])
                loss_conf_cls = self.conf_scale * loss_conf + self.cls_scale * loss_cls

        layer_outputs.append(x)

    dqnyolo_conf_cls_outputs = layer_outputs[conf_cls_output_ind]
    if cls_targets is None:
        return dqnyolo_conf_cls_outputs
    return loss_conf_cls, dqnyolo_conf_cls_outputs
def forward(self, x, targets=None, img_dim=None):
    """Decode raw YOLO head activations into boxes and optionally compute the loss.

    Tensor sizes are logged through ``logger.info`` before and after the
    reshape and once the output is assembled.

    Args:
        x: Raw activations. They are viewed as
            (batch, num_anchors, num_classes + 5, grid, grid), where
            grid is ``x.size(2)``.
        targets: Ground truth handed to ``build_targets``. ``None`` skips the
            loss computation.
        img_dim: Stored on ``self.img_dim``.

    Returns:
        ``(output, loss)``. ``output`` holds, per anchor cell, the box values
        multiplied by ``self.stride``, the objectness score and the class
        scores. ``loss`` is ``0`` when ``targets`` is ``None``.
    """
    # Legacy tensor constructor matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    logger.info(
        f"YOLOLayer input: {x.size(0)}, {x.size(1)}, {x.size(2)}, {x.size(3)}"
    )
    # Move the per-cell attribute axis to the end.
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )
    logger.info(
        f"After resize, prediction: {prediction.size(0)}, {prediction.size(1)}, {prediction.size(2)}, {prediction.size(3)}, {prediction.size(4)}"
    )

    # Get outputs
    x = torch.sigmoid(prediction[..., 0])
    y = torch.sigmoid(prediction[..., 1])
    w = prediction[..., 2]
    h = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors (detached values only)
    pred_bboxes = FloatTensor(prediction[..., :4].shape)
    pred_bboxes[..., 0] = x.data + self.grid_x
    pred_bboxes[..., 1] = y.data + self.grid_y
    # Multiply by anchor_w / anchor_h (original note: these are the scaled anchors)
    pred_bboxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_bboxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_bboxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )
    logger.info(
        f"YOLOLayer output: {output.size(0)}, {output.size(1)}, {output.size(2)}\n"
    )

    if targets is None:
        return output, 0

    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_bboxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss : Mask outputs to ignore non-existing objects (except with conf loss).
    # Box terms use MSE on the raw (pre-offset) values.
    # The y and w terms previously called the non-existent ``self.mse.loss``.
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    # Objectness uses BCE; object and no-object cells are weighted differently.
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    # Classification BCE
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    # Total: coordinate + confidence + classification
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    # NOTE(review): the original author marked cls_acc and detected_mask as
    # not understood; their meaning depends on build_targets -- confirm there.
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Turn raw YOLO head activations into box predictions and, with targets, a loss.

    Args:
        x: Raw activations. They are viewed as
            (N, num_anchors, num_classes + 5, grid, grid), where grid is
            ``x.size(2)``. Original note: the three detection paths feed
            (N, 255, 13, 13), (N, 255, 26, 26) and (N, 255, 52, 52).
        targets: Ground truth handed to ``build_targets``. ``None`` skips the
            loss computation.
        img_dim: Stored on ``self.img_dim``.

    Returns:
        ``(output, loss)``. ``output`` holds, per anchor cell, the box values
        multiplied by ``self.stride``, the objectness score and the class
        scores. ``loss`` is ``0`` when ``targets`` is ``None``.
    """
    # Legacy tensor constructor matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
    on_cuda = x.is_cuda

    self.img_dim = img_dim
    batch = x.size(0)
    grid_size = x.size(2)

    # (N, A, C+5, G, G) -> (N, A, G, G, C+5); contiguous() returns a tensor
    # with the same data laid out contiguously in memory.
    prediction = x.view(
        batch, self.num_anchors, self.num_classes + 5, grid_size, grid_size
    )
    prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()

    centre_x = torch.sigmoid(prediction[..., 0])
    centre_y = torch.sigmoid(prediction[..., 1])
    raw_w = prediction[..., 2]
    raw_h = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # Cached offsets are only valid for the grid size they were built for.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=on_cuda)

    # Boxes are built from detached values.
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = centre_x.data + self.grid_x
    pred_boxes[..., 1] = centre_y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(raw_w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(raw_h.data) * self.anchor_h

    pieces = (
        pred_boxes.view(batch, -1, 4) * self.stride,
        pred_conf.view(batch, -1, 1),
        pred_cls.view(batch, -1, self.num_classes),
    )
    output = torch.cat(pieces, -1)

    if targets is None:
        return output, 0

    # Original author's notes on the build_targets outputs (not verifiable
    # from this function -- confirm against build_targets):
    #   iou_scores: IoU between predicted and true box where the label has an object
    #   class_mask: whether the predicted class is correct at those positions
    #   obj_mask:   the best-IoU anchor in each cell that contains an object
    #   noobj_mask: everything except the best-IoU anchors and those with IoU > 0.5
    #   tx, ty, tw, th: target centre coordinates and sizes
    #   tcls:       class labels
    #   tconf:      obj_mask.float()
    (
        iou_scores, class_mask, obj_mask, noobj_mask,
        tx, ty, tw, th, tcls, tconf,
    ) = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Box and class terms only where obj_mask is set; confidence uses both masks.
    loss_x = self.mse_loss(centre_x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(centre_y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(raw_w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(raw_h[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()  # classification accuracy
    conf_obj = pred_conf[obj_mask].mean()  # mean confidence on object cells
    conf_noobj = pred_conf[noobj_mask].mean()  # mean confidence on no-object cells
    conf50 = (pred_conf > 0.5).float()  # cells predicted with confidence > 0.5
    iou50 = (iou_scores > 0.5).float()  # IoU with the true box > 0.5
    iou75 = (iou_scores > 0.75).float()  # IoU with the true box > 0.75
    detected_mask = conf50 * class_mask * tconf  # confident, right class, real object
    hits50 = torch.sum(iou50 * detected_mask)
    precision = hits50 / (conf50.sum() + 1e-16)
    recall50 = hits50 / (obj_mask.sum() + 1e-16)  # recall at IoU 0.5
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)  # at IoU 0.75

    scalars = (
        ("loss", total_loss),
        ("x", loss_x),
        ("y", loss_y),
        ("w", loss_w),
        ("h", loss_h),
        ("conf", loss_conf),
        ("cls", loss_cls),
        ("cls_acc", cls_acc),
        ("recall50", recall50),
        ("recall75", recall75),
        ("precision", precision),
        ("conf_obj", conf_obj),
        ("conf_noobj", conf_noobj),
    )
    metrics = {key: to_cpu(value).item() for key, value in scalars}
    metrics["grid_size"] = grid_size
    self.metrics = metrics

    return output, total_loss
def forward(self, x, targets=None):
    """Decode head activations; return losses when targets are given, else predictions.

    Args:
        x: Raw activations. They are viewed as (nB, nA, bbox_attrs, nG, nG),
            where nG is ``x.size(2)``. Original note: the three scales are
            255x13x13, 255x26x26 and 255x52x52.
        targets: Ground truth handed to ``build_targets``. ``None`` selects
            the inference path.

    Returns:
        With targets: ``(loss, loss_x, loss_y, loss_w, loss_h, loss_conf,
        loss_cls, recall, precision)``, where only ``loss`` is a tensor.
        ``recall`` is 1 when there is no ground truth, and ``precision`` is
        0.0 when no prediction exceeds 0.5 confidence.
        Without targets: the prediction tensor, with boxes multiplied by the
        stride, followed by objectness and class scores.
    """
    print("YOLO layers forward>>")
    nA = self.num_anchors
    nB = x.size(0)
    nG = x.size(2)
    # Original example: 416/13 = 32, 416/26 = 16, 416/52 = 8.
    stride = self.image_dim / nG

    # Legacy tensor constructors matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

    # One bbox_attrs-long prediction per anchor per feature-map cell.
    prediction = x.view(nB, nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous()

    # Get outputs. Centre x/y go through a sigmoid so they stay in (0, 1)
    # as offsets within a cell; width and height are left raw.
    x = torch.sigmoid(prediction[..., 0])  # Center x
    y = torch.sigmoid(prediction[..., 1])  # Center y
    w = prediction[..., 2]  # Width
    h = prediction[..., 3]  # Height
    pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

    # Calculate offsets for each grid cell
    grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor)
    grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor)
    # Anchors rescaled from image units to feature-map units.
    scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors])
    anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
    anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))

    # Add offset and scale with anchors (detached values only)
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + grid_x
    pred_boxes[..., 1] = y.data + grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

    if targets is None:
        # If not in training phase return predictions
        output = torch.cat(
            (
                pred_boxes.view(nB, -1, 4) * stride,
                pred_conf.view(nB, -1, 1),
                pred_cls.view(nB, -1, self.num_classes),
            ),
            -1,
        )
        return output

    # ---- Training ----
    if x.is_cuda:
        self.mse_loss = self.mse_loss.cuda()
        self.bce_loss = self.bce_loss.cuda()
        self.ce_loss = self.ce_loss.cuda()

    nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(
        pred_boxes=pred_boxes.cpu().data,
        pred_conf=pred_conf.cpu().data,
        pred_cls=pred_cls.cpu().data,
        target=targets.cpu().data,
        anchors=scaled_anchors.cpu().data,
        num_anchors=nA,
        num_classes=self.num_classes,
        grid_size=nG,
        ignore_thres=self.ignore_thres,
        img_dim=self.image_dim,
    )

    nProposals = int((pred_conf > 0.5).sum().item())
    recall = float(nCorrect / nGT) if nGT else 1
    # Guard against ZeroDivisionError when nothing exceeds 0.5 confidence
    # (typical early in training): with no proposals, precision is 0.
    precision = float(nCorrect / nProposals) if nProposals else 0.0

    # Handle masks
    mask = Variable(mask.type(ByteTensor))
    conf_mask = Variable(conf_mask.type(ByteTensor))

    # Handle target variables
    tx = Variable(tx.type(FloatTensor), requires_grad=False)
    ty = Variable(ty.type(FloatTensor), requires_grad=False)
    tw = Variable(tw.type(FloatTensor), requires_grad=False)
    th = Variable(th.type(FloatTensor), requires_grad=False)
    tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
    tcls = Variable(tcls.type(LongTensor), requires_grad=False)

    # Get conf mask where gt and where there is no gt
    conf_mask_true = mask
    conf_mask_false = conf_mask - mask

    # Mask outputs to ignore non-existing objects
    loss_x = self.mse_loss(x[mask], tx[mask])
    loss_y = self.mse_loss(y[mask], ty[mask])
    loss_w = self.mse_loss(w[mask], tw[mask])
    loss_h = self.mse_loss(h[mask], th[mask])
    loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false]) + self.bce_loss(
        pred_conf[conf_mask_true], tconf[conf_mask_true]
    )
    loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask], torch.argmax(tcls[mask], 1))
    loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    return (
        loss,
        loss_x.item(),
        loss_y.item(),
        loss_w.item(),
        loss_h.item(),
        loss_conf.item(),
        loss_cls.item(),
        recall,
        precision,
    )
def forward(self, x, target=None, img_dim=None):
    """Decode raw YOLO head activations into boxes and optionally compute the loss.

    Args:
        x: Raw activations. They are viewed as
            (batch, num_anchors, num_classes + 5, grid, grid), where
            grid is ``x.size(2)``.
        target: Ground truth handed to ``build_targets``. ``None`` skips the
            loss computation.
        img_dim: Stored on ``self.img_dim``.

    Returns:
        ``(output, loss)``. ``output`` holds, per anchor cell, the box values
        multiplied by ``self.stride``, the objectness score and the class
        scores. ``loss`` is ``0`` when ``target`` is ``None``.
    """
    # Legacy tensor constructor matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    # Dim 2 of the input feature map is used as the side length of the grid.
    grid_size = x.size(2)

    # Move the per-cell attribute axis to the end (was misspelled ``premute``).
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs
    x = torch.sigmoid(prediction[..., 0])
    y = torch.sigmoid(prediction[..., 1])
    w = prediction[..., 2]
    h = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    # All remaining channels are class scores. The previous ``[..., 5]`` kept
    # one channel, which breaks the view to num_classes below.
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # If the grid size does not match the current one, compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors (detached values only)
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if target is None:
        return output, 0

    # Pass this method's ``target`` argument (previously the undefined name
    # ``targets``); the stray ``.`` after ``self.ignore_thres`` is removed.
    iou_score, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=target,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Box and class terms only where obj_mask is set; confidence uses both masks.
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    # Class scores are compared with the class targets (previously ``tconf``).
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_score > 0.5).float()
    iou75 = (iou_score > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    # Same epsilon as the recall terms (was 1e-15).
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a raw detection feature map and optionally compute the loss.

    Args:
        x: Raw layer input in NCHW layout. It is viewed as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth, passed unchanged to ``build_targets``.
            ``None`` selects inference mode.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)``.

    Side effects:
        In training mode ``self.metrics`` is replaced and
        ``self.noobj_scale`` is overwritten with ``100000`` (see note below).
    """
    # Pick the tensor type from the input device so the layer also runs on CPU.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # Convert predictions. Example from the original author for 80 classes,
    # 3 anchors and a 13x13 grid:
    #   n x 255 x 13 x 13 -> n x 3 x 85 x 13 x 13 -> n x 3 x 13 x 13 x 85
    #   85 = tx_ctr, ty_ctr, tw, th, objectness, 80 class scores
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Parse outputs; each is [batch_size, anchors, grid_y, grid_x] except
    # pred_cls, which has a trailing class axis.
    x = torch.sigmoid(prediction[..., 0])  # tx_ctr, range (0, 1)
    y = torch.sigmoid(prediction[..., 1])  # ty_ctr, range (0, 1)
    w = prediction[..., 2]  # tw
    h = prediction[..., 3]  # th
    pred_conf = torch.sigmoid(prediction[..., 4])  # objectness
    pred_cls = torch.sigmoid(prediction[..., 5:])  # per-class scores

    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add the cell offsets and scale with the anchors (feature-map units).
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            # Multiplying by the stride rescales the box from grid units.
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    # (tx, ty, tw, th) are the regression targets produced by build_targets.
    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Positive samples: localisation, class and objectness losses, all
    # evaluated on obj_mask entries only.
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])

    # Negative samples: objectness loss on noobj_mask entries.
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])

    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    # Entries with confidence > 0.5 that are also set in class_mask and tconf
    # (presumably "class predicted correctly on a target cell"; both masks
    # come from build_targets).
    detected_mask = conf50 * class_mask * tconf
    # precision: detections with IoU > 0.5 over all confidence > 0.5 entries.
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    # recall: detections over all obj_mask entries.
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "grid_size": grid_size,
        "loss": to_cpu(total_loss).item(),
        "loss-tx": to_cpu(loss_x).item(),
        "loss-ty": to_cpu(loss_y).item(),
        "loss-tw": to_cpu(loss_w).item(),
        "loss-th": to_cpu(loss_h).item(),
        "loss-conf": to_cpu(loss_conf).item(),
        "loss-cls": to_cpu(loss_cls).item(),
        "loss-obj": to_cpu(loss_conf_obj).item(),
        "loss-noobj x scale": to_cpu(loss_conf_noobj * self.noobj_scale).item(),
        "loss-noobj": to_cpu(loss_conf_noobj).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
    }

    # NOTE(review): this overwrites the configured no-object weight after
    # every training step, so every call after the first uses 100000. It is
    # kept to preserve existing training behaviour - confirm it is intended.
    self.noobj_scale = 100000

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a (center x, center y, diameter) detection map; optionally compute the loss.

    This variant predicts three geometry values per anchor instead of four,
    so each anchor carries ``num_classes + 4`` channels.

    Args:
        x: Raw layer input. It is viewed as
            ``(num_samples, num_anchors, num_classes + 4, grid, grid)``.
        targets: Ground truth, passed unchanged to ``build_targets``.
            ``None`` selects inference mode.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 4 + num_classes)``:
        stride-scaled ``(x, y, d)``, the confidence and the class scores.
    """
    # Tensor type for cuda support
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # (N, A * (C + 4), G, G) -> (N, A, C + 4, G, G) -> (N, A, G, G, C + 4)
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 4, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs (one channel fewer than the width/height layout).
    x = torch.sigmoid(prediction[..., 0])  # Center x
    y = torch.sigmoid(prediction[..., 1])  # Center y
    d = prediction[..., 2]  # Diameter
    pred_conf = torch.sigmoid(prediction[..., 3])  # Conf
    pred_cls = torch.sigmoid(prediction[..., 4:])  # Cls pred.

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors; the diameter uses the anchor width.
    pred_boxes = FloatTensor(prediction[..., :3].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(d.data) * self.anchor_w

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 3) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    # build_targets is expected to compute the circle-aware IoU (per the
    # original author's note). Its "tw" slot is used as the diameter target;
    # the "th" slot is returned but unused here.
    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, _th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss: mask outputs to ignore non-existing objects (except with conf. loss)
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_d = self.mse_loss(d[obj_mask], tw[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    # The diameter term is weighted by 0.5.
    total_loss = loss_x + loss_y + 0.5 * loss_d + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "d": to_cpu(loss_d).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a raw detection feature map and optionally compute the loss.

    Unlike sibling variants, this one does not populate ``self.metrics``.

    Args:
        x: Raw layer input. It is viewed as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth, passed unchanged to ``build_targets``.
            ``None`` selects inference mode.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)``.
    """
    # Tensor type for cuda support
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # (N, A * (C + 5), G, G) -> (N, A, C + 5, G, G) -> (N, A, G, G, C + 5)
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    x = torch.sigmoid(prediction[..., 0])  # Center x
    y = torch.sigmoid(prediction[..., 1])  # Center y
    w = prediction[..., 2]  # Width
    h = prediction[..., 3]  # Height
    pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    # iou_scores and class_mask are returned by build_targets but not used in
    # this variant because no metrics are computed.
    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss: mask outputs to ignore non-existing objects (except with conf. loss)
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    # Objectness loss on obj_mask entries
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    # Objectness loss on noobj_mask entries
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    # Scale and sum the two objectness terms
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    # Multi-label class loss on obj_mask entries
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a (u, v, Z, quaternion) pose map and optionally compute the loss.

    Each anchor carries ``num_classes + 8`` channels: the projected point
    ``(u, v)``, the depth ``Z``, the quaternion ``(Qw, Qx, Qy, Qz)`` and the
    confidence, followed by the class scores.

    Args:
        x: Raw layer input. It is viewed as
            ``(num_samples, num_anchors, num_classes + 8, grid, grid)``.
        targets: Ground truth, passed unchanged to ``build_targets``.
            ``None`` selects inference mode.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 8 + num_classes)``; only
        ``(u, v)`` are multiplied by ``self.stride``.
    """
    # Tensor type for cuda support
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 8, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs
    # (u, v): projected point on the image plane
    u = torch.sigmoid(prediction[..., 0])
    v = torch.sigmoid(prediction[..., 1])
    # Z in the 3D coordinates
    Z = prediction[..., 2]
    # Quaternion Qw + Qx * i + Qy * j + Qz * k
    Qw = prediction[..., 3]
    Qx = prediction[..., 4]
    Qy = prediction[..., 5]
    Qz = prediction[..., 6]
    pred_conf = torch.sigmoid(prediction[..., 7])  # Conf
    pred_cls = torch.sigmoid(prediction[..., 8:])  # Cls pred.

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add the cell offsets to (u, v); squash the quaternion components.
    pred_uvZQ = FloatTensor(prediction[..., :7].shape)
    pred_uvZQ[..., 0] = u.data + self.grid_x
    pred_uvZQ[..., 1] = v.data + self.grid_y
    pred_uvZQ[..., 2] = Z.data
    pred_uvZQ[..., 3] = torch.sigmoid(Qw.data)
    pred_uvZQ[..., 4] = torch.tanh(Qx.data)
    pred_uvZQ[..., 5] = torch.tanh(Qy.data)
    pred_uvZQ[..., 6] = torch.tanh(Qz.data)

    output = torch.cat(
        (
            pred_uvZQ[..., :2].view(num_samples, -1, 2) * self.stride,
            pred_uvZQ[..., 2:].view(num_samples, -1, 5),
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    z_scores, class_mask, obj_mask, noobj_mask, tu, tv, tZ, tQw, tQx, tQy, tQz, tcls, tconf = build_targets(
        pred_uvZQ=pred_uvZQ,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss: mask outputs to ignore non-existing objects (except with conf. loss).
    # The u, v and Z terms are weighted by 10.
    loss_u = 10 * self.mse_loss(u[obj_mask], tu[obj_mask])
    loss_v = 10 * self.mse_loss(v[obj_mask], tv[obj_mask])
    loss_Z = 10 * self.mse_loss(Z[obj_mask], tZ[obj_mask])
    # NOTE(review): the quaternion losses use the raw activations, while
    # pred_uvZQ applies sigmoid/tanh to them - confirm the targets returned
    # by build_targets are expressed in the raw space.
    loss_Qw = self.mse_loss(Qw[obj_mask], tQw[obj_mask])
    loss_Qx = self.mse_loss(Qx[obj_mask], tQx[obj_mask])
    loss_Qy = self.mse_loss(Qy[obj_mask], tQy[obj_mask])
    loss_Qz = self.mse_loss(Qz[obj_mask], tQz[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = (
        loss_u + loss_v + loss_Z
        + loss_Qw + loss_Qx + loss_Qy + loss_Qz
        + loss_conf + loss_cls
    )

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    # Here a *smaller* z_score counts as a hit (thresholds 0.5 and 0.05).
    z5 = (z_scores < 0.5).float()
    z05 = (z_scores < 0.05).float()
    detected_mask = conf50 * class_mask * tconf
    recall5 = torch.sum(z5 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall05 = torch.sum(z05 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "u": to_cpu(loss_u).item(),
        "v": to_cpu(loss_v).item(),
        "Z": to_cpu(loss_Z).item(),
        "Qw": to_cpu(loss_Qw).item(),
        "Qx": to_cpu(loss_Qx).item(),
        "Qy": to_cpu(loss_Qy).item(),
        "Qz": to_cpu(loss_Qz).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall5": to_cpu(recall5).item(),
        "recall05": to_cpu(recall05).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a raw detection feature map and optionally compute the loss.

    Args:
        x: Raw layer input. It is viewed as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth, passed unchanged to ``build_targets``.
            ``None`` selects inference mode.
        img_dim: Image size fed to the model; stored on ``self.img_dim``.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)``.
    """
    # The former debug prints were removed: one of them read targets.shape
    # unconditionally and therefore crashed inference (targets is None).

    # Tensor type for cuda support
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    # Feature-map size
    grid_size = x.size(2)

    # Example from the original author (80 classes, 3 anchors, 13x13 grid):
    # (N, 255, 13, 13) -> (N, 3, 80 + 5, 13, 13) -> (N, 3, 13, 13, 80 + 5)
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs; tensor[..., k] selects entry k of the last dimension.
    x = torch.sigmoid(prediction[..., 0])
    y = torch.sigmoid(prediction[..., 1])
    w = prediction[..., 2]
    h = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors. The exp() follows the box-decoding
    # formula of the YOLOv3 paper (per the original author's note).
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            # Boxes are in grid units here; the stride rescales them.
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    # build_targets converts the input labels into the per-cell targets and
    # masks used by the loss below.
    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Without this cast, indexing with the masks raises a warning.
    obj_mask = obj_mask.bool()
    noobj_mask = noobj_mask.bool()

    # Loss: mask outputs to ignore non-existing objects (except with conf. loss)
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a raw detection feature map and optionally compute the loss.

    Args:
        x: Raw layer input. It is viewed as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth, passed unchanged to ``build_targets``.
            ``None`` selects inference mode.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)``.
    """
    # Tensor type for cuda support
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)  # e.g. 13 for a 13x13 feature map

    # The parentheses only aid readability. permute() reorders the axes and
    # contiguous() lays the result out contiguously in memory.
    # Result: (num_samples, num_anchors, grid_size, grid_size, num_classes + 5),
    # e.g. (1, 3, 13, 13, 85) for COCO per the original author's note.
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs
    x = torch.sigmoid(prediction[..., 0])  # sigmoid(tx): squashed x coordinate
    y = torch.sigmoid(prediction[..., 1])  # sigmoid(ty): squashed y coordinate
    w = prediction[..., 2]
    h = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # If grid size does not match current we compute new offsets.
    # NOTE(review): the original author notes self.grid_size starts at 0, so
    # the first call always takes this branch - confirm against __init__.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors. x.data is
    # (num_samples, num_anchors, grid, grid); self.grid_x is presumably
    # (1, 1, grid, grid) and broadcasts - confirm in compute_grid_offsets.
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss: mask outputs to ignore non-existing objects (except with conf. loss)
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None, Half=False):
    """Decode a raw detection feature map and optionally compute the loss.

    Args:
        x: Raw layer input. It is viewed as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``.
            Example from the original author for a 416x416 image and 80
            classes: ``(batch, 255, 13, 13)`` or ``(batch, 255, 26, 26)``,
            where ``255 = 3 * (4 + 1 + 80)``: 3 boxes per cell, 4 box
            coordinates (center x, center y, width, height), 1 confidence.
        targets: Ground truth, passed unchanged to ``build_targets``.
            ``None`` selects inference mode.
        img_dim: Stored on ``self.img_dim`` and forwarded to
            ``self.compute_grid_offsets``.
        Half: Forwarded to ``self.compute_grid_offsets``.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)``.
    """
    # Select the tensor type from the input: CUDA half, CUDA float, or CPU
    # float. A single chain keeps the CPU branch from being overridden.
    if x.type() == "torch.cuda.HalfTensor":
        FloatTensor = torch.cuda.HalfTensor
    elif x.is_cuda:
        FloatTensor = torch.cuda.FloatTensor
    else:
        FloatTensor = torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # prediction: (batch, num_anchors, grid, grid, num_classes + 5);
    # permute() reorders the axes.
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs. Center x/y, confidence and class scores are squashed into
    # (0, 1) with a sigmoid; width and height stay raw because they can
    # legitimately exceed 1.
    x = torch.sigmoid(prediction[..., 0])  # Center x
    y = torch.sigmoid(prediction[..., 1])  # Center y
    w = prediction[..., 2]  # Width
    h = prediction[..., 3]  # Height
    pred_conf = torch.sigmoid(prediction[..., 4])  # Conf (box confidence)
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, img_dim, cuda=x.is_cuda, Half=Half)

    # pred_boxes holds the boxes predicted by the network.
    # Add offset and scale with anchors
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    # targets distinguishes training (given) from inference (None).
    if targets is None:
        return output, 0

    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss: mask outputs to ignore non-existing objects (except with conf. loss)
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    # Positive and negative samples carry their own weights.
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a raw detection feature map and optionally compute the loss.

    Args:
        x: Raw layer input (N, C, H, W). It is viewed as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth, passed unchanged to ``build_targets``;
            ``None`` selects inference mode. Per the original author's note
            this is an ``(n, 6)`` tensor of
            ``[image index, 0, cx, cy, dw, dh]`` rows - TODO confirm against
            the data loader.
        img_dim: Input image size; stored on ``self.img_dim``.

    Returns:
        tuple: ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)``.
    """
    # Tensor type for cuda support
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)  # number of samples in the batch
    grid_size = x.size(2)  # spatial size of this feature map

    # Reshape to [num_samples, num_anchors, grid_size, grid_size, num_classes + 5].
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs. "..." indexes the trailing (num_classes + 5) axis.
    # x, y: box-center offsets relative to the current cell.
    # w, h: log of the box size relative to the anchor (feature-map scale).
    x = torch.sigmoid(prediction[..., 0])  # Center x
    y = torch.sigmoid(prediction[..., 1])  # Center y
    w = prediction[..., 2]  # Width
    h = prediction[..., 3]  # Height
    pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

    # If grid size does not match current we compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Turn (tx, ty, tw, th) back into box coordinates:
    # add offset and scale with anchors.
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    # Per the original author's note (TODO confirm in build_targets):
    # obj_mask marks the entries whose anchor has the highest IoU with a
    # target; noobj_mask excludes entries whose IoU exceeds the threshold.
    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    obj_mask = obj_mask.bool()  # convert int8 to bool
    noobj_mask = noobj_mask.bool()  # convert int8 to bool

    # Loss: mask outputs to ignore non-existing objects (except with conf. loss).
    # It has four parts:
    #   1. tx, ty, tw, th error on obj_mask entries
    #   2. objectness error on obj_mask entries
    #   3. objectness error on noobj_mask entries
    #   4. class error on obj_mask entries

    # Part 1
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    # Part 2
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    # Part 3
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    # Combine parts 2 and 3 with their respective weights
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    # Part 4
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None):
    """Decode one YOLO detection head and, in training, compute its loss.

    The raw feature map is viewed as
    ``(nB, nA, bbox_attrs, nG, nG)`` and permuted to
    ``(nB, nA, nG, nG, bbox_attrs)`` so the last axis holds, per anchor,
    ``(tx, ty, tw, th, objectness, class scores...)``.  For example a
    2x255x13x13 input with 3 anchors and 85 attributes becomes 2x3x13x13x85.

    Args:
        x: Raw head output with ``x.size(0) == nB`` and ``x.size(2) == nG``.
        targets: Ground truth handed to ``build_targets``.  ``None`` selects
            the inference path.

    Returns:
        Training (``targets`` given): the tuple ``(loss, loss_x, loss_y,
        loss_w, loss_h, loss_conf, loss_cls, recall, precision)`` where
        ``loss`` is a tensor and the remaining entries are Python numbers.
        Inference: a tensor of shape ``(nB, nA * nG * nG, 5 + num_classes)``
        holding boxes scaled by the stride, objectness and class scores.
    """
    nA = self.num_anchors  # anchors per grid cell
    nB = x.size(0)  # images in the batch
    nG = x.size(2)  # grid cells per side (width and height are assumed equal)
    # Network stride: ratio between the input image size and the feature map size.
    stride = self.image_dim / nG

    # Tensor constructors that match the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

    # contiguous() gives a dense copy after permute so later view() calls are legal.
    prediction = x.view(nB, nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous()

    # Raw outputs; each of x, y, w, h, pred_conf has shape (nB, nA, nG, nG).
    x = torch.sigmoid(prediction[..., 0])  # sigmoid(tx): centre x offset inside the cell
    y = torch.sigmoid(prediction[..., 1])  # sigmoid(ty): centre y offset inside the cell
    w = prediction[..., 2]  # tw
    h = prediction[..., 3]  # th
    pred_conf = torch.sigmoid(prediction[..., 4])  # objectness score
    pred_cls = torch.sigmoid(prediction[..., 5:])  # per-class scores

    # Cell coordinates: grid_x[0, 0, i, j] == j and grid_y[0, 0, i, j] == i.
    grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor)
    grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor)

    # Anchors expressed in feature-map units (divided by the stride), shape (nA, 2).
    scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors])
    anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
    anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))

    # Decode boxes in grid units:
    #   bx = sigmoid(tx) + cx,  by = sigmoid(ty) + cy,
    #   bw = pw * exp(tw),      bh = ph * exp(th).
    # .data detaches the values so the decoded boxes carry no gradient.
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + grid_x
    pred_boxes[..., 1] = y.data + grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

    if targets is not None:
        # Training.
        if x.is_cuda:
            self.mse_loss = self.mse_loss.cuda()
            self.bce_loss = self.bce_loss.cuda()
            self.ce_loss = self.ce_loss.cuda()

        nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(
            pred_boxes=pred_boxes.cpu().data,
            pred_conf=pred_conf.cpu().data,
            pred_cls=pred_cls.cpu().data,
            target=targets.cpu().data,
            anchors=scaled_anchors.cpu().data,
            num_anchors=nA,
            num_classes=self.num_classes,
            grid_size=nG,
            ignore_thres=self.ignore_thres,
            img_dim=self.image_dim,
        )

        # Number of predicted boxes whose objectness exceeds 0.5.
        nProposals = int((pred_conf > 0.5).sum().item())
        recall = float(nCorrect / nGT) if nGT else 1
        # Guard the division: early in training there may be no proposal at all.
        precision = float(nCorrect / nProposals) if nProposals else 0.0

        # Masks, moved to the working device.
        mask = Variable(mask.type(ByteTensor))
        conf_mask = Variable(conf_mask.type(ByteTensor))

        # Targets, moved to the working device; they never need gradients.
        tx = Variable(tx.type(FloatTensor), requires_grad=False)
        ty = Variable(ty.type(FloatTensor), requires_grad=False)
        tw = Variable(tw.type(FloatTensor), requires_grad=False)
        th = Variable(th.type(FloatTensor), requires_grad=False)
        tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
        tcls = Variable(tcls.type(LongTensor), requires_grad=False)

        # conf_mask_true: anchors responsible for a ground-truth object.
        # conf_mask_false: remaining anchors that still contribute to the
        # confidence loss (conf_mask minus the responsible ones).
        conf_mask_true = mask
        conf_mask_false = conf_mask - mask

        # Mask indexing keeps only the entries of the responsible anchors, so the
        # coordinate losses compare predicted and target offsets for those anchors.
        loss_x = self.mse_loss(x[mask], tx[mask])
        loss_y = self.mse_loss(y[mask], ty[mask])
        loss_w = self.mse_loss(w[mask], tw[mask])
        loss_h = self.mse_loss(h[mask], th[mask])

        # Objectness loss over both the non-responsible and the responsible anchors.
        loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false]) + self.bce_loss(
            pred_conf[conf_mask_true], tconf[conf_mask_true]
        )

        # pred_cls[mask] keeps the trailing class axis (one row per responsible
        # anchor); argmax over that axis of tcls[mask] gives the class index
        # expected by the cross-entropy criterion.
        loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask], torch.argmax(tcls[mask], 1))

        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

        return (
            loss,
            loss_x.item(),
            loss_y.item(),
            loss_w.item(),
            loss_h.item(),
            loss_conf.item(),
            loss_cls.item(),
            recall,
            precision,
        )
    else:
        # Inference: boxes are scaled back to input-image units via the stride.
        output = torch.cat(
            (
                pred_boxes.view(nB, -1, 4) * stride,
                pred_conf.view(nB, -1, 1),
                pred_cls.view(nB, -1, self.num_classes),
            ),
            -1,
        )
        return output
def forward(self, x, targets=None, img_dim=None):
    """Decode a YOLO head output and, when targets are given, compute the loss.

    Args:
        x: Raw head output, viewed here as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth passed through to ``build_targets``; ``None``
            selects inference.
        img_dim: Stored on ``self.img_dim``; not otherwise read in this method.

    Returns:
        ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``.  ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)`` with the
        box coordinates multiplied by ``self.stride``.

    Side effects:
        Sets ``self.img_dim``; calls ``self.compute_grid_offsets`` when the
        grid size changed; in training, replaces ``self.metrics``.
    """
    # Tensor constructor matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Raw outputs.
    x = torch.sigmoid(prediction[..., 0])  # centre x
    y = torch.sigmoid(prediction[..., 1])  # centre y
    w = prediction[..., 2]  # width
    h = prediction[..., 3]  # height
    pred_conf = torch.sigmoid(prediction[..., 4])  # objectness confidence
    pred_cls = torch.sigmoid(prediction[..., 5:])  # class predictions

    # Recompute the cached offsets when the grid size differs from the cached one.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add the cell offsets and scale by the anchors (values detached via .data).
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss: mask the outputs so non-existing objects are ignored
    # (the confidence loss also uses the no-object mask).
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics.
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    # The 1e-16 terms keep the ratios finite when a denominator is zero.
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None):
    """Decode a YOLO head without class scores and optionally compute its loss.

    This variant uses only the box and objectness channels of the prediction;
    the channels from index 5 on are never read, and no class loss or class
    output is produced.

    Args:
        x: Raw head output, viewed as ``(nB, nA, bbox_attrs, nG, nG)``.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.

    Returns:
        Training: ``(loss, loss_x, loss_y, loss_w, loss_h, loss_conf)`` where
        ``loss`` is a tensor and the others are Python floats.
        Inference: a tensor of shape ``(nB, nA * nG * nG, 5)`` containing the
        stride-scaled boxes followed by the objectness score.
    """
    nA = self.num_anchors
    nB = x.size(0)
    nG = x.size(2)
    stride = self.image_dim / nG

    # Tensor constructors matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
    ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

    prediction = x.view(nB, nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous()

    # Raw outputs.
    x = torch.sigmoid(prediction[..., 0])  # centre x
    y = torch.sigmoid(prediction[..., 1])  # centre y
    w = prediction[..., 2]  # width
    h = prediction[..., 3]  # height
    pred_conf = torch.sigmoid(prediction[..., 4])  # objectness confidence

    # Per-cell offsets and anchors expressed in grid units.
    grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor)
    grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor)
    scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors])
    anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
    anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))

    # Add the offsets and scale by the anchors (values detached via .data).
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + grid_x
    pred_boxes[..., 1] = y.data + grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

    if targets is not None:
        # Training.
        if x.is_cuda:
            self.mse_loss = self.mse_loss.cuda()
            self.bce_loss = self.bce_loss.cuda()
            # ce_loss is not used below but is still moved, as before.
            self.ce_loss = self.ce_loss.cuda()

        nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf = build_targets(
            pred_boxes=pred_boxes.cpu().data,
            pred_conf=pred_conf.cpu().data,
            target=targets.cpu().data,
            anchors=scaled_anchors.cpu().data,
            num_anchors=nA,
            num_classes=self.num_classes,
            grid_size=nG,
            ignore_thres=self.ignore_thres,
            img_dim=self.image_dim,
        )

        # Masks, moved to the working device.
        mask = Variable(mask.type(ByteTensor))
        conf_mask = Variable(conf_mask.type(ByteTensor))

        # Targets, moved to the working device; they never need gradients.
        tx = Variable(tx.type(FloatTensor), requires_grad=False)
        ty = Variable(ty.type(FloatTensor), requires_grad=False)
        tw = Variable(tw.type(FloatTensor), requires_grad=False)
        th = Variable(th.type(FloatTensor), requires_grad=False)
        tconf = Variable(tconf.type(FloatTensor), requires_grad=False)

        # Confidence masks for anchors with and without a ground-truth match.
        conf_mask_true = mask
        conf_mask_false = conf_mask - mask

        # Mask the outputs so non-existing objects are ignored.
        loss_x = self.mse_loss(x[mask], tx[mask])
        loss_y = self.mse_loss(y[mask], ty[mask])
        loss_w = self.mse_loss(w[mask], tw[mask])
        loss_h = self.mse_loss(h[mask], th[mask])
        loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false]) + self.bce_loss(
            pred_conf[conf_mask_true], tconf[conf_mask_true]
        )
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf

        return (
            loss,
            loss_x.item(),
            loss_y.item(),
            loss_w.item(),
            loss_h.item(),
            loss_conf.item(),
        )
    else:
        # Inference: return the decoded predictions.
        output = torch.cat(
            (
                pred_boxes.view(nB, -1, 4) * stride,
                pred_conf.view(nB, -1, 1),
            ),
            -1,
        )
        return output
def forward(self, x, targets=None, img_dim=None):
    """Decode a YOLO head output and, when targets are given, compute the loss.

    Two stages are visible here:

    * Decode: the raw network outputs are turned into boxes in grid units
      (``sigmoid`` offsets plus cell coordinates, ``exp`` sizes times anchors)
      and then multiplied by ``self.stride`` for the returned ``output``.
    * Loss: the regression losses compare the raw outputs (``x, y, w, h``)
      with the encoded targets (``tx, ty, tw, th``) returned by
      ``build_targets``; the decoded boxes are only passed to
      ``build_targets`` and are not themselves part of a loss term.

    Args:
        x: Raw head output, viewed as
            ``(num_samples, num_anchors, num_classes + 5, grid, grid)``;
            e.g. ``b x 255 x 13 x 13`` for 3 anchors and 80 classes.
        targets: Ground truth passed through to ``build_targets``; ``None``
            selects inference.
        img_dim: Stored on ``self.img_dim``; not otherwise read in this method.

    Returns:
        ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``.  ``output`` has shape
        ``(num_samples, num_anchors * grid * grid, 5 + num_classes)``.

    Side effects:
        Sets ``self.img_dim``; calls ``self.compute_grid_offsets`` when the
        grid size changed; in training, replaces ``self.metrics``.
    """
    # Tensor constructor matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

    self.img_dim = img_dim
    num_samples = x.size(0)  # batch size
    grid_size = x.size(2)  # feature map size, e.g. 13, 26 or 52

    prediction = (
        # (b, A, 5 + C, g, g) -> (b, A, g, g, 5 + C)
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Raw outputs produced directly by the network; x, y, w, h and pred_conf
    # have shape (b, A, g, g), pred_cls has shape (b, A, g, g, C).
    x = torch.sigmoid(prediction[..., 0])  # centre x, squashed into (0, 1)
    y = torch.sigmoid(prediction[..., 1])  # centre y, squashed into (0, 1)
    w = prediction[..., 2]  # width (log-space, exponentiated when decoding)
    h = prediction[..., 3]  # height (log-space, exponentiated when decoding)
    pred_conf = torch.sigmoid(prediction[..., 4])  # objectness confidence
    pred_cls = torch.sigmoid(prediction[..., 5:])  # class predictions

    # When the grid size differs from the cached one, recompute the cached
    # grid_x / grid_y / anchor_w / anchor_h used below.
    # NOTE(review): presumably grid_x/grid_y hold the per-cell top-left
    # coordinates with shape (1, 1, g, g) and anchor_w/anchor_h hold the
    # anchors in grid units with shape (1, A, 1, 1) -- confirm against
    # compute_grid_offsets.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Decode: add the cell offsets and scale by the anchors.  The values are
    # detached (.data), so the decoded boxes carry no gradient; they serve as
    # the inference output and as an input to build_targets.
    pred_boxes = FloatTensor(prediction[..., :4].shape)  # (b, A, g, g, 4)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            # Multiplying by the stride maps boxes from grid units back to
            # input-image units (e.g. stride 32 for a 13x13 grid on 416 px).
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    # Encode: build_targets yields the masks and regression targets.
    # NOTE(review): per the original annotations, obj_mask marks the positive
    # (b, anchor, i, j) locations, noobj_mask marks definite negatives (anchors
    # above ignore_thres are excluded from both), and tcls carries an extra
    # trailing class axis -- confirm against build_targets.
    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Regression loss: raw outputs against encoded targets, objects only.
    loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(h[obj_mask], th[obj_mask])

    # Confidence loss: a binary object / no-object classification whose two
    # parts are balanced by obj_scale and noobj_scale.
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj

    # Class loss, objects only.
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])

    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics.
    cls_acc = 100 * class_mask[obj_mask].mean()  # class accuracy on objects
    conf_obj = pred_conf[obj_mask].mean()  # mean confidence where objects are
    conf_noobj = pred_conf[noobj_mask].mean()  # mean confidence on negatives
    conf50 = (pred_conf > 0.5).float()  # confidence above 0.5
    iou50 = (iou_scores > 0.5).float()  # IoU above 0.5
    iou75 = (iou_scores > 0.75).float()  # IoU above 0.75
    # Confident, correctly classified and located on a target.
    detected_mask = conf50 * class_mask * tconf
    # The 1e-16 terms keep the ratios finite when a denominator is zero.
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None):
    """Decode a YOLO head output and optionally compute a masked loss.

    Unlike variants that index with the masks, this one multiplies both the
    predictions and the targets by the masks before each criterion, so every
    anchor contributes to the reduction (masked-out entries compare 0 to 0).

    Args:
        x: Raw head output, viewed as
            ``(bs, num_anchors, bbox_attrs, g_dim, g_dim)``.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.

    Returns:
        Training: ``(loss, loss_x, loss_y, loss_w, loss_h, loss_conf,
        loss_cls, recall)`` where ``loss`` is a tensor and the others are
        Python numbers.
        Inference: the detached tensor of shape
        ``(bs, num_anchors * g_dim * g_dim, 5 + num_classes)``.
    """
    bs = x.size(0)
    g_dim = x.size(2)
    stride = self.img_dim / g_dim

    # Tensor constructors matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor

    prediction = x.view(bs, self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous()

    # Raw outputs.
    x = torch.sigmoid(prediction[..., 0])  # centre x
    y = torch.sigmoid(prediction[..., 1])  # centre y
    w = prediction[..., 2]  # width
    h = prediction[..., 3]  # height
    conf = torch.sigmoid(prediction[..., 4])  # objectness confidence
    pred_cls = torch.sigmoid(prediction[..., 5:])  # class predictions

    # Per-cell offsets, materialised at the full (bs, num_anchors, g_dim, g_dim) shape.
    grid_x = torch.linspace(0, g_dim - 1, g_dim).repeat(g_dim, 1).repeat(
        bs * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
    grid_y = torch.linspace(0, g_dim - 1, g_dim).repeat(g_dim, 1).t().repeat(
        bs * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)

    # Anchors in grid units, then tiled to the same full shape.
    scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
    anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
    anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
    anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim * g_dim).view(w.shape)
    anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim * g_dim).view(h.shape)

    # Add the offsets and scale by the anchors (values detached via .data).
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + grid_x
    pred_boxes[..., 1] = y.data + grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

    if targets is not None:
        # Training.
        if x.is_cuda:
            self.mse_loss = self.mse_loss.cuda()
            self.bce_loss = self.bce_loss.cuda()

        nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(
            pred_boxes.cpu().data,
            targets.cpu().data,
            scaled_anchors,
            self.num_anchors,
            self.num_classes,
            g_dim,
            self.ignore_thres,
            self.img_dim,
        )

        recall = float(nCorrect / nGT) if nGT else 1

        # Float masks; cls_mask repeats the object mask along the class axis.
        mask = Variable(mask.type(FloatTensor))
        cls_mask = Variable(
            mask.unsqueeze(-1).repeat(1, 1, 1, 1, self.num_classes).type(FloatTensor))
        conf_mask = Variable(conf_mask.type(FloatTensor))

        # Targets, moved to the working device; they never need gradients.
        tx = Variable(tx.type(FloatTensor), requires_grad=False)
        ty = Variable(ty.type(FloatTensor), requires_grad=False)
        tw = Variable(tw.type(FloatTensor), requires_grad=False)
        th = Variable(th.type(FloatTensor), requires_grad=False)
        tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
        tcls = Variable(tcls.type(FloatTensor), requires_grad=False)

        # Multiply by the masks so non-existing objects contribute zero error.
        loss_x = self.lambda_coord * self.bce_loss(x * mask, tx * mask)
        loss_y = self.lambda_coord * self.bce_loss(y * mask, ty * mask)
        loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2
        loss_h = self.lambda_coord * self.mse_loss(h * mask, th * mask) / 2
        loss_conf = self.bce_loss(conf * conf_mask, tconf * conf_mask)
        loss_cls = self.bce_loss(pred_cls * cls_mask, tcls * cls_mask)
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

        return (
            loss,
            loss_x.item(),
            loss_y.item(),
            loss_w.item(),
            loss_h.item(),
            loss_conf.item(),
            loss_cls.item(),
            recall,
        )
    else:
        # Inference: return the decoded predictions, detached from the graph.
        output = torch.cat(
            (
                pred_boxes.view(bs, -1, 4) * stride,
                conf.view(bs, -1, 1),
                pred_cls.view(bs, -1, self.num_classes),
            ),
            -1,
        )
        return output.data
def forward(self, p, targets=None, batch_report=False, var=None):
    """Decode a YOLO head output or, with targets, compute loss and statistics.

    Args:
        p: Raw head output, viewed as ``(bs, nA, bbox_attrs, nG, nG)``.
        targets: Sequence of per-image targets handed to
            ``utils.build_targets``; ``None`` selects inference.
        batch_report: When true, corner-format boxes are filled in before
            target building and extra false positives from unassigned anchors
            are counted.
        var: Unused; kept for interface compatibility.

    Returns:
        Training: ``(loss, loss.item(), lx, ly, lw, lh, lconf, lcls, nT, TP,
        FP, FPe, FN, TC)`` where ``loss`` is a tensor, the next seven entries
        are Python floats, ``nT`` is the total number of targets, ``FPe`` is a
        per-class tensor and the remaining values come from
        ``utils.build_targets``.
        Inference: the detached tensor of shape
        ``(bs, nA * nG * nG, 5 + nC)`` with stride-scaled centre/size boxes,
        sigmoid objectness and the raw (non-sigmoid) class outputs.
    """
    FT = torch.cuda.FloatTensor if p.is_cuda else torch.FloatTensor

    bs = p.shape[0]  # batch size
    nG = p.shape[2]  # number of grid points
    stride = self.img_dim / nG

    # Lazily move the cached buffers to the GPU the first time a CUDA input arrives.
    if p.is_cuda and not self.grid_x.is_cuda:
        self.grid_x, self.grid_y = self.grid_x.cuda(), self.grid_y.cuda()
        self.anchor_w, self.anchor_h = self.anchor_w.cuda(), self.anchor_h.cuda()
        self.weights, self.loss_means = self.weights.cuda(), self.loss_means.cuda()

    # (bs, nA * bbox_attrs, nG, nG) -> (bs, nA, nG, nG, bbox_attrs)
    p = p.view(bs, self.nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous()

    # Raw outputs.
    x = torch.sigmoid(p[..., 0])  # centre x
    y = torch.sigmoid(p[..., 1])  # centre y
    w = p[..., 2]  # width
    h = p[..., 3]  # height
    # Width and height decoded as exp(t) * anchor, on detached values.
    width = torch.exp(w.data) * self.anchor_w
    height = torch.exp(h.data) * self.anchor_h

    # Boxes in grid units; filled in below depending on the mode.
    pred_boxes = FT(bs, self.nA, nG, nG, 4)
    pred_conf = p[..., 4]  # objectness logits (sigmoid applied where needed)
    pred_cls = p[..., 5:]  # class logits

    if targets is not None:
        # Training.
        MSELoss = nn.MSELoss()
        BCEWithLogitsLoss = nn.BCEWithLogitsLoss()
        CrossEntropyLoss = nn.CrossEntropyLoss()

        if batch_report:
            # Corner format (x1, y1, x2, y2); only filled in for batch reports.
            gx = self.grid_x[:, :, :nG, :nG]
            gy = self.grid_y[:, :, :nG, :nG]
            pred_boxes[..., 0] = x.data + gx - width / 2
            pred_boxes[..., 1] = y.data + gy - height / 2
            pred_boxes[..., 2] = x.data + gx + width / 2
            pred_boxes[..., 3] = y.data + gy + height / 2

        tx, ty, tw, th, mask, tcls, TP, FP, FN, TC = \
            utils.build_targets(pred_boxes, pred_conf, pred_cls, targets, self.scaled_anchors,
                                self.nA, self.nC, nG, batch_report)
        tcls = tcls[mask]
        if x.is_cuda:
            tx, ty, tw, th, mask, tcls = tx.cuda(), ty.cuda(), tw.cuda(), th.cuda(), mask.cuda(), tcls.cuda()

        # Loss weighting factor: assigned anchors per image.
        nT = sum(len(t) for t in targets)  # number of targets
        nM = mask.sum().float()  # number of anchors assigned to targets
        nB = len(targets)  # batch size
        k = nM / nB
        if nM > 0:
            lx = k * MSELoss(x[mask], tx[mask])
            ly = k * MSELoss(y[mask], ty[mask])
            lw = k * MSELoss(w[mask], tw[mask])
            lh = k * MSELoss(h[mask], th[mask])
            lcls = (k / 4) * CrossEntropyLoss(pred_cls[mask], torch.argmax(tcls, 1))
        else:
            lx, ly, lw, lh, lcls = FT([0]), FT([0]), FT([0]), FT([0]), FT([0])

        # Objectness loss over every anchor, with the assignment mask as target.
        lconf = (k * 64) * BCEWithLogitsLoss(pred_conf, mask.float())

        # Sum the loss components; the balanced variant is switched off here.
        balance_losses_flag = False
        if balance_losses_flag:
            k = 1 / self.loss_means.clone()
            loss = (lx * k[0] + ly * k[1] + lw * k[2] + lh * k[3] + lconf * k[4] + lcls * k[5]) / k.mean()

            self.loss_means = self.loss_means * 0.99 + \
                FT([lx.data, ly.data, lw.data, lh.data, lconf.data, lcls.data]) * 0.01
        else:
            loss = lx + ly + lw + lh + lconf + lcls

        # Count false positives coming from unassigned anchors, per class.
        FPe = torch.zeros(self.nC)
        if batch_report:
            i = torch.sigmoid(pred_conf[~mask]) > 0.5
            if i.sum() > 0:
                FP_classes = torch.argmax(pred_cls[~mask][i], 1)
                FPe = torch.bincount(FP_classes, minlength=self.nC).float().cpu()  # extra FPs

        return loss, loss.item(), lx.item(), ly.item(), lw.item(), lh.item(), lconf.item(), lcls.item(), \
            nT, TP, FP, FPe, FN, TC
    else:
        # Inference: centre/size format in grid units, scaled by the stride below.
        pred_boxes[..., 0] = x.data + self.grid_x
        pred_boxes[..., 1] = y.data + self.grid_y
        pred_boxes[..., 2] = width
        pred_boxes[..., 3] = height

        output = torch.cat(
            (
                pred_boxes.view(bs, -1, 4) * stride,
                torch.sigmoid(pred_conf.view(bs, -1, 1)),
                pred_cls.view(bs, -1, self.nC),
            ),
            -1,
        )
        return output.data
def yolo_loss(x, y, w, h, xdir, ydir, pred_boxes, pred_conf, pred_cls, targets, scaled_anchors,
              ignore_thres, clf_criterion, reg_criterion, obj_scale, noobj_scale, regr_weights,
              grid_size1):
    """Compute the YOLO loss with rotation terms and a metrics dictionary.

    Args:
        x, y, w, h: Raw box predictions, indexed with the object mask.
        xdir, ydir: Raw rotation-direction predictions, indexed the same way.
        pred_boxes: Decoded boxes, forwarded to ``build_targets``.
        pred_conf: Objectness predictions (tensor).
        pred_cls: Class predictions, or ``None`` to skip the class loss.
        targets: Ground truth forwarded to ``build_targets``.
        scaled_anchors: Anchors forwarded to ``build_targets``.
        ignore_thres: Threshold forwarded to ``build_targets``.
        clf_criterion: Criterion applied to the class predictions.
        reg_criterion: Criterion applied to every regression term.
        obj_scale, noobj_scale: Passed to ``focal_loss`` as the weight pair
            ``(noobj_scale, obj_scale)``.
        regr_weights: Multiplier applied to the summed regression terms.
        grid_size1: Reported unchanged under the ``"grid_size"`` metric.

    Returns:
        ``(total_loss, metrics)`` where ``metrics`` maps names to Python numbers.
    """
    (iou_scores, class_mask, obj_mask, noobj_mask,
     tx, ty, tw, th, txdir, tydir, tcls, tconf) = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=scaled_anchors,
        ignore_thres=ignore_thres,
    )

    # Placeholder scalars must live on the same device as the predictions;
    # picking "cuda whenever it is available" breaks CPU runs on a CUDA
    # machine and ignores which GPU the tensors actually sit on.
    device = pred_conf.device

    # Box regression: mask the outputs so non-existing objects are ignored.
    loss_x = reg_criterion(x[obj_mask], tx[obj_mask])
    loss_y = reg_criterion(y[obj_mask], ty[obj_mask])
    loss_w = reg_criterion(w[obj_mask], tw[obj_mask])
    loss_h = reg_criterion(h[obj_mask], th[obj_mask])

    # Rotation regression.
    loss_xdir = reg_criterion(xdir[obj_mask], txdir[obj_mask])
    loss_ydir = reg_criterion(ydir[obj_mask], tydir[obj_mask])

    # Confidence loss over all anchors.
    weights = (noobj_scale, obj_scale)
    loss_conf = focal_loss(pred_conf, tconf, weights)

    if pred_cls is not None:
        loss_cls = clf_criterion(pred_cls[obj_mask], tcls[obj_mask])
    else:
        loss_cls = torch.tensor(0, device=device)

    total_loss = regr_weights * (loss_x + loss_y + loss_w + loss_h + loss_xdir + loss_ydir) + \
        loss_conf + loss_cls

    # Metrics.  Class accuracy is reported as 0 whenever the class loss is 0
    # (which includes the pred_cls-is-None case).
    if loss_cls == 0:
        cls_acc = torch.tensor(0, device=device)
    else:
        cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    # The 1e-16 terms keep the ratios finite when a denominator is zero.
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size1,
        'rotation': to_cpu(loss_xdir + loss_ydir).item()
    }

    return total_loss, metrics
def forward(self, x, targets=None):
    """Decode one YOLO detection head and, in training, compute its loss.

    Args:
        x: Raw head output, viewed as
            ``(batch_size, num_anchors, bbox_attrs, num_Grids, num_Grids)``.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.

    Returns:
        Training: ``(loss, loss_x, loss_y, loss_w, loss_h, loss_conf,
        loss_cls, recall, precision)`` where ``loss`` is a tensor and the
        remaining entries are Python numbers.
        Inference: a tensor of shape
        ``(batch_size, num_anchors * num_Grids * num_Grids, 5 + num_classes)``
        with stride-scaled boxes, objectness and class scores.
    """
    batch_size = x.size(0)
    num_Grids = x.size(2)
    stride = self.image_dim / num_Grids

    # Tensor constructors matching the device of the input.
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda \
        else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

    # (batch, anchors * attrs, grid, grid) -> (batch, anchors, grid, grid, attrs)
    prediction = x.view(batch_size, self.num_anchors, self.bbox_attrs,
                        num_Grids, num_Grids).permute(0, 1, 3, 4, 2).contiguous()

    # Individual outputs.
    pred_x = torch.sigmoid(prediction[..., 0])  # centre x
    pred_y = torch.sigmoid(prediction[..., 1])  # centre y
    pred_w = prediction[..., 2]  # width
    pred_h = prediction[..., 3]  # height
    pred_conf = torch.sigmoid(prediction[..., 4])  # objectness confidence
    pred_cls = torch.sigmoid(prediction[..., 5:])  # class predictions

    # Per-cell offsets and anchors expressed in grid units.
    grid_x = torch.arange(num_Grids).repeat(num_Grids, 1).view(
        [1, 1, num_Grids, num_Grids]).type(FloatTensor)
    grid_y = torch.arange(num_Grids).repeat(num_Grids, 1).t().view(
        [1, 1, num_Grids, num_Grids]).type(FloatTensor)
    scaled_anchors = FloatTensor([(a_w / stride, a_h / stride)
                                  for a_w, a_h in self.anchors])
    anchor_w = scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
    anchor_h = scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))

    # Add the offsets and scale by the anchors (values detached via .data).
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = grid_x + pred_x.data
    pred_boxes[..., 1] = grid_y + pred_y.data
    pred_boxes[..., 2] = anchor_w * torch.exp(pred_w.data)
    pred_boxes[..., 3] = anchor_h * torch.exp(pred_h.data)

    if targets is not None:
        # Training.
        if x.is_cuda:
            self.mse_loss = self.mse_loss.cuda()
            self.bce_loss = self.bce_loss.cuda()
            self.ce_loss = self.ce_loss.cuda()

        nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = \
            build_targets(
                pred_boxes=pred_boxes.cpu().data,
                pred_conf=pred_conf.cpu().data,
                pred_cls=pred_cls.cpu().data,
                target=targets.cpu().data,
                anchors=scaled_anchors.cpu().data,
                num_anchors=self.num_anchors,
                num_classes=self.num_classes,
                grid_size=num_Grids,
                ignore_thres=self.ignore_thres,
                img_dim=self.image_dim,
            )

        nProposals = int((pred_conf > 0.5).sum().item())
        recall = float(nCorrect / nGT) if nGT else 1
        # Guard the division: there may be no proposal above 0.5 at all.
        precision = float(nCorrect / nProposals) if nProposals else 0.0

        # Masks, moved to the working device.
        mask = Variable(mask.type(ByteTensor))
        conf_mask = Variable(conf_mask.type(ByteTensor))

        # Targets, moved to the working device; they never need gradients.
        tx = Variable(tx.type(FloatTensor), requires_grad=False)
        ty = Variable(ty.type(FloatTensor), requires_grad=False)
        tw = Variable(tw.type(FloatTensor), requires_grad=False)
        th = Variable(th.type(FloatTensor), requires_grad=False)
        tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
        tcls = Variable(tcls.type(LongTensor), requires_grad=False)

        # Confidence masks for anchors with and without a ground-truth match.
        conf_mask_true = mask
        conf_mask_false = conf_mask - mask

        # Mask the outputs so non-existing objects are ignored.
        loss_x = self.mse_loss(pred_x[mask], tx[mask])
        loss_y = self.mse_loss(pred_y[mask], ty[mask])
        loss_w = self.mse_loss(pred_w[mask], tw[mask])
        loss_h = self.mse_loss(pred_h[mask], th[mask])
        loss_conf = self.bce_loss(pred_conf[conf_mask_false],
                                  tconf[conf_mask_false]) + self.bce_loss(
            pred_conf[conf_mask_true], tconf[conf_mask_true])
        loss_cls = (1 / batch_size) * self.ce_loss(
            pred_cls[mask], torch.argmax(tcls[mask], 1))
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

        return (
            loss,
            loss_x.item(),
            loss_y.item(),
            loss_w.item(),
            loss_h.item(),
            loss_conf.item(),
            loss_cls.item(),
            recall,
            precision,
        )
    else:
        # Inference.  The class scores need the same inferred (-1) box axis as
        # the other parts, otherwise the view fails for more than one box.
        output = torch.cat(
            (pred_boxes.view(batch_size, -1, 4) * stride,
             pred_conf.view(batch_size, -1, 1),
             pred_cls.view(batch_size, -1, self.num_classes)), -1)
        return output
def forward(self, x, targets=None, img_dim=None):
    """Decode a YOLO feature map and, when targets are given, compute the loss.

    Args:
        x: Raw feature map, viewed as
            ``(batch, num_anchors, num_classes + 5, grid, grid)``; e.g. 255
            channels for 3 anchors and 80 classes.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss, self.metrics)``. ``output`` has shape
        ``(batch, num_anchors * grid * grid, 5 + num_classes)`` with the box
        columns multiplied by ``self.stride``.

    Side effects:
        Sets ``self.img_dim``; calls ``self.compute_grid_offsets`` when the
        grid size differs from ``self.grid_size``; replaces ``self.metrics``
        in training.
    """
    device = x.device
    is_cuda = x.is_cuda

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # (batch, anchors, attrs, grid, grid) -> (batch, anchors, grid, grid, attrs)
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    center_x = torch.sigmoid(prediction[..., 0])
    center_y = torch.sigmoid(prediction[..., 1])
    width = prediction[..., 2]
    height = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # Cached offsets belong to one grid size; rebuild them when it changes.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=is_cuda, device_id=device.index)

    # Decoded boxes in grid units, detached from the graph. Every slot is
    # written below, so an uninitialised buffer is sufficient.
    pred_boxes = torch.empty(
        prediction[..., :4].shape, dtype=torch.float32, device=device
    )
    pred_boxes[..., 0] = center_x.detach() + self.grid_x
    pred_boxes[..., 1] = center_y.detach() + self.grid_y
    pred_boxes[..., 2] = torch.exp(width.detach()) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(height.detach()) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Loss: box and class terms only see cells that own an object; the
    # confidence term also penalises the no-object cells.
    loss_x = self.mse_loss(center_x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(center_y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(width[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(height[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    # The 1e-16 keeps the ratios finite when a denominator is zero.
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss, self.metrics
def forward(self, x, targets=None, img_dim=None):
    """Decode a YOLO feature map and, when targets are given, compute the loss.

    Args:
        x: Raw feature map, viewed as
            ``(batch, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(batch, num_anchors * grid * grid, 5 + num_classes)`` with the box
        columns multiplied by ``self.img_stride``.

    Side effects:
        Sets ``self.img_dim``; calls ``self.compute_grid_offsets`` when the
        grid size differs from ``self.grid_size``; replaces ``self.metrics``
        in training.
    """
    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # (batch, anchors, attrs, grid, grid) -> (batch, anchors, grid, grid, attrs)
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Last axis layout: [x, y, width, height, confidence, class scores...]
    center_x = torch.sigmoid(prediction[..., 0])
    center_y = torch.sigmoid(prediction[..., 1])
    width = prediction[..., 2]
    height = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # Cached offsets belong to one grid size; rebuild them when it changes.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size)

    # Decoded boxes in grid units, detached from the graph.
    pred_boxes = torch.zeros_like(
        prediction[..., :4], dtype=torch.float, device=self.device
    )
    pred_boxes[..., 0] = center_x.detach() + self.grid_x
    pred_boxes[..., 1] = center_y.detach() + self.grid_y
    pred_boxes[..., 2] = torch.exp(width.detach()) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(height.detach()) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.img_stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    (iou_scores, class_mask, obj_mask, no_obj_mask,
     true_x, true_y, true_w, true_h, true_cls, true_conf) = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thr=self.ignore_thr,
    )

    # The masks must be boolean to *select* cells. An integer (long) tensor
    # would instead be read as indices into dim 0 and gather whole slices.
    obj_mask = obj_mask.bool()
    no_obj_mask = no_obj_mask.bool()

    # Loss: box and class terms only see cells that own an object; the
    # confidence term also penalises the no-object cells.
    loss_x = self.mse_loss(center_x[obj_mask], true_x[obj_mask])
    loss_y = self.mse_loss(center_y[obj_mask], true_y[obj_mask])
    loss_w = self.mse_loss(width[obj_mask], true_w[obj_mask])
    loss_h = self.mse_loss(height[obj_mask], true_h[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], true_conf[obj_mask])
    loss_conf_no_obj = self.bce_loss(pred_conf[no_obj_mask], true_conf[no_obj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.no_obj_scale * loss_conf_no_obj
    loss_cls = self.bce_loss(pred_cls[obj_mask], true_cls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_no_obj = pred_conf[no_obj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * true_conf
    # The 1e-16 keeps the ratios finite when a denominator is zero.
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": total_loss.detach().cpu().item(),
        "x": loss_x.detach().cpu().item(),
        "y": loss_y.detach().cpu().item(),
        "w": loss_w.detach().cpu().item(),
        "h": loss_h.detach().cpu().item(),
        "conf": loss_conf.detach().cpu().item(),
        "cls": loss_cls.detach().cpu().item(),
        "cls_acc": cls_acc.detach().cpu().item(),
        "recall50": recall50.detach().cpu().item(),
        "recall75": recall75.detach().cpu().item(),
        "precision": precision.detach().cpu().item(),
        "conf_obj": conf_obj.detach().cpu().item(),
        "conf_no_obj": conf_no_obj.detach().cpu().item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None):
    """Decode a raw detection feature map and optionally compute the loss.

    Args:
        x: Raw feature map, viewed as
            ``(batch, num_anchors, bbox_attrs, grid, grid)``.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.

    Returns:
        Inference (``targets is None``): ``output`` alone, a tensor of shape
        ``(batch, num_anchors * grid * grid, 5 + num_classes)`` with the box
        columns multiplied by the stride.

        Training: ``(output, loss, metrics)`` where ``metrics`` is a dict
        with the keys ``loss, x, y, w, h, conf, cls, cls_acc, recall,
        precision`` holding plain Python numbers.
    """
    device = x.device
    is_cuda = x.is_cuda
    num_anchors = self.num_anchors
    num_samples = x.size(0)
    grid_size = x.size(2)
    stride = self.image_dim / grid_size

    # (batch, anchors, attrs, grid, grid) -> (batch, anchors, grid, grid, attrs)
    prediction = (
        x.view(num_samples, num_anchors, self.bbox_attrs, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    center_x = torch.sigmoid(prediction[..., 0])
    center_y = torch.sigmoid(prediction[..., 1])
    width = prediction[..., 2]
    height = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # Per-cell offsets: grid_x varies along the last axis, grid_y along the
    # one before it. Both broadcast against (batch, anchors, grid, grid).
    cell_index = torch.arange(grid_size, dtype=torch.float32, device=device)
    grid_x = cell_index.view(1, 1, 1, grid_size)
    grid_y = cell_index.view(1, 1, grid_size, 1)

    # Anchors are expressed in grid units.
    scaled_anchors = torch.tensor(
        [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors],
        dtype=torch.float32,
        device=device,
    )
    anchor_w = scaled_anchors[:, 0].view(1, num_anchors, 1, 1)
    anchor_h = scaled_anchors[:, 1].view(1, num_anchors, 1, 1)

    # Decoded boxes are detached from the graph; every slot is written below.
    pred_boxes = torch.empty(
        prediction[..., :4].shape, dtype=torch.float32, device=device
    )
    pred_boxes[..., 0] = center_x.detach() + grid_x
    pred_boxes[..., 1] = center_y.detach() + grid_y
    pred_boxes[..., 2] = torch.exp(width.detach()) * anchor_w
    pred_boxes[..., 3] = torch.exp(height.detach()) * anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        # Inference
        return output

    # Training
    if is_cuda:
        self.mse_loss = self.mse_loss.cuda()
        self.bce_loss = self.bce_loss.cuda()

    (num_targets, num_correct, obj_mask, noobj_mask,
     tx, ty, tw, th, tconf, tcls) = build_targets(
        pred_boxes=pred_boxes.detach().cpu(),
        pred_conf=pred_conf.detach().cpu(),
        pred_cls=pred_cls.detach().cpu(),
        target=targets.detach().cpu(),
        anchors=scaled_anchors.cpu(),
        num_anchors=num_anchors,
        num_classes=self.num_classes,
        grid_size=grid_size,
        ignore_thres=self.ignore_thres,
        img_dim=self.image_dim,
    )

    num_proposals = (pred_conf > 0.5).sum().item()
    recall = num_correct / num_targets if num_targets else 1
    # No confident prediction at all: report 0 instead of dividing by zero.
    precision = num_correct / num_proposals if num_proposals else 0

    # Move the targets next to the predictions (masks stay uint8, as before).
    obj_mask = obj_mask.to(device=device, dtype=torch.uint8)
    noobj_mask = noobj_mask.to(device=device, dtype=torch.uint8)
    tx = tx.to(device=device, dtype=torch.float32)
    ty = ty.to(device=device, dtype=torch.float32)
    tw = tw.to(device=device, dtype=torch.float32)
    th = th.to(device=device, dtype=torch.float32)
    tconf = tconf.to(device=device, dtype=torch.float32)
    tcls = tcls.to(device=device, dtype=torch.float32)

    # Box and class terms only see cells that own an object; the confidence
    # term also penalises the no-object cells.
    loss_x = self.mse_loss(center_x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(center_y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(width[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(height[obj_mask], th[obj_mask])
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = loss_conf_obj + loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Fraction of object cells whose top-scoring class matches the target.
    predicted_class = pred_cls[obj_mask].argmax(1)
    target_class = tcls[obj_mask].argmax(1)
    cls_acc = (predicted_class == target_class).float().mean().item()

    return (
        output,
        loss,
        {
            "loss": loss.item(),
            "x": loss_x.item(),
            "y": loss_y.item(),
            "w": loss_w.item(),
            "h": loss_h.item(),
            "conf": loss_conf.item(),
            "cls": loss_cls.item(),
            "cls_acc": cls_acc,
            "recall": recall,
            "precision": precision,
        },
    )
def forward(self, x, targets=None, img_dim=None):
    """Decode a YOLO feature map and, when targets are given, compute the loss.

    Args:
        x: Raw feature map, viewed as
            ``(batch, num_anchors, 5 + num_classes, grid, grid)``; e.g. 255
            channels for 3 anchors and 80 classes.
        targets: Ground-truth boxes handed to ``build_targets``; ``None``
            selects inference.
        img_dim: Image size, stored on ``self.img_dim``.

    Returns:
        ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(batch, num_anchors * grid * grid, 5 + num_classes)`` with the box
        columns multiplied by ``self.stride``.

    Side effects:
        Sets ``self.img_dim``; calls ``self.compute_grid_offsets`` when the
        grid size differs from ``self.grid_size``; replaces ``self.metrics``
        in training.
    """
    device = x.device
    is_cuda = x.is_cuda

    self.img_dim = img_dim
    num_samples = x.size(0)  # batch size
    grid_size = x.size(2)  # feature-map side length

    # (batch, anchors, attrs, grid, grid) -> (batch, anchors, grid, grid, attrs):
    # every grid cell carries one prediction vector per anchor.
    prediction = (
        x.view(num_samples, self.num_anchors, 5 + self.num_classes, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # center_x, center_y, width, height and pred_conf all have shape
    # (batch, anchors, grid, grid); pred_cls has a trailing class axis.
    center_x = torch.sigmoid(prediction[..., 0])
    center_y = torch.sigmoid(prediction[..., 1])
    width = prediction[..., 2]
    height = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:])

    # Cached offsets belong to one grid size; rebuild them when it changes.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=is_cuda)

    # Decoded boxes in grid units, detached from the graph. Each cell has
    # unit length and the sigmoid centre lies in (0, 1), so adding the cell
    # offset (broadcast over batch and anchors) yields the position measured
    # from the top-left corner of the grid. Widths and heights scale the
    # preset anchor sizes.
    pred_boxes = torch.empty(
        prediction[..., :4].shape, dtype=torch.float32, device=device
    )
    pred_boxes[..., 0] = center_x.detach() + self.grid_x
    pred_boxes[..., 1] = center_y.detach() + self.grid_y
    pred_boxes[..., 2] = torch.exp(width.detach()) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(height.detach()) * self.anchor_h

    output = torch.cat(
        (
            # Boxes are scaled by the stride back to input-image units.
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    # NOTE(review): per the original author's notes, build_targets returns
    # iou_scores (IoU of matched predictions), class_mask (1 where the class
    # is predicted correctly), obj_mask (1 on the anchor assigned to each
    # ground-truth box), noobj_mask (1 where the IoU with every ground-truth
    # box is below the ignore threshold), the regression targets tx..th, the
    # class targets tcls and the confidence targets tconf. All are
    # (batch, anchors, grid, grid) except tcls, which has a class axis.
    # Confirm against build_targets.
    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # Coordinate and size loss: only cells that own an object contribute.
    loss_x = self.mse_loss(center_x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(center_y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(width[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(height[obj_mask], th[obj_mask])
    # Confidence loss over object and no-object cells, weighted separately.
    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    # Class loss
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    # Count objects in floating point so the epsilon below is not lost.
    num_objects = obj_mask.float().sum()
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (num_objects + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (num_objects + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss
def forward(self, x, targets=None, img_dim=None, cls=None):
    """Decode a YOLO feature map and add an image-level classification head.

    Args:
        x: Raw feature map, viewed as
            ``(batch, num_anchors, num_classes + 5, grid, grid)``.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.
        cls: Class labels passed to ``self.ce_loss`` together with the
            image-level scores. Needed whenever ``targets`` is given
            (presumably one label per sample -- confirm with the caller).

    Returns:
        ``(output, weighted_class_scores, 0)`` when ``targets`` is ``None``,
        otherwise ``(output, weighted_class_scores, total_loss)``.
        ``output`` has shape ``(batch, num_anchors * grid * grid,
        5 + num_classes)``; its class columns are raw logits, not sigmoids.
        ``weighted_class_scores`` has shape ``(batch, num_classes)``.

    Side effects:
        Sets ``self.img_dim``; calls ``self.compute_grid_offsets`` when the
        grid size differs from ``self.grid_size``; replaces ``self.metrics``
        in training.
    """
    device = x.device
    is_cuda = x.is_cuda

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # (batch, anchors, attrs, grid, grid) -> (batch, anchors, grid, grid, attrs)
    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    center_x = torch.sigmoid(prediction[..., 0])
    center_y = torch.sigmoid(prediction[..., 1])
    width = prediction[..., 2]
    height = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    # Class logits stay raw: exactly one class is present, so they go to a
    # cross-entropy loss rather than through a per-class sigmoid.
    pred_cls = prediction[..., 5:]

    # Cached offsets belong to one grid size; rebuild them when it changes.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=is_cuda)

    # Decoded boxes in grid units, detached from the graph. Every slot is
    # written below, so an uninitialised buffer is sufficient.
    pred_boxes = torch.empty(
        prediction[..., :4].shape, dtype=torch.float32, device=device
    )
    pred_boxes[..., 0] = center_x.detach() + self.grid_x
    pred_boxes[..., 1] = center_y.detach() + self.grid_y
    pred_boxes[..., 2] = torch.exp(width.detach()) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(height.detach()) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    # Image-level class scores: each cell's logits weighted by its object
    # confidence, summed over anchors and grid cells.
    weighted_class_scores = pred_conf.unsqueeze(dim=-1) * pred_cls
    weighted_class_scores = weighted_class_scores.sum(dim=(1, 2, 3))

    if targets is None:
        return output, weighted_class_scores, 0

    iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    # The detection terms are only defined when at least one cell owns an
    # object.
    has_objects = bool(obj_mask.sum() > 0)

    if has_objects:
        loss_x = self.mse_loss(center_x[obj_mask], tx[obj_mask])
        loss_y = self.mse_loss(center_y[obj_mask], ty[obj_mask])
        loss_w = self.mse_loss(width[obj_mask], tw[obj_mask])
        loss_h = self.mse_loss(height[obj_mask], th[obj_mask])
        loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
        loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
        loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
        loss_cls = self.ce_loss(
            pred_cls[obj_mask].view(-1, self.num_classes),
            tcls[obj_mask].long().view(-1),
        )
        detection_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
    else:
        detection_loss = 0.

    classification_loss = self.ce_loss(weighted_class_scores, cls)
    total_loss = detection_loss + classification_loss

    if has_objects:
        cls_acc = 100 * class_mask[obj_mask].mean()
        conf_obj = pred_conf[obj_mask].mean()
        conf_noobj = pred_conf[noobj_mask].mean()
        conf50 = (pred_conf > 0.5).float()
        iou50 = (iou_scores > 0.5).float()
        iou75 = (iou_scores > 0.75).float()
        detected_mask = conf50 * class_mask * tconf
        # The 1e-16 keeps the ratios finite when a denominator is zero.
        precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
        recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
        recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

        self.metrics = {
            "loss": to_cpu(total_loss).item(),
            "x": to_cpu(loss_x).item(),
            "y": to_cpu(loss_y).item(),
            "w": to_cpu(loss_w).item(),
            "h": to_cpu(loss_h).item(),
            "conf": to_cpu(loss_conf).item(),
            "cls": to_cpu(loss_cls).item(),
            "cls_acc": to_cpu(cls_acc).item(),
            "recall50": to_cpu(recall50).item(),
            "recall75": to_cpu(recall75).item(),
            "precision": to_cpu(precision).item(),
            "conf_obj": to_cpu(conf_obj).item(),
            "conf_noobj": to_cpu(conf_noobj).item(),
            "grid_size": grid_size,
        }
    else:
        # Start from a fresh dict so values from an earlier batch are never
        # reported for this one (and so the writes below cannot fail on a
        # missing attribute). Detection entries are zero, matching
        # detection_loss above.
        self.metrics = {
            "loss": to_cpu(total_loss).item(),
            "x": 0.0,
            "y": 0.0,
            "w": 0.0,
            "h": 0.0,
            "conf": 0.0,
            "cls": 0.0,
            "cls_acc": 0.0,
            "recall50": 0.0,
            "recall75": 0.0,
            "precision": 0.0,
            "conf_obj": 0.0,
            "conf_noobj": 0.0,
            "grid_size": grid_size,
        }

    self.metrics['classification_loss'] = to_cpu(classification_loss).item()
    # Share of samples whose highest image-level score matches the label.
    num_hits = torch.sum(torch.argmax(weighted_class_scores, dim=-1) == cls)
    self.metrics['batch_acc'] = to_cpu(num_hits).item() / len(cls)

    return output, weighted_class_scores, total_loss
def forward(self, x, targets=None, img_dim=None):
    """Decode a YOLO feature map with an extra angle-class head.

    Args:
        x: Raw feature map, viewed as ``(batch, num_anchors,
            num_classes + num_angles + 5, grid, grid)``.
        targets: Ground truth handed to ``build_targets``; ``None`` selects
            inference.
        img_dim: Stored on ``self.img_dim``; not otherwise used here.

    Returns:
        ``(output, 0)`` when ``targets`` is ``None``, otherwise
        ``(output, total_loss)``. ``output`` has shape
        ``(batch, num_anchors * grid * grid, 5 + num_classes + num_angles)``:
        boxes (multiplied by ``self.stride``), objectness, class scores and
        angle-class scores.

    Side effects:
        Sets ``self.img_dim``; calls ``self.compute_grid_offsets`` when the
        grid size differs from ``self.grid_size``; replaces ``self.metrics``
        in training.
    """
    # Per-angle-class weights for the angle loss. There is one entry per
    # angle class, so training requires num_angles == len(angle_class_weights).
    angle_class_weights = (
        1.12424274, 13.3361754, 75.7716263, 50.10983982, 61.6845070,
        71.0974026, 73.73063973, 22.52880658, 8.14052045, 5.87707998,
        25.49243306, 10.36837121, 26.4468599, 77.92882562, 100.44954128,
        82.9469697, 35.20578778, 8.97826978, 1.,
    )

    device = x.device
    is_cuda = x.is_cuda

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)

    # (batch, anchors, attrs, grid, grid) -> (batch, anchors, grid, grid, attrs)
    prediction = (
        x.view(
            num_samples,
            self.num_anchors,
            self.num_classes + self.num_angles + 5,
            grid_size,
            grid_size,
        )
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Last axis layout: [x, y, w, h, conf, classes..., angle classes...]
    first_angle = 5 + self.num_classes
    center_x = torch.sigmoid(prediction[..., 0])
    center_y = torch.sigmoid(prediction[..., 1])
    width = prediction[..., 2]
    height = prediction[..., 3]
    pred_conf = torch.sigmoid(prediction[..., 4])
    pred_cls = torch.sigmoid(prediction[..., 5:first_angle])
    pred_angle_cls = torch.sigmoid(prediction[..., first_angle:])

    # Cached offsets belong to one grid size; rebuild them when it changes.
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=is_cuda)

    # Decoded boxes in grid units, detached from the graph. Every slot is
    # written below, so an uninitialised buffer is sufficient.
    pred_boxes = torch.empty(
        prediction[..., :4].shape, dtype=torch.float32, device=device
    )
    pred_boxes[..., 0] = center_x.detach() + self.grid_x
    pred_boxes[..., 1] = center_y.detach() + self.grid_y
    pred_boxes[..., 2] = torch.exp(width.detach()) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(height.detach()) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
            pred_angle_cls.view(num_samples, -1, self.num_angles),
        ),
        -1,
    )

    if targets is None:
        return output, 0

    (iou_scores, class_mask, angle_mask, obj_mask, noobj_mask,
     tx, ty, tw, th, tacls, tcls, tconf) = build_targets(
        pred_boxes=pred_boxes,
        pred_cls=pred_cls,
        pred_angle_cls=pred_angle_cls,
        target=targets,
        anchors=self.scaled_anchors,
        ignore_thres=self.ignore_thres,
    )

    weights = torch.tensor(angle_class_weights, dtype=torch.float32, device=device)

    # Loss: box, class and angle terms only see cells that own an object;
    # the confidence term also penalises the no-object cells.
    loss_x = self.mse_loss(center_x[obj_mask], tx[obj_mask])
    loss_y = self.mse_loss(center_y[obj_mask], ty[obj_mask])
    loss_w = self.mse_loss(width[obj_mask], tw[obj_mask])
    loss_h = self.mse_loss(height[obj_mask], th[obj_mask])

    # Element-wise BCE so each angle class can be re-weighted before the mean.
    loss_conf_angle = nn.BCELoss(reduction='none')(
        pred_angle_cls[obj_mask], tacls[obj_mask]
    )
    loss_conf_angle = loss_conf_angle * weights / 100
    loss_conf_angle = loss_conf_angle.mean()

    loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
    loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
    loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
    loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
    total_loss = (
        loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + loss_conf_angle
    )

    # Metrics
    cls_acc = 100 * class_mask[obj_mask].mean()
    angle_acc = 100 * angle_mask[obj_mask].mean()
    conf_obj = pred_conf[obj_mask].mean()
    conf_noobj = pred_conf[noobj_mask].mean()
    conf50 = (pred_conf > 0.5).float()
    iou50 = (iou_scores > 0.5).float()
    iou75 = (iou_scores > 0.75).float()
    detected_mask = conf50 * class_mask * tconf
    # The 1e-16 keeps the ratios finite when a denominator is zero.
    precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
    recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
    recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

    self.metrics = {
        "loss": to_cpu(total_loss).item(),
        "x": to_cpu(loss_x).item(),
        "y": to_cpu(loss_y).item(),
        "w": to_cpu(loss_w).item(),
        "h": to_cpu(loss_h).item(),
        "angle_acc": to_cpu(angle_acc).item(),
        "angle": to_cpu(loss_conf_angle).item(),
        "conf": to_cpu(loss_conf).item(),
        "cls": to_cpu(loss_cls).item(),
        "cls_acc": to_cpu(cls_acc).item(),
        "recall50": to_cpu(recall50).item(),
        "recall75": to_cpu(recall75).item(),
        "precision": to_cpu(precision).item(),
        "conf_obj": to_cpu(conf_obj).item(),
        "conf_noobj": to_cpu(conf_noobj).item(),
        "grid_size": grid_size,
    }

    return output, total_loss