def imshow_postive_anchors(images, anchors, annotations):
    import matplotlib.pyplot as plt
    import cv2

    batch_size = images.size()[0]
    for i in range(batch_size):
        image = images[i, :, :, :]
        anno = annotations[i, :, :]
        # Drop the [-1, -1, -1, -1, -1] rows used to pad annotations to a uniform size
        anno = anno[anno[:, 0] != -1]

        iou = cal_iou(anchors[:, :], anno[:, :-1])
        iou_max, iou_max_ind = torch.max(iou, dim=1)
        pos_ind = torch.ge(iou_max, 0.5)
        pos_anchors = anchors[pos_ind, :]
        print('positive anchor number:', pos_anchors.size())

        unnormalize = UnNormalizer()
        image = 255 * unnormalize(image)
        image = torch.clamp(image, min=0, max=255).data.numpy()
        image = np.transpose(image, (1, 2, 0)).astype(np.uint8)
        # cv2.rectangle needs a contiguous array to draw on in place
        image = np.ascontiguousarray(image)

        for x1, y1, x2, y2 in pos_anchors:
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
            image = cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 1)
        print(image.shape)

        plt.figure()
        image = image[:, :, [2, 1, 0]]  # BGR -> RGB for matplotlib
        plt.imshow(image)
        plt.show()
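# `cal_iou` and `UnNormalizer` are defined elsewhere in this repo. A minimal
# sketch of a pairwise IoU in PyTorch, assuming `boxes_a` is [N, 4] and
# `boxes_b` is [M, 4] in (x1, y1, x2, y2) format; returns an [N, M] matrix.
import torch

def cal_iou_sketch(boxes_a, boxes_b):
    # intersection corners, broadcast to [N, M]
    x1 = torch.max(boxes_a[:, None, 0], boxes_b[None, :, 0])
    y1 = torch.max(boxes_a[:, None, 1], boxes_b[None, :, 1])
    x2 = torch.min(boxes_a[:, None, 2], boxes_b[None, :, 2])
    y2 = torch.min(boxes_a[:, None, 3], boxes_b[None, :, 3])
    inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
    union = area_a[:, None] + area_b[None, :] - inter
    return inter / torch.clamp(union, min=1e-8)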
def validate(self, image, y, name):
    # temp = tf.map_fn(lambda x: self.image_encoder(x, training=False), image)
    # state shape: [batch_size, 1024]
    self.state = self.image_encoder(image[:, 0, :, :, :], training=False)
    used = [0]
    for _t in range(params.time_steps - 1):
        prob = self.actor(self.state).numpy()
        action = utils.choose_action(prob, used)
        used.append(action)
        append_state = self.image_encoder(image[:, action, :, :, :], training=False)
        # element-wise max-pool the running state with the newly encoded view
        self.state = tf.reduce_max(tf.stack([self.state, append_state], axis=1), axis=1)
    voxel = self.generator(self.state, training=False)
    voxel = utils.dicide_voxel(voxel)
    utils.save_voxel(voxel, '{}_predict'.format(name))
    utils.save_voxel(y, '{}_true'.format(name))
    y = y[0]
    y = np.argmax(y, -1)
    voxel = np.argmax(voxel, -1)
    iou = utils.cal_iou(y, voxel)
    print('iou =', iou)
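# In this function `utils.cal_iou` compares two voxel occupancy grids rather
# than 2D boxes. A minimal sketch under that assumption, for `a` and `b` of
# identical shape (e.g. 32x32x32) where nonzero marks an occupied cell:
import numpy as np

def voxel_iou_sketch(a, b):
    a = np.asarray(a).astype(bool)
    b = np.asarray(b).astype(bool)
    inter = np.logical_and(a, b).sum()
    union = np.logical_or(a, b).sum()
    return inter / union if union > 0 else 0.0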
def preprocess_gt_boxes(gt_boxes, grid_shapes):
    """
    args:
        gt_boxes: [m][x1,y1,x2,y2,cls] np.array, m is the number of gt boxes;
                  (x1,y1) is the top-left corner, (x2,y2) the bottom-right corner
        grid_shapes: list of (h, w) grid sizes, one per YOLO output layer
    return:
        y_true: list of arrays, like [(52,52,3,(4+4+1)+cls), (26,26,3,(4+4+1)+cls), (13,13,3,(4+4+1)+cls)]
        4+4+1: the first 4 is tx ty tw th; the second 4 is gt_center_x gt_center_y gt_w gt_h,
               kept for computing IoU; the 1 is the objectness flag
    """
    y_true = [
        np.zeros(
            (grid_shapes[l][0], grid_shapes[l][1],
             len(config.yolo_layer_anchor[l]),
             4 + 4 + 1 + config.num_classes),
            dtype='float32')
        for l in range(config.num_layers)
    ]

    orign_gt_boxes = np.copy(gt_boxes[:, 0:4])
    orign_gt_boxes = utils.convert_boxes_to_origin(orign_gt_boxes)
    anchors = np.array(config.anchors)
    orign_anchors = np.zeros((anchors.shape[0], 4))
    orign_anchors[:, 2:4] = anchors
    orign_anchors = utils.convert_boxes_to_origin(orign_anchors)

    for id, ogt in enumerate(orign_gt_boxes):
        gt_box = gt_boxes[id]
        # 1. the anchor with the highest IoU against this gt_box
        iou = utils.cal_iou(ogt, orign_anchors)
        best_anchor = anchors[np.argmax(iou, axis=-1)]
        # 2. which layer and which anchor index the best anchor belongs to
        lindx, aindx = config.yolo_anchor_layerIndex[tuple(best_anchor)]
        # 3. grid cell position of the gt box and the offsets tx, ty
        grid_shape = config.stride[lindx]
        py, px, ty, tx = utils.cal_box_offset_pos(gt_box, grid_shape)
        # 4. compute gt_center_x, gt_center_y, gt_w, gt_h
        gt_w, gt_h = (gt_box[2] - gt_box[0], gt_box[3] - gt_box[1])
        gt_center_x = (px + tx) * grid_shape
        gt_center_y = (py + ty) * grid_shape
        assert gt_w > 0 and gt_h > 0, r'gt_box w,h <= 0'
        # 5. compute tw, th
        anchor_w, anchor_h = best_anchor
        tw = np.log(gt_w / anchor_w)
        th = np.log(gt_h / anchor_h)

        cls = int(gt_box[-1])
        aindx = int(aindx)
        lindx = int(lindx)
        y_true[lindx][py, px, aindx, 0:4] = (tx, ty, tw, th)
        y_true[lindx][py, px, aindx, 4:8] = (gt_center_x, gt_center_y, gt_w, gt_h)
        y_true[lindx][py, px, aindx, 8] = 1
        y_true[lindx][py, px, aindx, 9 + cls] = 1
    return y_true
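# `utils.convert_boxes_to_origin` is not shown here. Judging by how it is used
# (anchors are stored as [0, 0, w, h] before the call), it most likely centres
# each box at the origin so the anchor match compares shapes only, ignoring
# position. A hypothetical sketch under that assumption:
import numpy as np

def convert_boxes_to_origin_sketch(boxes):
    # boxes: [n, 4] as (x1, y1, x2, y2) -> (-w/2, -h/2, w/2, h/2)
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    return np.stack([-w / 2, -h / 2, w / 2, h / 2], axis=-1)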
def create_landmark(argument=True):  # `argument`: whether to augment the images
    image_id = 0
    ftxt = os.path.join(data_dir, 'trainImageList.txt')
    data = get_landmark(ftxt, data_dir)
    idx = 0
    landmark_list = []
    for (imgPath, box, landmarkGt) in tqdm(data):
        # collected face crops and keypoints for this image
        F_imgs = []
        F_landmarks = []
        # cv2.imread takes an IMREAD_* flag, not a color-conversion code,
        # so convert explicitly after reading
        img = cv2.imread(imgPath)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_h, img_w, img_c = img.shape
        gt_box = np.array([box.left, box.top, box.right, box.bottom])
        # face crop
        f_face = img[box.top:box.bottom + 1, box.left:box.right + 1]
        # resize to the network input size
        f_face = cv2.resize(f_face, (size, size))

        landmark = np.zeros((5, 2))
        for index, one in enumerate(landmarkGt):
            # keypoint offset relative to the top-left corner, normalized
            rv = ((one[0] - gt_box[0]) / (gt_box[2] - gt_box[0]),
                  (one[1] - gt_box[1]) / (gt_box[3] - gt_box[1]))
            landmark[index] = rv
        F_imgs.append(f_face)
        F_landmarks.append(landmark.reshape(10))

        landmark = np.zeros((5, 2))
        if argument:  # augment the image
            idx = idx + 1
            x1, y1, x2, y2 = gt_box
            gt_w = x2 - x1 + 1
            gt_h = y2 - y1 + 1
            # skip faces that are too small or out of bounds
            if max(gt_w, gt_h) < 40 or x1 < 0 or y1 < 0:
                continue
            for i in range(10):
                # random crop size
                box_size = npr.randint(int(min(gt_w, gt_h) * 0.8),
                                       np.ceil(1.25 * max(gt_w, gt_h)))
                # random offset of the top-left corner
                delta_x = npr.randint(-gt_w * 0.2, gt_w * 0.2)
                delta_y = npr.randint(-gt_h * 0.2, gt_h * 0.2)
                # top-left corner of the crop
                nx1 = int(max(x1 + gt_w / 2 - box_size / 2 + delta_x, 0))
                ny1 = int(max(y1 + gt_h / 2 - box_size / 2 + delta_y, 0))
                nx2 = nx1 + box_size
                ny2 = ny1 + box_size
                # skip crops that cross the image border
                if nx2 > img_w or ny2 > img_h:
                    continue
                # crop box and image patch
                crop_box = np.array([nx1, ny1, nx2, ny2])
                cropped_im = img[ny1:ny2 + 1, nx1:nx2 + 1, :]
                resized_im = cv2.resize(cropped_im, (size, size))
                iou = cal_iou(crop_box, np.expand_dims(gt_box, 0))
                # keep positive crops only
                if np.max(iou) > 0.65:
                    F_imgs.append(resized_im)
                    # keypoint offsets relative to the crop
                    for index, one in enumerate(landmarkGt):
                        rv = ((one[0] - nx1) / box_size,
                              (one[1] - ny1) / box_size)
                        landmark[index] = rv
                    F_landmarks.append(landmark.reshape(10))
                    landmark = np.zeros((5, 2))
                    landmark_ = F_landmarks[-1].reshape(-1, 2)
                    box = BBox([nx1, ny1, nx2, ny2])
                    # mirror
                    if random.choice([0, 1]) > 0:
                        face_flipped, landmark_flipped = flip(resized_im, landmark_)
                        face_flipped = cv2.resize(face_flipped, (size, size))
                        F_imgs.append(face_flipped)
                        F_landmarks.append(landmark_flipped.reshape(10))
                    # counter-clockwise rotation
                    if random.choice([0, 1]) > 0:
                        face_rotated_by_alpha, landmark_rotated = rotate(
                            img, box, box.reprojectLandmark(landmark_), 5)
                        # re-normalize keypoints to the rotated crop
                        landmark_rotated = box.projectLandmark(landmark_rotated)
                        face_rotated_by_alpha = cv2.resize(face_rotated_by_alpha, (size, size))
                        F_imgs.append(face_rotated_by_alpha)
                        F_landmarks.append(landmark_rotated.reshape(10))
                        # flip the rotated face as well
                        face_flipped, landmark_flipped = flip(face_rotated_by_alpha, landmark_rotated)
                        face_flipped = cv2.resize(face_flipped, (size, size))
                        F_imgs.append(face_flipped)
                        F_landmarks.append(landmark_flipped.reshape(10))
                    # clockwise rotation
                    if random.choice([0, 1]) > 0:
                        face_rotated_by_alpha, landmark_rotated = rotate(
                            img, box, box.reprojectLandmark(landmark_), -5)
                        # re-normalize keypoints to the rotated crop
                        landmark_rotated = box.projectLandmark(landmark_rotated)
                        face_rotated_by_alpha = cv2.resize(face_rotated_by_alpha, (size, size))
                        F_imgs.append(face_rotated_by_alpha)
                        F_landmarks.append(landmark_rotated.reshape(10))
                        # flip the rotated face as well
                        face_flipped, landmark_flipped = flip(face_rotated_by_alpha, landmark_rotated)
                        face_flipped = cv2.resize(face_flipped, (size, size))
                        F_imgs.append(face_flipped)
                        F_landmarks.append(landmark_flipped.reshape(10))

        F_imgs, F_landmarks = np.asarray(F_imgs), np.asarray(F_landmarks)
        for i in range(len(F_imgs)):
            # keep only samples whose normalized offsets lie inside [0, 1]
            if np.sum(np.where(F_landmarks[i] <= 0, 1, 0)) > 0:
                continue
            if np.sum(np.where(F_landmarks[i] >= 1, 1, 0)) > 0:
                continue
            landmark_list.append([F_imgs[i], F_landmarks[i]])
            image_id += 1
    print("number of landmark samples:", image_id)
    return landmark_list
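# `flip` mirrors a face crop together with its normalized landmarks. A
# hypothetical sketch for the 5-point layout (left eye, right eye, nose,
# left mouth corner, right mouth corner): x becomes 1 - x, and the
# left/right points swap indices so the labels stay consistent.
import cv2
import numpy as np

def flip_sketch(face, landmark):
    face_flipped = cv2.flip(face, 1)  # horizontal mirror
    landmark_ = np.asarray([(1 - x, y) for (x, y) in landmark])
    landmark_[[0, 1]] = landmark_[[1, 0]]  # swap eyes
    landmark_[[3, 4]] = landmark_[[4, 3]]  # swap mouth corners
    return face_flipped, landmark_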
def cal_loss(img, y_preds, ground_truths, categories):
    '''
    :param img: 3D array like (W, H, C)
    :param y_preds: nd array like (S*S, B, (5+class_num))
    :param ground_truths: json-like object, e.g.
        {
            'large_vehicle': [[(341, 292), ..., (346, 457), 0], [(341, 292), ..., (346, 457), 1], ...],
            'small_vehicle': [[(341, 292), ..., (346, 457), 0], [(341, 292), ..., (346, 457), 0], ...]
        }
    :param categories: list of category names like [small_vehicle, ...]
    :return: total loss
    '''
    loss = 0
    class_num = len(categories)
    W, H, C = img.shape
    S = int(np.sqrt(y_preds.shape[0]))
    B = int(y_preds.shape[1])
    # loss-balancing parameters lambda_coord, lambda_noob
    lambda_coord, lambda_noob = 0.5, 0.5

    # 1. loss on the predicted center coordinates
    ground_truths_centeroid_idx = confirm_cell_index((W, H), S, cal_centeroid(ground_truths))
    for k in ground_truths_centeroid_idx.keys():
        idxs = ground_truths_centeroid_idx[k]
        for i, idx in enumerate(idxs):
            idx_x, idx_y = idx
            idx_flat = idx_x * S + idx_y
            res_bbox = y_preds[idx_flat, :, :]
            ground_truth_box = ground_truths[k][i]

            # pick the predicted box with the highest IoU against the ground truth
            ious = []
            boxes = np.array(res_bbox).reshape((B, class_num + 5))
            for box in boxes:
                x_pred, y_pred, w_pred, h_pred, conf, class_prob_array = \
                    box[0], box[1], box[2], box[3], box[4], box[5:]
                pred_position = decode_position(img, cell=idx, x=x_pred, y=y_pred,
                                                w=w_pred, h=h_pred, S=S)
                iou = cal_iou(ground_truth_box, pred_position)
                ious.append(iou)
            confirm_iou_idx = np.argmax(ious)
            # the responsible box itself is wanted here
            # (the original indexed `ious` instead of `boxes`)
            confirm_box = boxes[confirm_iou_idx].reshape(-1, )
            box_pts = decode_position(img, cell=idx, x=confirm_box[0], y=confirm_box[1],
                                      w=confirm_box[2], h=confirm_box[3], S=S)
            ground_truth_box_pts = ground_truth_box[0]

            # center and bounding-box loss
            centerAndBox_loss = cal_centeroid_loss(lambda_coord, ground_truth_box_pts, box_pts)

            # class loss
            assert k in categories
            k_idx = categories.index(k)
            true_prob_array = np.zeros((class_num, ))
            true_prob_array[k_idx] = 1
            class_loss = calculateMSE(true_prob_array, confirm_box[5:])

            # confidence loss
            ground_truth_box_pts_idx = [
                confirm_cell_index((W, H), S, p) for p in np.array(ground_truth_box_pts)
            ]
            ground_truth_box_pts_idx_1d = [
                twoD2oneD(pt, S) for pt in ground_truth_box_pts_idx
            ]
            confidence_true = np.zeros(S * S)
            confidence_true[ground_truth_box_pts_idx_1d] = 1
            confidence_true = confidence_true.reshape((S, S))
            confidence_pred = np.array(y_preds[:, confirm_iou_idx, 4]).reshape((S, S))
            confidence_loss = calculateMSE(confidence_true, confidence_pred)

            # accumulate (the original overwrote `loss` on every iteration)
            loss += centerAndBox_loss + class_loss + confidence_loss
    return loss
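# `calculateMSE` and `twoD2oneD` are not shown in this snippet. Minimal
# sketches under the obvious reading: mean squared error over arrays, and
# flattening a (row, col) cell index into a single offset on the S x S grid.
import numpy as np

def calculateMSE_sketch(y_true, y_pred):
    return np.mean(np.square(np.asarray(y_true) - np.asarray(y_pred)))

def twoD2oneD_sketch(pt, S):
    row, col = pt
    return row * S + col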
def save_hard_example(save_size, save_dir, data_gt, det_boxes):
    """Crop the original images with the boxes detected by one network
    to build the training input for the next network."""
    img_list = data_gt['images']
    gt_boxes_list = data_gt['boxes']
    num_of_images = len(img_list)
    assert len(det_boxes) == num_of_images, "detections do not match images"
    n_idx = 0
    p_idx = 0
    d_idx = 0
    image_done = 0
    positive_list = []
    negative_list = []
    part_list = []
    for img, dets, gts in tqdm(zip(img_list, det_boxes, gt_boxes_list)):
        gts = np.array(gts, dtype=np.float32).reshape(-1, 4)
        image_done += 1
        if dets is None or dets.shape[0] == 0:
            continue
        # pad the detections to squares
        dets = convert_to_square(dets)
        dets[:, 0:4] = np.round(dets[:, 0:4])
        neg_num = 0
        for box in dets:
            x_left, y_top, x_right, y_bottom, _ = box.astype(int)
            width = x_right - x_left + 1
            height = y_bottom - y_top + 1
            # skip boxes that are too small or out of bounds
            if width < 20 or x_left < 0 or y_top < 0 \
                    or x_right > img.shape[1] - 1 or y_bottom > img.shape[0] - 1:
                continue
            iou = cal_iou(box, gts)
            cropped_im = img[y_top:y_bottom + 1, x_left:x_right + 1, :]
            resized_im = cv2.resize(cropped_im, (save_size, save_size),
                                    interpolation=cv2.INTER_LINEAR)
            # assign the sample type
            if np.max(iou) < 0.3 and neg_num < 60:
                negative_list.append(resized_im)
                n_idx += 1
                neg_num += 1
            else:
                idx = np.argmax(iou)
                assigned_gt = gts[idx]
                x1, y1, x2, y2 = assigned_gt
                # regression offsets
                offset_x1 = (x1 - x_left) / float(width)
                offset_y1 = (y1 - y_top) / float(height)
                offset_x2 = (x2 - x_right) / float(width)
                offset_y2 = (y2 - y_bottom) / float(height)
                roi = np.array([float(offset_x1), float(offset_y1),
                                float(offset_x2), float(offset_y2)])
                # pos and part samples
                if np.max(iou) >= 0.65:
                    positive_list.append([resized_im, roi])
                    p_idx += 1
                elif np.max(iou) >= 0.4:
                    part_list.append([resized_im, roi])
                    d_idx += 1
    print('%s images processed, pos: %s part: %s neg: %s' % (image_done, p_idx, d_idx, n_idx))

    base_num = 100000
    if len(negative_list) > base_num * 3:
        neg_keep = npr.choice(len(negative_list), size=base_num * 3, replace=True)
        negative_list = np.asarray(negative_list)[neg_keep]
    sum_p = len(negative_list) // 3
    pos_keep = npr.choice(len(positive_list), sum_p, replace=True)
    part_keep = npr.choice(len(part_list), sum_p, replace=True)
    positive_list = np.asarray(positive_list)[pos_keep]
    part_list = np.asarray(part_list)[part_keep]
    print('neg: {} pos: {} part: {}'.format(len(negative_list), len(pos_keep), len(part_keep)))
    create_h5_box(positive_list, filename=save_dir + '/positive.h5')
    create_h5_box(part_list, filename=save_dir + '/part.h5')
    create_h5_img(negative_list, filename=save_dir + '/negative.h5')
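# `convert_to_square` pads each detection to a square around its center, a
# standard MTCNN-style helper. A minimal sketch assuming `boxes` is shaped
# [n, 5] as (x1, y1, x2, y2, score):
import numpy as np

def convert_to_square_sketch(boxes):
    square = boxes.copy()
    w = boxes[:, 2] - boxes[:, 0] + 1
    h = boxes[:, 3] - boxes[:, 1] + 1
    side = np.maximum(w, h)
    square[:, 0] = boxes[:, 0] + w * 0.5 - side * 0.5
    square[:, 1] = boxes[:, 1] + h * 0.5 - side * 0.5
    square[:, 2] = square[:, 0] + side - 1
    square[:, 3] = square[:, 1] + side - 1
    return square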
def forward(self, classification, localization, anchors, annotations):
    batch_size = classification.size()[0]
    cls_losses = []
    loc_losses = []

    for i in range(batch_size):
        pred_cls = classification[i, :, :]
        pred_loc = localization[i, :, :]
        anno = annotations[i, :, :]
        # Drop the [-1, -1, -1, -1, -1] rows that pad annotations to a
        # uniform size within the batch
        anno = anno[anno[:, 0] != -1]

        # First handle the case where there are no annotations at all
        if anno.size()[0] == 0:
            if cuda:
                cls_losses.append(torch.tensor(0).float().cuda())
                loc_losses.append(torch.tensor(0).float().cuda())
            else:
                cls_losses.append(torch.tensor(0).float())
                loc_losses.append(torch.tensor(0).float())
            continue

        # Before taking a log, clamp the values: probabilities close to 0
        # or 1 would make the cross entropy produce 'nan'
        pred_cls = torch.clamp(pred_cls, min=1e-4, max=1 - 1e-4)

        iou = cal_iou(anchors[:, :], anno[:, :-1])
        # For every anchor, the best IoU against the annotations and its index
        # iou: [n, m]  iou_max: [n, ]
        iou_max, iou_max_ind = torch.max(iou, dim=1)

        '''
        Classification loss. Quoting 'Focal Loss for Dense Object Detection':
        'Specifically, anchors are assigned to ground-truth object boxes using
        an intersection-over-union (IoU) threshold of 0.5; and to background
        if their IoU is in [0, 0.4). As each anchor is assigned to at most one
        object box, we set the corresponding entry in its length K label vector
        to 1 and all other entries to 0. If an anchor is unassigned, which may
        happen with overlap in [0.4, 0.5), it is ignored during training.'
        '''
        # Class targets per anchor: positives get a one-hot row (filled in
        # below), negatives all zeros, ignored anchors all -1
        anchors_onehot = torch.ones(pred_cls.size()) * -1  # [-1, 80]
        if cuda:
            anchors_onehot = anchors_onehot.cuda()

        # torch.lt(input, other): element-wise input < other
        # negatives get an all-zero target
        anchors_onehot[torch.lt(iou_max, 0.4), :] = 0

        # torch.ge(input, other): element-wise input >= other
        pos_ind = torch.ge(iou_max, 0.5)
        pos_num = pos_ind.sum()

        # Ground-truth box and class assigned to each anchor (by best IoU),
        # shape: [-1, 5], [x1, y1, x2, y2, cls]
        gt = anno[iou_max_ind, :]

        # positives get a one-hot target
        anchors_onehot[pos_ind, :] = 0
        anchors_onehot[pos_ind, gt[pos_ind, -1].long()] = 1

        if cuda:
            alpha = torch.ones(anchors_onehot.size()).cuda() * self.alpha
        else:
            alpha = torch.ones(anchors_onehot.size()) * self.alpha

        # torch.where(condition, x, y): picks x where condition holds, else y
        alpha = torch.where(anchors_onehot.eq(1), alpha, 1 - alpha)
        pt = torch.where(anchors_onehot.eq(1), pred_cls, 1 - pred_cls)
        # focal_weight = alpha * (1 - pt)^gamma
        focal_weight = alpha * torch.pow((1 - pt), self.gamma)
        # binary cross entropy
        bce_loss = -1 * (
            anchors_onehot * torch.log(pred_cls) +
            (1 - anchors_onehot) * torch.log(1 - pred_cls)
        )
        cls_loss = focal_weight * bce_loss

        # anchors that are ignored during training must not contribute
        if cuda:
            cls_loss = torch.where(
                torch.eq(anchors_onehot, -1),
                torch.zeros(cls_loss.size()).cuda(),
                cls_loss
            )
        else:
            cls_loss = torch.where(
                torch.eq(anchors_onehot, -1),
                torch.zeros(cls_loss.size()),
                cls_loss
            )
        cls_losses.append(cls_loss.sum() / torch.clamp(pos_num.float(), min=1.0))

        '''
        Localization loss
        '''
        if pos_num <= 0:
            # with no positive anchors the localization loss is zero
            if cuda:
                loc_losses.append(torch.tensor(0).float().cuda())
            else:
                loc_losses.append(torch.tensor(0).float())
        else:
            anchors_w = anchors[:, 2] - anchors[:, 0]
            anchors_h = anchors[:, 3] - anchors[:, 1]
            anchors_cx = anchors[:, 0] + 0.5 * anchors_w
            anchors_cy = anchors[:, 1] + 0.5 * anchors_h

            # only positive anchors take part in backpropagation
            pos_w = anchors_w[pos_ind]
            pos_h = anchors_h[pos_ind]
            pos_cx = anchors_cx[pos_ind]
            pos_cy = anchors_cy[pos_ind]

            pos_gt = gt[pos_ind, :]
            gt_w = pos_gt[:, 2] - pos_gt[:, 0]
            gt_h = pos_gt[:, 3] - pos_gt[:, 1]
            gt_cx = pos_gt[:, 0] + 0.5 * gt_w
            gt_cy = pos_gt[:, 1] + 0.5 * gt_h

            # Clamp before the log again: tiny gt_w, gt_h would make the
            # final loc_loss 'nan'
            gt_w = torch.clamp(gt_w, min=1)
            gt_h = torch.clamp(gt_h, min=1)

            # regression targets the network has to learn
            dx = (gt_cx - pos_cx) / pos_w
            dy = (gt_cy - pos_cy) / pos_h
            dw = torch.log(gt_w / pos_w)
            dh = torch.log(gt_h / pos_h)

            d_stack = torch.stack((dx, dy, dw, dh))
            d_stack = d_stack.t()  # transpose

            '''
            Quoting 'Focal Loss for Dense Object Detection':
            'The training loss is the sum the focal loss and the standard
            smooth L1 loss used for box regression [10].'
            So the localization loss uses smooth L1.
            '''
            if cuda:
                d_stack = d_stack / torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda()
            else:
                d_stack = d_stack / torch.Tensor([[0.1, 0.1, 0.2, 0.2]])

            loc_loss = torch.abs(d_stack - pred_loc[pos_ind, :])
            # torch.le(input, other): element-wise input <= other
            loc_loss = torch.where(
                torch.le(loc_loss, 1.0 / 9.0),
                0.5 * 9.0 * torch.pow(loc_loss, 2),
                loc_loss - 0.5 / 9.0
            )
            loc_losses.append(loc_loss.mean())

    cls_loss = torch.stack(cls_losses).mean(dim=0, keepdim=True)
    loc_loss = torch.stack(loc_losses).mean(dim=0, keepdim=True)
    return cls_loss, loc_loss
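# A minimal smoke test for the loss above, assuming it lives in a hypothetical
# module class FocalLoss(alpha=0.25, gamma=2.0) and that the global `cuda`
# flag is False. Shapes follow the code: one image, 6 anchors, 80 classes,
# with one real annotation plus one padding row.
import torch

criterion = FocalLoss(alpha=0.25, gamma=2.0)        # hypothetical wrapper class
classification = torch.rand(1, 6, 80)               # post-sigmoid class scores
localization = torch.randn(1, 6, 4)                 # predicted box deltas
anchors = torch.tensor([[0., 0., 10., 10.]] * 6)
annotations = torch.tensor([[[1., 1., 9., 9., 3.],          # one box, class 3
                             [-1., -1., -1., -1., -1.]]])   # padding row
cls_loss, loc_loss = criterion(classification, localization, anchors, annotations)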
def main(args):
    # Class category of PASCAL VOC that the RL agent will be searching for
    device = torch.device("cuda:0" if (torch.cuda.is_available() and args.use_gpu) else "cpu")
    image_names = np.array(load_images_names_in_data_set('aeroplane_trainval', path_voc))
    feature_extractor = torchvision.models.vgg16(pretrained=True).features.to(device)

    # keep only images with a single aeroplane instance
    single_plane_image_names = []
    single_plane_image_gts = []
    dqn = DQN(device)
    EPISILO = args.EPISILO
    for image_name in image_names:
        annotation = get_bb_of_gt_from_pascal_xml_annotation(image_name, path_voc)
        if len(annotation) > 1:
            continue
        single_plane_image_names.append(image_name)
        single_plane_image_gts.append(annotation[0][1:])  # [[x1, x2, y1, y2], ...]

    trans = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
    ])

    for i in range(epochs):
        ep_reward = 0
        for index, image_name in enumerate(single_plane_image_names):
            image_path = os.path.join(path_voc + "JPEGImages", image_name + ".jpg")
            image_original = Image.open(image_path)
            width, height = image_original.size
            bbx_gt = single_plane_image_gts[index]

            image = init_process(image_original, trans).to(device)
            bbx = [0, width, 0, height]
            history_action = np.zeros(his_actions * NUM_ACTIONS)
            with torch.no_grad():
                vector = feature_extractor(image).cpu().detach().numpy().reshape(7 * 7 * 512)
            state = np.concatenate([history_action, vector])

            step = 0
            while step < 10:
                iou = cal_iou(bbx, bbx_gt)
                if iou > 0.5:
                    action = 5  # trigger the terminal action once localized
                else:
                    action = dqn.choose_action(state, EPISILO)

                # execute the action and step to the new bounding box
                new_bbx = update_bbx(bbx, action)
                reward = reward_func(bbx, new_bbx, bbx_gt, action)

                # build the new state
                action_vec = np.zeros(NUM_ACTIONS)
                action_vec[action] = 1.0
                history_action = np.concatenate([history_action[NUM_ACTIONS:], action_vec])
                with torch.no_grad():
                    vector = feature_extractor(
                        inter_process(image_original, new_bbx, trans).to(device)
                    ).cpu().detach().numpy().reshape(7 * 7 * 512)
                next_state = np.concatenate([history_action, vector])

                # store the transition
                dqn.store_transition(state, action, reward, next_state)
                ep_reward += reward
                if dqn.memory_counter >= MEMORY_CAPACITY:
                    print("episode: {},".format(i), end=' ')
                    dqn.learn()

                # terminate the episode
                if action == 5:
                    break
                state = next_state
                bbx = new_bbx
                step += 1

        if EPISILO > 0.1:
            EPISILO -= 0.1  # anneal the exploration rate
        print("episode: {} , this epoch reward is {}".format(i, round(ep_reward, 3)))  # 0.001 precision
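# `reward_func` and `update_bbx` are defined elsewhere. A hypothetical sketch
# of the reward in the style of Caicedo & Lazebnik's active-localization
# agent: movement actions are rewarded by the sign of the IoU change, and
# the terminal action (5 here) gets a larger reward if the final IoU clears
# a threshold. The magnitudes and the 0.5 threshold are illustrative.
def reward_func_sketch(bbx, new_bbx, bbx_gt, action):
    if action == 5:  # terminal action
        return 3.0 if cal_iou(new_bbx, bbx_gt) >= 0.5 else -3.0
    delta = cal_iou(new_bbx, bbx_gt) - cal_iou(bbx, bbx_gt)
    return 1.0 if delta > 0 else -1.0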
def yolo_loss(y_true, y_pred):
    """
    :param y_true: [batch_size, 7, 7, 25]
    :param y_pred: [batch_size, 7, 7, 30]
    :return: total loss
    """
    # class labels
    _classes = y_pred[..., 10:]
    classes = y_true[..., 5:]
    # (batch_size, 7, 7, 2)
    _confidences = y_pred[..., 8:10]
    # (batch_size, 7, 7, 1)
    confidences = y_true[..., 4:5]
    # (batch_size, 7, 7, 4) -> (batch_size, 7, 7, 1, 4)
    bboxes = y_true[..., 0:4]
    bboxes = tf.reshape(bboxes, (-1, cfg.CELL_SIZE, cfg.CELL_SIZE, 1, 4))
    # (batch_size, 7, 7, 2, 4)
    _bboxes = y_pred[..., 0:8]
    _bboxes = tf.reshape(_bboxes, (-1, cfg.CELL_SIZE, cfg.CELL_SIZE, cfg.B, 4))

    grid_x = tf.range(cfg.CELL_SIZE, dtype=tf.float32)
    grid_y = tf.range(cfg.CELL_SIZE, dtype=tf.float32)
    grid_x, grid_y = tf.meshgrid(grid_x, grid_y)
    x_offset = tf.reshape(grid_x, (-1, 1))
    y_offset = tf.reshape(grid_y, (-1, 1))
    x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
    x_y_offset = tf.cast(
        tf.reshape(x_y_offset, [cfg.CELL_SIZE, cfg.CELL_SIZE, 1, 2]), tf.float32)

    # map the predicted boxes back to image coordinates
    _bboxes_normal = tf.stack([
        (_bboxes[..., 0] + x_y_offset[..., 0]) / cfg.CELL_SIZE,
        (_bboxes[..., 1] + x_y_offset[..., 1]) / cfg.CELL_SIZE,
        tf.square(_bboxes[..., 2]),
        tf.square(_bboxes[..., 3]),
    ], axis=-1)

    # bboxes_ious: (n, 7, 7, 2)
    bboxes_ious = cal_iou(_bboxes_normal, bboxes)
    object_mask = tf.reduce_max(bboxes_ious, axis=-1, keepdims=True)
    # bbox j of cell i is responsible for producing the loss
    object_mask = tf.cast(bboxes_ious >= object_mask, dtype=tf.float32) * confidences
    noobject_mask = tf.ones_like(object_mask, dtype=tf.float32) - object_mask

    # encode the ground-truth boxes into the same cell-relative form
    bboxes_normal = tf.stack([
        bboxes[..., 0] * cfg.CELL_SIZE - x_y_offset[..., 0],
        bboxes[..., 1] * cfg.CELL_SIZE - x_y_offset[..., 1],
        tf.sqrt(bboxes[..., 2]),
        tf.sqrt(bboxes[..., 3]),
    ], axis=-1)

    # confidence loss
    object_delta = object_mask * (_confidences - bboxes_ious)
    object_loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3])) * cfg.OBJECT_SCALE
    noobject_delta = noobject_mask * _confidences
    noobject_loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3])) * cfg.NOOBJECT_SCALE

    # class loss
    cls_delta = confidences * (classes - _classes)
    cls_loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(cls_delta), axis=[1, 2, 3])) * cfg.CLASS_SCALE

    # box loss
    bbox_mask = tf.expand_dims(object_mask, axis=-1)
    bboxes_xy_delta = bbox_mask * (_bboxes[..., 0:2] - bboxes_normal[..., 0:2])
    bboxes_wh_delta = bbox_mask * (_bboxes[..., 2:4] - bboxes_normal[..., 2:4])
    bboxes_loss = tf.reduce_mean(tf.reduce_sum(tf.square(bboxes_xy_delta), axis=[1, 2, 3, 4])) * cfg.BBOX_SCALE + \
        tf.reduce_mean(tf.reduce_sum(tf.square(bboxes_wh_delta), axis=[1, 2, 3, 4])) * cfg.BBOX_SCALE

    total_loss = cls_loss + object_loss + noobject_loss + bboxes_loss
    return total_loss
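# In this loss, `cal_iou` takes boxes in normalized (cx, cy, w, h) format and
# must broadcast (n, 7, 7, 2, 4) predictions against (n, 7, 7, 1, 4) labels.
# A minimal sketch under that assumption:
import tensorflow as tf

def cal_iou_sketch(boxes1, boxes2):
    # convert (cx, cy, w, h) to corner coordinates
    b1 = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2, boxes1[..., 1] - boxes1[..., 3] / 2,
                   boxes1[..., 0] + boxes1[..., 2] / 2, boxes1[..., 1] + boxes1[..., 3] / 2], axis=-1)
    b2 = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2, boxes2[..., 1] - boxes2[..., 3] / 2,
                   boxes2[..., 0] + boxes2[..., 2] / 2, boxes2[..., 1] + boxes2[..., 3] / 2], axis=-1)
    lu = tf.maximum(b1[..., :2], b2[..., :2])  # intersection top-left
    rd = tf.minimum(b1[..., 2:], b2[..., 2:])  # intersection bottom-right
    wh = tf.maximum(rd - lu, 0.0)
    inter = wh[..., 0] * wh[..., 1]
    area1 = boxes1[..., 2] * boxes1[..., 3]
    area2 = boxes2[..., 2] * boxes2[..., 3]
    union = tf.maximum(area1 + area2 - inter, 1e-10)
    return tf.clip_by_value(inter / union, 0.0, 1.0)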