def forward(self, x, rois, roi_indices):
    """Forward the chain.

    We assume that there are :math:`N` batches.

    Args:
        x (Variable): 4D image variable.
        rois (Tensor): A bounding box array containing coordinates of
            proposal boxes. This is a concatenation of bounding box
            arrays from multiple images in the batch. Its shape is
            :math:`(R', 4)`. Given :math:`R_i` proposed RoIs from the
            :math:`i` th image, :math:`R' = \\sum _{i=1} ^ N R_i`.
        roi_indices (Tensor): An array containing indices of images to
            which bounding boxes correspond to. Its shape is :math:`(R',)`.

    """
    # in case roi_indices is ndarray
    roi_indices = at.totensor(roi_indices).float()
    rois = at.totensor(rois).float()
    indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
    # NOTE: important: yx -> xy
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
    indices_and_rois = t.autograd.Variable(xy_indices_and_rois.contiguous())

    pool = self.roi(x, indices_and_rois)
    pool = pool.view(pool.size(0), -1)
    fc7 = self.classifier(pool)
    roi_cls_locs = self.cls_loc(fc7)
    roi_scores = self.score(fc7)
    return roi_cls_locs, roi_scores
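# The column permutation [0, 2, 1, 4, 3] above keeps the image index in
# column 0 while swapping (y_min, x_min, y_max, x_max) into the
# (x_min, y_min, x_max, y_max) order that the RoI pooling op expects.
# A minimal sketch of that reordering on dummy data (names here are
# illustrative only):
import torch

roi_indices = torch.tensor([0.])                 # one RoI from image 0
rois = torch.tensor([[10., 20., 110., 220.]])    # (y_min, x_min, y_max, x_max)

indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
# tensor([[  0.,  10.,  20., 110., 220.]])
xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
# tensor([[  0.,  20.,  10., 220., 110.]]) -> (index, x_min, y_min, x_max, y_max)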
def train(**kwargs):
    opt._parse(kwargs)

    if not VOC:
        dataset = CsvDataset('/home/artemlyan/data/avito_intro/images/',
                             'labeled_with_classes.csv')
        print('load data')
        dataloader = data_.DataLoader(dataset,
                                      batch_size=1,
                                      shuffle=True,
                                      # pin_memory=True,
                                      num_workers=opt.num_workers)
        test_dataloader = data_.DataLoader(dataset,
                                           batch_size=1,
                                           num_workers=opt.test_num_workers,
                                           shuffle=False,
                                           pin_memory=True)
    else:
        dataset = Dataset(opt)
        print('load data for VOC')
        dataloader = data_.DataLoader(dataset,
                                      batch_size=1,
                                      shuffle=True,
                                      # pin_memory=True,
                                      num_workers=opt.num_workers)
        testset = TestDataset(opt)
        test_dataloader = data_.DataLoader(testset,
                                           batch_size=1,
                                           num_workers=opt.test_num_workers,
                                           shuffle=False,
                                           pin_memory=True)

    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        if not VOC:
            dataset.set_mode('train')
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            print(img.size(), bbox_.size(), label_.size(), scale.size())
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                print('pred', _bboxes, 'gt', bbox_[0])
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())

        if not VOC:
            dataset.set_mode('val')
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        print("eval result:", eval_result)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        trainer.vis.plot('test_map', eval_result['map'])
        log_info = 'lr:{}, map:{}, loss:{}'.format(str(lr_),
                                                   str(eval_result['map']),
                                                   str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        if epoch == 30:
            break
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=2,
                                       shuffle=False,
                                       # pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    for epoch in range(7):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale, ori_img) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            losses = trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = (img * 0.225 + 0.45).clamp(min=0, max=1) * 255
                gt_img = visdom_bbox(at.tonumpy(ori_img_)[0],
                                     at.tonumpy(bbox_)[0],
                                     label_[0].numpy())
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(ori_img, visualize=True)
                pred_img = visdom_bbox(at.tonumpy(ori_img[0]),
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())

        if epoch == 4:
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

    eval_result = eval(test_dataloader, faster_rcnn, test_num=1e100)
    print(eval_result)
    trainer.save(mAP=eval_result['map'])
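# The inline expression (img * 0.225 + 0.45).clamp(min=0, max=1) * 255 above
# undoes torchvision-style normalization before plotting. Other variants in
# this section call an inverse_normalize helper for the same purpose; a
# minimal sketch of such a helper, assuming a single 0.45/0.225 mean/std pair
# (the repo's own helper may also handle caffe-style BGR mean subtraction):
import numpy as np

def inverse_normalize(img):
    # map a normalized CHW float image back to [0, 255] pixel values
    return (img * 0.225 + 0.45).clip(min=0, max=1) * 255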
def train(**kwargs):
    # Parse the arguments passed at call time with opt._parse() from
    # config.py, obtain the data paths stored there, and hand them to Dataset.
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       # pin_memory=True
                                       )
    # pin_memory uses page-locked memory, which speeds up host-to-GPU transfer

    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    # If opt.load_path exists, load the pretrained model from it,
    # then visualize the training labels.
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.dataset.label_names, win='labels')

    best_map = 0
    lr_ = opt.lr
    # The training loop; the number of epochs (opt.epoch = 14) is a
    # hyperparameter predefined in config.py.
    for epoch in range(opt.epoch):
        print('epoch {}/{}'.format(epoch, opt.epoch))
        trainer.reset_meters()  # reset all meters shown in the visualization
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = array_tool.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(array_tool.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     array_tool.tonumpy(bbox_[0]),
                                     array_tool.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes: call faster_rcnn's predict function;
                # the predictions are kept in the underscore-prefixed objects
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       array_tool.tonumpy(_bboxes[0]),
                                       array_tool.tonumpy(_labels[0]).reshape(-1),
                                       array_tool.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # show rpn_cm, the confusion matrix of the RPN network
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # show roi_cm, the confusion matrix of the RoI head network
                trainer.vis.img('roi_cm',
                                array_tool.totensor(trainer.roi_cm.conf, False).float())

        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{}, loss:{}'.format(str(lr_),
                                                   str(eval_result['map']),
                                                   str(trainer.get_meter_data()))
        trainer.vis.log(log_info)  # log the learning rate, mAP and losses

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            # at epoch 9, reload the best checkpoint and decay the
            # learning rate to one tenth of its previous value
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
        if epoch == 13:
            break
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=False,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale, human_box, object_box, action) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            human_box, object_box, action = human_box.cuda(), object_box.cuda(), action.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     # at.tonumpy(action[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                print(_labels[0])
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())

        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{}, loss:{}'.format(str(lr_),
                                                   str(eval_result['map']),
                                                   str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
        if epoch == 13:
            break
def train(**kwargs):
    opt._parse(kwargs)  # read the config settings

    dataset = Dataset(opt)  # build the training dataset from the configured parameters
    print('load data')
    # Build the training DataLoader; the code only supports batch_size=1
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)  # build the test dataset from the configured parameters
    # Build the test DataLoader; again only batch_size=1 is supported
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()  # create the Faster R-CNN network with a VGG backbone
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()  # wrap the network in the trainer
    if opt.load_path:  # if pretrained weights are given, load them
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')

    best_map = 0  # best mAP so far, used later to decide whether to save the model
    lr_ = opt.lr  # the preset learning rate
    for epoch in range(opt.epoch):  # train for opt.epoch epochs
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)  # convert scale to a Python scalar (TBC)
            # bbox holds the ground-truth box coordinates (ymin, xmin, ymax, xmax);
            # label holds class indices into VOC_BBOX_LABEL_NAMES;
            # img is the image; only batch_size=1 training is supported
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()  # train on GPU
            trainer.train_step(img, bbox, label, scale)  # preprocessing done, run the model

            if (ii + 1) % opt.plot_every == 0:  # visualization (skipped)
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())
                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)
                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)
                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())

        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)  # evaluate once per epoch
        trainer.vis.plot('test_map', eval_result['map'])  # visualization (skipped)
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']  # current learning rate
        log_info = 'lr:{}, map:{}, loss:{}'.format(str(lr_),  # log lr, mAP and losses
                                                   str(eval_result['map']),
                                                   str(trainer.get_meter_data()))
        trainer.vis.log(log_info)  # visualization (skipped)

        if eval_result['map'] > best_map:  # save the model whenever the mAP improves
            best_map = eval_result['map']  # record the model's mAP
            best_path = trainer.save(best_map=best_map)  # call the model-saving function
        if epoch == 9:  # at epoch 9, reload the best model, lower the learning rate and continue
            trainer.load(best_path)  # load the model
            trainer.faster_rcnn.scale_lr(opt.lr_decay)  # decay the learning rate
            lr_ = lr_ * opt.lr_decay  # track the current learning rate
        if epoch == 13:  # stop training after 13 epochs
            break
def train_val():
    print('load data')
    train_loader, val_loader = get_train_val_loader(opt.root_dir,
                                                    batch_size=opt.batch_size,
                                                    val_ratio=0.1,
                                                    shuffle=opt.shuffle,
                                                    num_workers=opt.num_workers,
                                                    pin_memory=opt.pin_memory)
    faster_rcnn = FasterRCNNVGG16()
    # faster_rcnn = FasterRCNNResNet50()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    # if opt.load_path:
    #     trainer.load(opt.load_path)
    #     print('load pretrained model from %s' % opt.load_path)
    # trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr

    for epoch in range(opt.epoch):
        trainer.reset_meters()
        tqdm.monitor_interval = 0
        for ii, sample in tqdm(enumerate(train_loader)):
            if len(sample.keys()) == 5:
                img_id, img, bbox, scale, label = sample['img_id'], sample['image'], \
                    sample['bbox'], sample['scale'], sample['label']
                img, bbox, label = img.cuda().float(), bbox.cuda(), label.cuda()
                img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            else:
                img_id, img, bbox, scale, label = sample['img_id'], sample['image'], \
                    np.zeros((1, 0, 4)), sample['scale'], np.zeros((1, 0, 1))
                img = img.cuda().float()
                img = Variable(img)

            if bbox.size == 0:
                continue

            scale = at.scalar(scale)
            trainer.train_step(img_id, img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     img_id[0],
                                     at.tonumpy(bbox[0]),
                                     at.tonumpy(label[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       img_id[0],
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())

        mAP = eval_mAP(trainer, val_loader)
        trainer.vis.plot('val_mAP', mAP)
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{}, loss:{}'.format(str(lr_), str(mAP),
                                                   str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if mAP > best_map:
            best_map = mAP
            best_path = trainer.save(best_map=best_map)
        if epoch == opt.epoch - 1:
            best_path = trainer.save()
        if (epoch + 1) % 10 == 0:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
def forward(self, x, rois, roi_indices, context_rois, context_roi_indices):
    # def forward(self, x, rois, roi_indices):
    """Forward the chain.

    We assume that there are :math:`N` batches.

    Args:
        x (Variable): 4D image variable.
        rois (Tensor): A bounding box array containing coordinates of
            proposal boxes. This is a concatenation of bounding box
            arrays from multiple images in the batch. Its shape is
            :math:`(R', 4)`. Given :math:`R_i` proposed RoIs from the
            :math:`i` th image, :math:`R' = \\sum _{i=1} ^ N R_i`.
        roi_indices (Tensor): An array containing indices of images to
            which bounding boxes correspond to. Its shape is :math:`(R',)`.

    """
    # in case roi_indices is ndarray
    roi_indices = at.totensor(roi_indices).float()
    rois = at.totensor(rois).float()
    indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
    # NOTE: important: yx -> xy
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
    indices_and_rois = xy_indices_and_rois.contiguous()

    pool = self.roi(x, indices_and_rois)
    pool = pool.view(pool.size(0), -1)
    fc7 = self.classifier(pool)
    roi_cls_locs = self.cls_loc(fc7)
    roi_scores = self.score(fc7)
    # print(f"score: {roi_scores}")
    # context_roi_scores = 0

    # in case context_roi_indices is ndarray
    context_roi_indices = at.totensor(context_roi_indices).float()
    context_rois = at.totensor(context_rois).float()
    context_indices_and_rois = t.cat(
        [context_roi_indices[:, None], context_rois], dim=1)
    # NOTE: important: yx -> xy
    context_xy_indices_and_rois = context_indices_and_rois[:, [0, 2, 1, 4, 3]]
    context_indices_and_rois = context_xy_indices_and_rois.contiguous()
    # print(f"DEBUG: {context_indices_and_rois.shape}")

    context_pool = self.context_roi(x, context_indices_and_rois)
    context_pool = context_pool.view(context_pool.size(0), -1)
    context_fc7 = self.context_classifier(context_pool)
    # context_roi_cls_locs = self.context_cls_loc(context_fc7)
    # context_roi_scores = self.context_score(context_fc7)

    # gating
    ex_feat = self.gating_module(fc7, context_fc7)
    ex_feat = ex_feat.view(ex_feat.size(0), -1)
    ex_scores = self.cls_score(ex_feat)
    # print(f"cls_score: {ex_scores}")
    roi_scores = ex_scores

    # context relevance score
    # context_relevance_roi_indices = at.totensor(context_roi_indices).float()
    # context_relevance_rois = at.totensor(context_rois).float()
    # context_relevance_indices_and_rois = t.cat(
    #     [context_relevance_roi_indices[:, None], context_relevance_rois], dim=1)
    # # NOTE: important: yx -> xy
    # context_relevance_xy_indices_and_rois = context_relevance_indices_and_rois[:, [0, 2, 1, 4, 3]]
    # context_relevance_indices_and_rois = context_relevance_xy_indices_and_rois.contiguous()
    # context_relevance_pool = self.context_relevance_roi(x, context_relevance_indices_and_rois)
    # context_relevance_pool = context_relevance_pool.view(context_relevance_pool.size(0), -1)
    context_relevance_pool = self.avgpool(x)
    context_relevance_pool = t.flatten(context_relevance_pool, 1)
    context_relevance_fc7 = self.context_relevance_classifier(context_relevance_pool)
    context_relevance_roi_scores = self.context_relevance_score(context_relevance_fc7)

    return roi_cls_locs, roi_scores, context_relevance_roi_scores
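# The variant above routes fc7 and context_fc7 through self.gating_module,
# which is defined elsewhere. A hypothetical sketch of one plausible gating
# module, assuming a learned sigmoid gate that blends the per-RoI and the
# context features channel-wise; the real module may differ:
import torch
import torch.nn as nn

class FeatureGating(nn.Module):
    def __init__(self, in_dim=4096):
        super().__init__()
        # gate computed from the concatenated features, one value per channel
        self.gate = nn.Sequential(nn.Linear(2 * in_dim, in_dim), nn.Sigmoid())

    def forward(self, fc7, context_fc7):
        g = self.gate(torch.cat([fc7, context_fc7], dim=1))  # (R', in_dim) in [0, 1]
        return g * fc7 + (1 - g) * context_fc7               # convex blend per channel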
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    best_ap = np.array([0.] * opt.label_number)
    lr_ = opt.lr
    vis = trainer.vis
    starttime = datetime.datetime.now()

    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                roi_cm = at.totensor(trainer.roi_cm.conf, False).float()
                trainer.vis.img('roi_cm', roi_cm)

        eval_result = eval(test_dataloader, faster_rcnn, vis=vis, test_num=opt.test_num)
        best_ap = dict(zip(opt.VOC_BBOX_LABEL_NAMES, eval_result['ap']))
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{}, loss:{}'.format(str(lr_),
                                                   str(eval_result['map']),
                                                   str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:
            print('roi_cm=\n', trainer.roi_cm.value())
            plot_confusion_matrix(trainer.roi_cm.value(),
                                  classes=('animal', 'plant', 'rock', 'background'),
                                  normalize=False,
                                  title='Normalized Confusion Matrix')
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map, best_ap=best_ap)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
        # if epoch == 13:
        #     break

    endtime = datetime.datetime.now()
    train_consum = (endtime - starttime).seconds
    print("train_consum=", train_consum)
def forward(self, imgs, bboxes, labels, scale):
    '''
    :param imgs: (~torch.autograd.Variable) a batch of images
    :param bboxes: (~torch.autograd.Variable) (N, R, 4)
    :param labels: (~torch.autograd.Variable) (N, R), values in [0, L-1]
        where L is the number of classes
    :param scale: (float) scale applied to the raw image during preprocessing
    :return: namedtuple of 5 losses
    '''
    n = bboxes.shape[0]  # batch size
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    c2_out = self.faster_rcnn.C2(imgs)
    c3_out = self.faster_rcnn.C3(c2_out)
    c4_out = self.faster_rcnn.C4(c3_out)
    p2, p3, p4, p5 = self.faster_rcnn.fpn(c2_out, c3_out, c4_out)
    feature_maps = [p2, p3, p4, p5]
    rcnn_maps = [p2, p3, p4]

    # rpn_locs has shape (hh*ww*9, 4), rpn_scores (hh*ww*9, 2) and
    # rois (2000, 4); roi_indices is unused; anchor has shape
    # (hh*ww*9, 4). H and W are the preprocessed sizes. The RPN scores
    # (H/16)x(W/16)x9 (roughly 20000) anchors for objectness, keeps the
    # top 12000, and NMS reduces them to 2000 approximate proposal
    # boxes G^.
    rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
        feature_maps, img_size, scale)

    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]  # (hh*ww*9, 2)
    rpn_loc = rpn_locs[0]      # (hh*ww*9, 4)
    roi = rois                 # (2000, 4)

    # proposal_target_creator produces sample_roi (128, 4), gt_roi_loc
    # (128, 4) and gt_roi_label (128, 1). The RoIHead takes sample_roi
    # plus the features as input and predicts classification (21 classes)
    # and regression (bbox refinement); the ground truth for those heads
    # is exactly the gt_roi_label and gt_roi_loc produced here.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        array_tool.tonumpy(bbox),
        array_tool.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    sample_roi_index = torch.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(rcnn_maps, sample_roi, sample_roi_index)

    # ------------------ RPN loss ------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        array_tool.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = array_tool.totensor(gt_rpn_label).long()
    gt_rpn_loc = array_tool.totensor(gt_rpn_loc)
    # smooth L1 loss for the RPN regression
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
                                       gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.rpn_sigma)
    # cross-entropy loss for the RPN classification
    rpn_cls_loss = functional.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _gt_rpn_score = rpn_score[gt_rpn_label > -1]
    _rpn_score = array_tool.tonumpy(rpn_score)[array_tool.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(array_tool.totensor(_rpn_score, False), _gt_rpn_label.data.long())

    # ------------------------ RoI loss ------------------------#
    # n_sample is 128; roi_cls_loc is the VGG16RoIHead output (128, 84)
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)  # (128, 21, 4)
    # pick the loc for each sample's ground-truth class, reducing
    # (128, 21, 4) to (128, 4)
    roi_loc = roi_cls_loc[torch.arange(0, n_sample).long().cuda(),
                          array_tool.totensor(gt_roi_label).long()]
    gt_roi_label = array_tool.totensor(gt_roi_label).long()
    gt_roi_loc = array_tool.totensor(gt_roi_loc)

    # smooth L1 loss for the RoI regression
    roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(),
                                       gt_roi_loc,
                                       gt_roi_label.data,
                                       self.roi_sigma)
    # cross-entropy loss for the RoI classification
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
    self.roi_cm.add(array_tool.totensor(roi_score, False), gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]  # total loss, extending the list to length 5
    return LossTuple(*losses)
def forward(self, imgs, bboxes, labels, scale):
    # number of images in the batch
    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape  # (n, c, hh, ww)
    img_size = (H, W)

    # the part of VGG16 up to conv5_3 extracts the image features
    features = self.faster_rcnn.extractor(imgs)

    # rpn_locs has shape (hh*ww*9, 4), rpn_scores (hh*ww*9, 2) and
    # rois (2000, 4); roi_indices is unused; anchor has shape
    # (hh*ww*9, 4). H and W are the preprocessed sizes. The RPN scores
    # (H/16)x(W/16)x9 (roughly 20000) anchors for objectness, keeps the
    # top 12000, and NMS reduces them to 2000 approximate proposal
    # boxes G^. roi has shape (2000, 4).
    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]           # bboxes has shape (N, R, 4)
    label = labels[0]          # labels has shape (N, R)
    rpn_score = rpn_scores[0]  # (hh*ww*9,)
    rpn_loc = rpn_locs[0]      # (hh*ww*9, 4)
    roi = rois                 # (2000, 4)

    # Sample RoIs and forward.
    # proposal_target_creator produces sample_roi (128, 4), gt_roi_loc
    # (128, 4) and gt_roi_label (128, 1). The RoIHead takes sample_roi
    # plus the features as input and predicts classification (21 classes)
    # and regression (bbox refinement); the ground truth for those heads
    # is the gt_roi_label and gt_roi_loc produced here.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # NOTE it's all zero because it only supports batch=1 for now
    sample_roi_index = t.zeros(len(sample_roi))
    # the head outputs 128*84 locs and 128*21 scores, while the targets
    # are 128*4 true locs and 128*1 true labels
    roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    # given the ~20000 anchors and the gt bbox, anchor_target_creator
    # returns the offsets and labels of the sampled anchors w.r.t. the bbox
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.totensor(gt_rpn_label).long()
    gt_rpn_loc = at.totensor(gt_rpn_loc)
    # _fast_rcnn_loc_loss: rpn_loc holds the offsets regressed by the RPN
    # (~20000 anchors), gt_rpn_loc the target offsets from
    # anchor_target_creator; rpn_sigma = 1.
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
                                       gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.rpn_sigma)
    # NOTE: default value of ignore_index is -100 ...
    # cross-entropy between the RPN scores (~20000 anchors) and the
    # labels from anchor_target_creator
    rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]  # drop ignored anchors (label -1)
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    # roi_cls_loc is the VGG16RoIHead output (128, 84); n_sample = 128
    n_sample = roi_cls_loc.shape[0]
    # roi_cls_loc becomes (128, 21, 4)
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                          at.totensor(gt_roi_label).long()]
    # gt_roi_loc: offsets (dy, dx, dh, dw) between the 128 proposals from
    # proposal_target_creator() and the gt bbox
    gt_roi_label = at.totensor(gt_roi_label).long()  # 128 labels
    gt_roi_loc = at.totensor(gt_roi_loc)

    # smooth_l1_loss for the RoI regression
    roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(),
                                       gt_roi_loc,
                                       gt_roi_label.data,
                                       self.roi_sigma)
    # cross-entropy loss for the RoI classification
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    # sum the four losses
    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]
    return LossTuple(*losses)
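# Both trainer variants above delegate box regression to _fast_rcnn_loc_loss,
# which is not shown in this section. A sketch of the usual implementation
# (smooth L1 over positive samples only, normalized by the number of labeled
# samples), consistent with how the callers use it; treat this as an
# assumption about the helper, not its verbatim source:
import torch as t

def _smooth_l1_loss(x, gt, in_weight, sigma):
    sigma2 = sigma ** 2
    diff = in_weight * (x - gt)
    abs_diff = diff.abs()
    # quadratic branch below 1/sigma^2, linear branch above it
    flag = (abs_diff < (1. / sigma2)).float()
    y = flag * (sigma2 / 2.) * (diff ** 2) + (1 - flag) * (abs_diff - 0.5 / sigma2)
    return y.sum()

def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    in_weight = t.zeros(gt_loc.shape).cuda()
    # only positive samples (label > 0) contribute to the localization loss
    in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight)] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight, sigma)
    # normalize by the number of labeled (non-ignored) samples
    return loc_loss / (gt_label >= 0).sum().float()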
for i, (img, bbox_, label_, scale) in enumerate(dataloader):
    t1 = time.time()
    optimizer.zero_grad()
    img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()

    # run the RPN on the GPU, then move it back to the CPU
    rpn_model.cuda()
    _, _, rois, _, _ = rpn_model.forward(img, scale)
    rpn_model.cpu()

    bbox = bbox[0]
    label = label[0]
    sample_roi, gt_roi_loc, gt_roi_label = model.PTC(rois,
                                                     at.tonumpy(bbox),
                                                     at.tonumpy(label))
    gt_roi_label = at.totensor(gt_roi_label).long()
    gt_roi_loc = at.totensor(gt_roi_loc)

    for roi, roi_label, roi_loc in zip(sample_roi, gt_roi_label, gt_roi_loc):
        roi_cls_reg_locs, roi_clf_score = model.forward(img, roi)
        cls_reg_loss, cls_loss, reg_loss = model.loss(roi_clf_score,
                                                      roi_cls_reg_locs,
                                                      roi_loc, roi_label)
        cls_reg_loss.backward()
        # cap the optimizer's per-parameter step counter so its
        # step-dependent scaling stops decaying past step 1024
        for group in optimizer.param_groups:
            for p in group['params']:
                state = optimizer.state[p]
                if ('step' in state and state['step'] >= 1024):
                    state['step'] = 1000
        optimizer.step()
        c_loss.append(cls_loss)
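# The state['step'] reset above relies on Adam-family optimizers keeping a
# per-parameter step counter inside optimizer.state; capping it freezes the
# step-dependent bias correction. A minimal sketch of inspecting that state,
# assuming the optimizer is torch.optim.Adam:
import torch

p = torch.nn.Parameter(torch.zeros(3))
adam = torch.optim.Adam([p])
p.grad = torch.ones(3)
adam.step()

state = adam.state[p]
print(sorted(state.keys()))  # typically ['exp_avg', 'exp_avg_sq', 'step']
print(state['step'])         # 1 (an int or a tensor, depending on the torch version)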
def train():
    # opt._parse(kwargs)
    best_map = float('-inf')

    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        state_dict = torch.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
        best_map = state_dict['other_info']['best_map']

    trainer.vis.text(dataset.db.label_names, win='labels')
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())

        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{}, loss:{}'.format(str(lr_),
                                                   str(eval_result['map']),
                                                   str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        print(log_info)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch % 5 == 4:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
def predict(self, imgs, sizes=None, visualize=False):
    '''
    Predict objects for each image.

    Args:
        the input images must be CHW RGB np.ndarray arrays
    Return:
        a tuple of (bboxes, labels, scores): box coordinates, labels and scores
    '''
    self.eval()
    if visualize:  # visualization branch
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]  # get height & width
            # TODO: why does the visualization branch need this preprocessing?
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()
    labels = []
    scores = []
    for img, size in zip(prepared_imgs, sizes):
        img = at.totensor(img[None]).float()
        scale = img.shape[3] / size[1]
        # TODO: this calls forward via __call__; why does that work?
        roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
        # TODO: what does .data do here?
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        mean = t.Tensor(self.loc_normalize_mean).cuda().repeat(self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda().repeat(self.n_class)[None]
        roi_cls_loc = (roi_cls_loc * std + mean)
        # TODO: does this view actually reshape anything?
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # clamp restricts the tensor to the given range so boxes stay inside the image
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))
        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)
        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
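# loc2bbox decodes the predicted offsets (dy, dx, dh, dw) back into
# (y_min, x_min, y_max, x_max) boxes via the standard R-CNN
# parameterization: ctr = d * src_size + src_ctr, size = exp(d) * src_size.
# A sketch of that decoding, assuming numpy inputs as in the call above:
import numpy as np

def loc2bbox(src_bbox, loc):
    src_h = src_bbox[:, 2] - src_bbox[:, 0]
    src_w = src_bbox[:, 3] - src_bbox[:, 1]
    src_ctr_y = src_bbox[:, 0] + 0.5 * src_h
    src_ctr_x = src_bbox[:, 1] + 0.5 * src_w

    dy, dx, dh, dw = loc[:, 0], loc[:, 1], loc[:, 2], loc[:, 3]
    ctr_y = dy * src_h + src_ctr_y
    ctr_x = dx * src_w + src_ctr_x
    h = np.exp(dh) * src_h
    w = np.exp(dw) * src_w

    dst_bbox = np.zeros(loc.shape, dtype=loc.dtype)
    dst_bbox[:, 0] = ctr_y - 0.5 * h  # y_min
    dst_bbox[:, 1] = ctr_x - 0.5 * w  # x_min
    dst_bbox[:, 2] = ctr_y + 0.5 * h  # y_max
    dst_bbox[:, 3] = ctr_x + 0.5 * w  # x_max
    return dst_bbox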
def train(opt, faster_rcnn, dataloader, val_dataloader, test_dataloader, trainer,
          lr_, best_map, start_epoch):
    trainer.train()
    for epoch in range(start_epoch, start_epoch + opt.epoch):
        trainer.reset_meters()
        pbar = tqdm(enumerate(dataloader), total=len(dataloader))
        for ii, (img, bbox_, label_, scale) in pbar:
            # Currently configured to predict (y_min, x_min, y_max, x_max)
            # bbox_tmp = bbox_.clone()
            # bbox_ = transform_bbox(bbox_)
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            losses = trainer.train_step(img, bbox, label, scale)

            if ii % 100 == 0:
                rpnloc = losses[0].cpu().data.numpy()
                rpncls = losses[1].cpu().data.numpy()
                roiloc = losses[2].cpu().data.numpy()
                roicls = losses[3].cpu().data.numpy()
                tot = losses[4].cpu().data.numpy()
                pbar.set_description(
                    f"Epoch: {epoch} | Batch: {ii} | RPNloc Loss: {rpnloc:.4f} | "
                    f"RPNcls Loss: {rpncls:.4f} | ROIloc Loss: {roiloc:.4f} | "
                    f"ROIcls Loss: {roicls:.4f} | Total Loss: {tot:.4f}")

            if (ii + 1) % 1000 == 0:
                eval_result = eval(val_dataloader, faster_rcnn, test_num=1000)
                trainer.vis.plot('val_map', eval_result['map'])
                lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
                val_log_info = 'lr:{}, map:{}, loss:{}'.format(
                    str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
                trainer.vis.log(val_log_info)
                print("Evaluation Results on Val Set")
                print(val_log_info)
                print("\n\n")

            if (ii + 1) % 100 == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())
                print(trainer.get_meter_data())
                try:
                    # plot ground truth bboxes
                    ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                    gt_img = visdom_bbox(ori_img_,
                                         at.tonumpy(bbox_[0]),
                                         at.tonumpy(label_[0]))
                    trainer.vis.img('gt_img', gt_img)
                    plt.show()

                    # plot predicted bboxes
                    _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                        [ori_img_], visualize=True)
                    pred_img = visdom_bbox(ori_img_,
                                           at.tonumpy(_bboxes[0]),
                                           at.tonumpy(_labels[0]).reshape(-1),
                                           at.tonumpy(_scores[0]))
                    plt.show()
                    trainer.vis.img('pred_img', pred_img)

                    # rpn confusion matrix (meter)
                    trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                    # roi confusion matrix
                    trainer.vis.img('roi_cm',
                                    at.totensor(trainer.roi_cm.conf, False).float())
                except Exception:
                    print("Cannot display images")

            if (ii + 1) % 100 == 0:
                eval_result = eval(val_dataloader, faster_rcnn, test_num=25)
                trainer.vis.plot('val_map', eval_result['map'])
                log_info = 'lr:{}, map:{}, loss:{}'.format(
                    str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
                trainer.vis.log(log_info)

        # save after every epoch
        epoch_path = trainer.save(epoch, best_map=0)

        eval_result = eval(test_dataloader, faster_rcnn, test_num=1000)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        test_log_info = 'lr:{}, map:{}, loss:{}'.format(
            str(lr_), str(eval_result['map']), str(trainer.get_meter_data()))
        trainer.vis.log(test_log_info)
        print("Evaluation Results on Test Set")
        print(test_log_info)
        print("\n\n")

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = epoch_path
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
        if epoch == 13:
            break
def forward(self, imgs, bboxes, labels, scale):
    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    # feature extraction
    features = self.faster_rcnn.extractor(imgs)

    # RPN network
    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois

    # Sample RoIs and forward
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # NOTE it's all zero because it only supports batch=1 for now
    sample_roi_index = t.zeros(len(sample_roi))

    # Faster R-CNN head
    roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
                                       gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.rpn_sigma)
    # NOTE: default value of ignore_index is -100 ...
    rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                          at.totensor(gt_roi_label).long()]
    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(),
                                       gt_roi_loc.float(),
                                       gt_roi_label.data,
                                       self.roi_sigma)
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]
    return LossTuple(*losses)
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes.
            Its shape is :math:`(N, R, 4)`.
        labels (~torch.autograd.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value is
            :math:`[0, L - 1]`. :math:`L` is the number of foreground classes.
        scale (float): Amount of scaling applied to the raw image during
            preprocessing.

    Returns:
        namedtuple of 5 losses
    """
    n = bboxes.shape[0]  # number of input images at a time
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape  # should be (1, 3, H, W)
    img_size = (H, W)

    # more feature maps are needed here if you want to use features of
    # different scales
    features = self.faster_rcnn.extractor(imgs)

    rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form.
    # Quantities used below:
    #   num_boxes   : number of ground-truth bounding boxes in an image.
    #   num_anchors : number of anchors in the image (i.e. in the feature map).
    #   num_rois    : number of RoIs generated by the RPN, used in Fast R-CNN.
    bbox = bboxes[0]                # shape (num_boxes, 4)
    label = labels[0]               # shape (num_boxes,)
    rpn_score = rpn_scores[0]       # shape (num_anchors,)
    rpn_loc = rpn_locs[0]           # shape (num_anchors, 4)
    roi = rois                      # shape (num_rois, 4)
    search_region = search_regions  # shape (num_rois, 4)

    # Sample RoIs and forward.
    # It's fine to break the computation graph of rois;
    # consider them constant input.
    sample_roi, sample_search_region, (Tx, Ty), gt_roi_label = \
        self.proposal_target_creator(roi, search_region,
                                     at.tonumpy(bbox), at.tonumpy(label))
    # NOTE it's all zero because it only supports batch=1 for now
    sample_roi_index = t.zeros(len(sample_roi))
    (px, py), roi_score = self.faster_rcnn.head(features,
                                                sample_roi,
                                                sample_search_region,
                                                sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
                                       gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.rpn_sigma)
    # NOTE: default value of ignore_index is -100 ...
    rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = px.shape[0]
    # (px, py) and (Tx, Ty) are used to calculate roi_loc_loss
    Tx = at.tovariable(Tx).float()
    Ty = at.tovariable(Ty).float()
    print("px is ", px)
    # print("max of px is ", t.max(px))
    # print("min of px is ", t.min(px))
    # print(t.max(Tx))
    # print(t.max(Ty))
    # print(Tx.shape, Ty.shape, px.shape, py.shape)
    roi_loc_loss = _LocNet_loss(Tx, Ty, px, py, gt_roi_label.data, self.roi_sigma)

    gt_roi_label = at.tovariable(gt_roi_label).long()
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    print("losses", losses)
    losses = losses + [sum(losses)]
    return LossTuple(*losses)  # return a namedtuple
        if opt.use_adam:
            self.optimizer = torch.optim.Adam(params)
        else:
            self.optimizer = torch.optim.SGD(params, momentum=0.9)
        return self.optimizer

    def scale_lr(self, decay=0.1):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] *= decay
        return self.optimizer


if __name__ == '__main__':
    img = np.ones((3, 5, 5), dtype=np.float32)
    b = array_tool.totensor(img[None]).float()

    loc_normalize_mean = (0., 0., 0., 0.)
    roi_cls_loc = np.ones((1, 84), dtype=np.float32)
    mean = torch.Tensor(loc_normalize_mean).cuda()
    mean = mean.repeat(21)[None]

    loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
    std = torch.Tensor(loc_normalize_std).cuda()
    std = std.repeat(21)[None]

    roi_cls_loc = array_tool.totensor(roi_cls_loc)
    roi_cls_loc = roi_cls_loc.data
    roi_cls_loc = (roi_cls_loc * std + mean)
    print(roi_cls_loc.size())
    roi_cls_loc = roi_cls_loc.view(-1, 21, 4)
    print(roi_cls_loc.size())

    roi = np.zeros((1, 4), dtype=np.float32)
    roi = array_tool.totensor(roi)
def train(**kwargs):
    # opt._parse(kwargs)
    print('load data')
    dataloader = get_train_loader(opt.root_dir,
                                  batch_size=opt.batch_size,
                                  shuffle=opt.shuffle,
                                  num_workers=opt.num_workers,
                                  pin_memory=opt.pin_memory)
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    # if opt.load_path:
    #     trainer.load(opt.load_path)
    #     print('load pretrained model from %s' % opt.load_path)
    # trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr

    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, sample in tqdm(enumerate(dataloader)):
            if len(sample.keys()) == 5:
                img_id, img, bbox_, scale, label_ = sample['img_id'], sample['image'], \
                    sample['bbox'], sample['scale'], sample['label']
                img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
                img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            else:
                img_id, img, bbox, scale, label = sample['img_id'], sample['image'], \
                    np.zeros((1, 0, 4)), sample['scale'], np.zeros((1, 0, 1))
                img = img.cuda().float()
                img = Variable(img)
            # if label.size == 0:
            #     continue

            scale = at.scalar(scale)
            trainer.train_step(img_id, img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())

        if epoch % 10 == 0:
            best_path = trainer.save(best_map=best_map)
def predict(self, imgs, sizes=None, visualize=False):  # prediction function
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format
            and the range of their value is :math:`[0, 255]`.

    Returns:
       tuple of lists:
       This method returns a tuple of three lists,
       :obj:`(bboxes, labels, scores)`.

       * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
           where :math:`R` is the number of bounding boxes in an image. \
           Each bounding box is organized by \
           :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
           in the second axis.
       * **labels** : A list of integer arrays of shape :math:`(R,)`. \
           Each value indicates the class of the bounding box. \
           Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
           number of the foreground classes.
       * **scores** : A list of float arrays of shape :math:`(R,)`. \
           Each value indicates how confident the prediction is.

    """
    self.eval()  # set the network to eval mode (BatchNorm and Dropout switch to inference behavior)
    if visualize:  # visualization branch (skipped in evaluation)
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()  # final output boxes
    labels = list()  # final output labels
    scores = list()  # final output scores
    for img, size in zip(prepared_imgs, sizes):
        img = at.totensor(img[None]).float()  # add a batch dimension
        scale = img.shape[3] / size[1]  # compute the scale (TBC)
        roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)  # forward pass
        # We are assuming that batch size is 1.
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale  # map rois back to the original image size (TBC)

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.n_class)[None]
        # Q: Some sources say ProposalCreator normalizes the coordinates, so
        #    they must be mapped back here, but I don't see that in the code.
        # A: I think "ProposalCreator normalizes the coordinates" is wrong.
        #    The de-normalization here is needed because the loc targets were
        #    normalized during training (by ProposalTargetCreator), so the
        #    predicted locs come out normalized; it is not ProposalCreator
        #    that normalizes them.
        roi_cls_loc = (roi_cls_loc * std + mean)  # de-normalize the locs
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        # each RoI has one loc per class, so expand roi to the same shape
        # for the per-class refinement below
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        # refine the boxes to obtain the final boxes
        cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # clip boxes that exceed the image bounds
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        # softmax yields per-class probabilities for every box
        prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        # suppress overlapping boxes given the class probabilities;
        # outputs coordinates, class, and the class probability
        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')  # restore the evaluation preset
    self.train()  # back to train mode
    return bboxes, labels, scores
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format
            and the range of their value is :math:`[0, 255]`.

    Returns:
       tuple of lists:
       This method returns a tuple of three lists,
       :obj:`(bboxes, labels, scores)`.

       * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
           where :math:`R` is the number of bounding boxes in an image. \
           Each bounding box is organized by \
           :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
           in the second axis.
       * **labels** : A list of integer arrays of shape :math:`(R,)`. \
           Each value indicates the class of the bounding box. \
           Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
           number of the foreground classes.
       * **scores** : A list of float arrays of shape :math:`(R,)`. \
           Each value indicates how confident the prediction is.

    """
    # Put the module in evaluation mode; this only affects modules such
    # as Dropout or BatchNorm.
    self.eval()
    if visualize:
        # visualization settings: evaluation mode and visualization mode
        # use different NMS and score thresholds
        self.nms_thresh = 0.3
        self.score_thresh = 0.7
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            # print('nei img shape is ', img.shape)
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()
    labels = list()
    scores = list()
    # sizes is e.g. [600, 800]
    # print('sizes is ', sizes)
    for img, size in zip(prepared_imgs, sizes):
        # img goes from [3, 600, 800] to [1, 3, 600, 800]: add a batch
        # dimension, wrap in a Variable, and mark it as inference-only
        img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
        # scale is 1 here
        scale = img.shape[3] / size[1]
        roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.n_class)[None]

        roi_cls_loc = (roi_cls_loc * std + mean)
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # clip bounding box
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images
            (batch size 1).
        bboxes (~torch.autograd.Variable): A batch of human-annotated
            ground-truth bounding boxes. Its shape is :math:`(N, R, 4)`.
        labels (~torch.autograd.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value is
            :math:`[0, L - 1]`. :math:`L` is the number of foreground classes.
        scale (float): Amount of scaling applied to the raw image during
            preprocessing.

    Returns:
        namedtuple of 5 losses
    """
    n = bboxes.shape[0]  # only batch size 1 is supported
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    # img_size is the (height, width) of the input image
    _, _, H, W = imgs.shape
    img_size = (H, W)

    # extract features with the (pretrained VGG16) extractor network
    features = self.faster_rcnn.extractor(imgs)

    # The RPN (region proposal network) takes the image features and
    # predicts the rois:
    # rpn_locs (1, 17316, 4), rpn_scores (1, 17316, 2), rois (2000, 4),
    # roi_indices (2000,) all zeros, anchor (17316, 4)
    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    # (i.e. squeeze the first dimension): bbox becomes (R, 4),
    # rpn_score (17316, 2) and rpn_loc (17316, 4)
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    # about 2000 rois
    roi = rois

    # Sample RoIs and forward.
    # It's fine to break the computation graph of rois; consider them a
    # constant input. proposal_target_creator takes the 2000 candidate
    # rois and the annotated bbox and generates training targets (used
    # only during training): it picks 128 of the 2000 rois, giving
    # sample_roi (128, 4), gt_roi_loc (128, 4) and gt_roi_label (128,)
    # (0 denotes background).
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # NOTE it's all zero because it only supports batch = 1 for now
    sample_roi_index = t.zeros(len(sample_roi))
    # The RoIHead classifies and refines the rois: for the rois proposed
    # by the RPN it decides whether they contain an object and corrects
    # box positions and coordinates. Inputs are the extracted features
    # and the 128 RoIs; outputs are roi_cls_loc (128, 84) for
    # localization and roi_score (128, 21) for classification
    # (20 classes plus background).
    roi_cls_loc, roi_score = self.faster_rcnn.head(features,
                                                   sample_roi,
                                                   sample_roi_index)

    # ------------------ RPN losses -------------------#
    # Assign the ground-truth bbox to the anchors and return the
    # corresponding locs and labels:
    # gt_rpn_loc (17316, 4), gt_rpn_label (17316,)
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox),
        anchor,
        img_size)
    # convert to Variables; labels as long
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    # smooth L1 loss for the RPN localization
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
                                       gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.rpn_sigma)
    # NOTE: default value of ignore_index is -100 ...
    # F is torch.nn.functional; classification uses cross-entropy
    rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    # add to the RPN confusion matrix
    self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    # n_sample is 128
    n_sample = roi_cls_loc.shape[0]
    # reshape to (128, 21, 4)
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # pick the loc predicted for each sample's ground-truth class
    roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                          at.totensor(gt_roi_label).long()]
    # gt_roi_label: true roi labels; gt_roi_loc: true roi locs
    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    # smooth L1 loss for the RoI localization
    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(),  # contiguous() makes the memory layout contiguous
        gt_roi_loc,
        gt_roi_label.data,
        self.roi_sigma)
    # cross-entropy loss for the RoI classification
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
    # add to the RoI confusion matrix
    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    # total loss
    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]
    # return a namedtuple of the four losses plus the total
    return LossTuple(*losses)
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format
            and the range of their value is :math:`[0, 255]`.

    Returns:
       tuple of lists:
       This method returns a tuple of three lists,
       :obj:`(bboxes, labels, scores)`.

       * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
           where :math:`R` is the number of bounding boxes in an image. \
           Each bounding box is organized by \
           :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
           in the second axis.
       * **labels** : A list of integer arrays of shape :math:`(R,)`. \
           Each value indicates the class of the bounding box. \
           Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
           number of the foreground classes.
       * **scores** : A list of float arrays of shape :math:`(R,)`. \
           Each value indicates how confident the prediction is.

    """
    self.eval()
    if visualize:
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]  # [H, W]; img.shape is [C, H, W]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()
    labels = list()
    scores = list()
    for img, size in zip(prepared_imgs, sizes):
        # img[None] adds a new axis: [C, H, W] -> [1, C, H, W]
        img = at.totensor(img[None]).float()
        scale = img.shape[3] / size[1]  # new_W / ori_W
        roi_cls_loc, roi_scores, rois, _ = self(img, scale)  # calls forward via __call__
        # assuming that batch size is 1
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).cuda().repeat(
            self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda().repeat(
            self.n_class)[None]
        roi_cls_loc = (roi_cls_loc * std + mean)
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # clip bounding box
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2].clamp(min=0, max=size[0]))
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2].clamp(min=0, max=size[1]))

        prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
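# A minimal usage sketch of predict(), assuming a trained FasterRCNNVGG16
# and an RGB image on disk (the file path and the weight-loading step are
# illustrative placeholders):
import numpy as np
from PIL import Image

img = Image.open('demo.jpg').convert('RGB')
img = np.asarray(img, dtype=np.float32).transpose(2, 0, 1)  # HWC -> CHW, values in [0, 255]

faster_rcnn = FasterRCNNVGG16()
# ... load trained weights here ...
bboxes, labels, scores = faster_rcnn.predict([img], visualize=True)
print(bboxes[0].shape)  # (R, 4) boxes in (y_min, x_min, y_max, x_max) order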
def train(**kwargs):
    opt._parse(kwargs)

    print('dataset = Dataset(opt)')
    transform = transforms.Compose([
        # you can add other transformations in this list
        transforms.ToTensor()
    ])
    dataset = Dataset(opt, transform=transform)
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers,
                                  )
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    print('faster_rcnn = FasterRCNNVGG16()')
    faster_rcnn = FasterRCNNVGG16()
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s at train.py line 70' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            print("tqdm(enumerate(dataloader)):")
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            print("train.py trainer.train_step(img, bbox, label, scale)")
            print(img.shape)
            print(bbox.shape)
            print(label.shape)
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                print("train.py line 94")
                print(trainer.get_meter_data())
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        trainer.vis.plot('test_map', eval_result['map'])
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        if epoch == 13:
            break
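# Note on the Compose above: transforms.ToTensor() converts an HxWxC uint8
# array (or PIL image) with values in [0, 255] into a CxHxW float tensor in
# [0.0, 1.0]. A quick self-contained check:
import numpy as np
from torchvision import transforms

tsf = transforms.Compose([transforms.ToTensor()])
arr = np.random.randint(0, 256, size=(4, 5, 3), dtype=np.uint8)  # H, W, C
out = tsf(arr)
print(out.shape, out.min().item(), out.max().item())  # torch.Size([3, 4, 5]), values in [0, 1]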
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    # 300w_dataset = FaceLandmarksDataset()
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')

    attacker = attacks.DCGAN(train_adv=False)
    if opt.load_attacker:
        attacker.load(opt.load_attacker)
        print('load attacker model from %s' % opt.load_attacker)

    trainer = VictimFasterRCNNTrainer(faster_rcnn, attacker, attack_mode=True).cuda()
    # trainer = VictimFasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    # eval_result = eval(test_dataloader, faster_rcnn, test_num=2000)
    best_map = 0
    for epoch in range(opt.epoch):
        trainer.reset_meters(adv=True)
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            # ipdb.set_trace()  # leftover debug breakpoint, disabled so training can run unattended
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            trainer.train_step(img, bbox, label, scale)

            if (ii) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())
                trainer.vis.plot_many(trainer.get_meter_data(adv=True))

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                if trainer.attacker is not None:
                    adv_img = trainer.attacker.perturb(img)
                    adv_img_ = inverse_normalize(at.tonumpy(adv_img[0]))
                    _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                        [adv_img_], visualize=True)
                    adv_pred_img = visdom_bbox(adv_img_,
                                               at.tonumpy(_bboxes[0]),
                                               at.tonumpy(_labels[0]).reshape(-1),
                                               at.tonumpy(_scores[0]))
                    trainer.vis.img('adv_img', adv_pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())
            if (ii) % 500 == 0:
                best_path = trainer.save(epochs=epoch, save_rcnn=True)

        if epoch % 2 == 0:
            best_path = trainer.save(epochs=epoch)
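# attacks.DCGAN and its perturb() are defined outside this snippet; the loop
# above only requires the interface sketched below. This is a pure assumption
# for illustration, not the actual implementation: a small generator produces
# an additive perturbation that is bounded and applied to the image batch.
import torch
import torch.nn as nn


class ToyAttacker(nn.Module):
    def __init__(self, epsilon=0.05):
        super(ToyAttacker, self).__init__()
        self.epsilon = epsilon
        self.net = nn.Conv2d(3, 3, kernel_size=3, padding=1)  # stand-in generator

    def perturb(self, img):
        # img: [N, 3, H, W]; returns the adversarially perturbed batch
        noise = torch.tanh(self.net(img)) * self.epsilon
        return img + noise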
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes.
            Its shape is :math:`(N, R, 4)`.
        labels (~torch.autograd.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value
            is :math:`[0, L - 1]`. :math:`L` is the number of foreground
            classes.
        scale (float): Amount of scaling applied to
            the raw image during preprocessing.

    Returns:
        namedtuple of 5 losses
    """
    n = bboxes.shape[0]  # batch size
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape  # image height and width
    img_size = (H, W)

    features = self.faster_rcnn.extractor(imgs)  # feature map

    # rpn_locs has shape (hh*ww*9, 4) and rpn_scores (hh*ww*9, 2); rois has
    # shape (2000, 4); roi_indices is unused here; anchor has shape
    # (hh*ww*9, 4). H and W are the post-preprocessing sizes. Objectness is
    # scored for (H/16) x (W/16) x 9 (roughly 20000) anchors; the top 12000
    # are kept and NMS reduces them to the 2000 approximate target boxes G^,
    # so roi has shape (2000, 4).
    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]           # [N, R, 4] -> [R, 4]
    label = labels[0]          # [N, R] -> [R]
    rpn_score = rpn_scores[0]  # hh*ww*9
    rpn_loc = rpn_locs[0]      # hh*ww*9
    roi = rois                 # [2000, 4]

    # Sample RoIs and forward
    # it's fine to break the computation graph of rois,
    # consider them as constant input
    # proposal_target_creator returns sample_roi (128, 4),
    # gt_roi_loc (128, 4) and gt_roi_label (128, 1). The RoI head takes
    # sample_roi plus the feature map as input and outputs the classification
    # (21 classes) and regression (further bbox refinement) predictions;
    # gt_roi_label / gt_roi_loc from ProposalTargetCreator are the matching
    # ground truths.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # NOTE it's all zero because only batch=1 is supported for now
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(features,
                                                   sample_roi,
                                                   sample_roi_index)

    # ------------------ RPN losses -------------------#
    # feed the anchors and bbox into anchor_target_creator to get the
    # offsets and labels of the anchors matched against bbox
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox),
        anchor,
        img_size)
    gt_rpn_label = at.totensor(gt_rpn_label).long()
    gt_rpn_loc = at.totensor(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
                                       gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.rpn_sigma)
    # NOTE: default value of ignore_index is -100 ...
    rpn_cls_loss = F.cross_entropy(rpn_score,
                                   gt_rpn_label.cuda(),
                                   ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    # roi_cls_loc is the output of VGG16RoIHead, with shape (128, 84);
    # n_sample = 128
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)  # [128, 21, 4]
    roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                          at.totensor(gt_roi_label).long()]
    gt_roi_label = at.totensor(gt_roi_label).long()
    gt_roi_loc = at.totensor(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(),
                                       gt_roi_loc,
                                       gt_roi_label.data,
                                       self.roi_sigma)
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]
    return LossTuple(*losses)
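# Why ignore_index=-1 matters here: anchor targets use -1 for "don't care"
# anchors, and cross_entropy must average only over the labeled ones. A small
# self-contained check (shapes are illustrative, not from the code above):
import torch
import torch.nn.functional as F

logits = torch.randn(5, 2)                # 5 anchors, background/foreground scores
target = torch.tensor([1, 0, -1, -1, 1])  # -1 marks ignored anchors
# wrong: clamping folds the ignored anchors into the background class
loss_wrong = F.cross_entropy(logits, target.clamp(min=0))
# right: -1 entries contribute nothing; the mean is over the 3 labeled anchors
loss_right = F.cross_entropy(logits, target, ignore_index=-1)
print(loss_wrong.item(), loss_right.item())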
def forward(self, imgs, bboxes, labels, scale):
    """Forward pass that computes the losses.

    Args:
        imgs: [N, C, H, W]
        bboxes: [N, R, 4]
        labels: [N, R]
        scale: a single scalar is enough
    Returns: the 5 losses"""
    num_batch = bboxes.shape[0]
    if num_batch != 1:
        raise ValueError("only batch_size=1 is supported")
    # get the image size H, W
    _, _, H, W = imgs.shape
    img_size = (H, W)
    # extract the feature map
    features = self.faster_rcnn.extractor(imgs)
    # run the RPN: predicted anchor offsets and scores, plus proposals
    rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
        features, img_size, scale)
    # since the batch size is 1, take the single element of each:
    bbox = bboxes[0]
    label = labels[0]
    rpn_loc = rpn_locs[0]
    rpn_score = rpn_scores[0]
    roi = rois
    # generate the ground-truth offsets and labels for the anchors
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        bbox=tonumpy(data=bbox), anchor=anchor, img_size=img_size)
    # generate the ground-truth offsets and labels for the sampled proposals
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi=roi, bbox=tonumpy(bbox), label=tonumpy(label),
        loc_normalize_mean=self.loc_normalize_mean,
        loc_normalize_std=self.loc_normalize_std)
    # with batch_size=1, every entry of sample_roi_index is 0
    sample_roi_index = torch.zeros(len(sample_roi))
    # predicted offsets and scores for the sampled proposals
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        x=features, rois=sample_roi, roi_indices=sample_roi_index)

    # ------------------------rpn loss----------------------------------#
    gt_rpn_label = totensor(data=gt_rpn_label).long()
    gt_rpn_loc = totensor(data=gt_rpn_loc)
    rpn_loc_loss = _faster_rcnn_loc_loss(pred_loc=rpn_loc,
                                         gt_loc=gt_rpn_loc,
                                         gt_label=gt_rpn_label.data,
                                         sigma=self.rpn_sigma)
    rpn_cls_loss = F.cross_entropy(input=rpn_score,
                                   target=gt_rpn_label.cuda(),
                                   ignore_index=-1)
    # the ground-truth labels, excluding the ignored (-1) entries
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = tonumpy(data=rpn_score)[tonumpy(data=gt_rpn_label) > -1]
    self.rpn_cm.add(predicted=totensor(data=_rpn_score, cuda=False),
                    target=_gt_rpn_label.data.long())

    # ---------------------roi loss---------------------------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # pick the predicted offsets of the class given by gt_roi_label
    roi_loc = roi_cls_loc[torch.arange(0, n_sample),
                          totensor(data=gt_roi_label).long()]
    gt_roi_loc = totensor(data=gt_roi_loc)
    gt_roi_label = totensor(data=gt_roi_label).long()
    roi_loc_loss = _faster_rcnn_loc_loss(pred_loc=roi_loc.contiguous(),
                                         gt_loc=gt_roi_loc,
                                         gt_label=gt_roi_label.data,
                                         sigma=self.roi_sigma)
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
    self.roi_cm.add(predicted=totensor(roi_score, False),
                    target=gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]
    return LossTuple(*losses)
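# The snippets keep shuttling data with tonumpy / totensor / scalar (imported
# as `at.` elsewhere). A plausible sketch of these helpers, assuming they
# mirror the usual utils/array_tool pattern in this kind of codebase:
import numpy as np
import torch


def tonumpy(data):
    # ndarray passes through; tensors are detached and copied to host memory
    if isinstance(data, np.ndarray):
        return data
    if torch.is_tensor(data):
        return data.detach().cpu().numpy()
    raise TypeError('unsupported type: %s' % type(data))


def totensor(data, cuda=True):
    if isinstance(data, np.ndarray):
        tensor = torch.from_numpy(data)
    elif torch.is_tensor(data):
        tensor = data.detach()
    else:
        raise TypeError('unsupported type: %s' % type(data))
    if cuda:
        tensor = tensor.cuda()
    return tensor


def scalar(data):
    # collapse a one-element array/tensor into a Python number
    if isinstance(data, np.ndarray):
        return data.reshape(1)[0]
    if torch.is_tensor(data):
        return data.view(1)[0].item()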
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    for epoch in range(opt.epoch):
        trainer.reset_meters()
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)
            trainer.train_step(img, bbox, label, scale)

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        if epoch == 13:
            break
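# The training loops above all call eval(test_dataloader, faster_rcnn, ...)
# and read eval_result['map']. A sketch of that helper, assuming a
# chainercv-style eval_detection_voc metric and the (imgs, sizes, bboxes,
# labels, difficults) tuple layout of the test loader; both are assumptions,
# not guaranteed by the snippets themselves.
def eval(dataloader, faster_rcnn, test_num=10000):
    pred_bboxes, pred_labels, pred_scores = [], [], []
    gt_bboxes, gt_labels, gt_difficults = [], [], []
    for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in enumerate(dataloader):
        sizes = [sizes[0][0].item(), sizes[1][0].item()]
        pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
        gt_bboxes += list(gt_bboxes_.numpy())
        gt_labels += list(gt_labels_.numpy())
        gt_difficults += list(gt_difficults_.numpy())
        pred_bboxes += pred_bboxes_
        pred_labels += pred_labels_
        pred_scores += pred_scores_
        if ii == test_num:
            break
    # eval_detection_voc returns a dict with per-class 'ap' and overall 'map'
    result = eval_detection_voc(
        pred_bboxes, pred_labels, pred_scores,
        gt_bboxes, gt_labels, gt_difficults,
        use_07_metric=True)
    return result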
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format
            and the range of their value is :math:`[0, 255]`.

    Returns:
       tuple of lists:
       This method returns a tuple of three lists,
       :obj:`(bboxes, labels, scores)`.

       * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
           where :math:`R` is the number of bounding boxes in an image. \
           Each bounding box is organized by \
           :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
           in the second axis.
       * **labels** : A list of integer arrays of shape :math:`(R,)`. \
           Each value indicates the class of the bounding box. \
           Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
           number of the foreground classes.
       * **scores** : A list of float arrays of shape :math:`(R,)`. \
           Each value indicates how confident the prediction is.
    """
    self.eval()
    if visualize:
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs
    bboxes = list()
    labels = list()
    scores = list()
    for img, size in zip(prepared_imgs, sizes):
        img = at.totensor(img[None]).float()
        scale = img.shape[3] / size[1]
        roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_scores = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = torch.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.n_class)[None]
        std = torch.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.n_class)[None]

        roi_cls_loc = (roi_cls_loc * std + mean)
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # clip bounding box
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        prob = at.tonumpy(F.softmax(at.totensor(roi_scores), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
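# The decoding step above hinges on loc2bbox. A NumPy sketch, assuming the
# standard R-CNN (dy, dx, dh, dw) parameterization over boxes given as
# (y_min, x_min, y_max, x_max):
import numpy as np


def loc2bbox(src_bbox, loc):
    # recover center/size of the source boxes
    src_h = src_bbox[:, 2] - src_bbox[:, 0]
    src_w = src_bbox[:, 3] - src_bbox[:, 1]
    src_ctr_y = src_bbox[:, 0] + 0.5 * src_h
    src_ctr_x = src_bbox[:, 1] + 0.5 * src_w
    dy, dx, dh, dw = loc[:, 0], loc[:, 1], loc[:, 2], loc[:, 3]
    # apply the offsets: shift the center, scale the size exponentially
    ctr_y = dy * src_h + src_ctr_y
    ctr_x = dx * src_w + src_ctr_x
    h = np.exp(dh) * src_h
    w = np.exp(dw) * src_w
    # back to corner coordinates
    dst_bbox = np.zeros(loc.shape, dtype=loc.dtype)
    dst_bbox[:, 0] = ctr_y - 0.5 * h
    dst_bbox[:, 1] = ctr_x - 0.5 * w
    dst_bbox[:, 2] = ctr_y + 0.5 * h
    dst_bbox[:, 3] = ctr_x + 0.5 * w
    return dst_bbox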
def train(**kwargs):
    opt._parse(kwargs)

    dataset = Dataset(opt)
    print('load data')
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    testset_all = TestDataset_all(opt, 'test2')
    test_all_dataloader = data_.DataLoader(testset_all,
                                           batch_size=1,
                                           num_workers=opt.test_num_workers,
                                           shuffle=False,
                                           pin_memory=True
                                           )

    tsf = Transform(opt.min_size, opt.max_size)
    faster_rcnn = FasterRCNNVGG16()
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    print('model construct completed')

    # load a previously trained model; just set its path in the config
    if opt.load_path:
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)

    # extract the soft labels needed for knowledge distillation
    if opt.is_distillation == True:
        opt.predict_socre = 0.3
        for ii, (imgs, sizes, gt_bboxes_, gt_labels_, scale,
                 id_) in tqdm(enumerate(dataloader)):
            if len(gt_bboxes_) == 0:
                continue
            sizes = [sizes[0][0].item(), sizes[1][0].item()]
            pred_bboxes_, pred_labels_, pred_scores_, features_ = trainer.faster_rcnn.predict(imgs, [sizes])
            img_file = os.path.join(opt.voc_data_dir, 'JPEGImages', id_[0] + '.jpg')
            ori_img = read_image(img_file, color=True)
            img, pred_bboxes_, pred_labels_, scale_ = tsf(
                (ori_img, pred_bboxes_[0], pred_labels_[0]))

            # drop soft labels that overlap the ground truth too much and
            # remove wrong soft labels
            pred_bboxes_, pred_labels_, pred_scores_ = py_cpu_nms(
                gt_bboxes_[0], gt_labels_[0], pred_bboxes_, pred_labels_,
                pred_scores_[0])

            # save the soft labels to disk, so they do not eat GPU memory
            np.save('label/' + str(id_[0]) + '.npy', pred_labels_)
            np.save('bbox/' + str(id_[0]) + '.npy', pred_bboxes_)
            np.save('feature/' + str(id_[0]) + '.npy', features_)
            np.save('score/' + str(id_[0]) + '.npy', pred_scores_)
        opt.predict_socre = 0.05
    t.cuda.empty_cache()

    # show all class label names in visdom
    trainer.vis.text(dataset.db.label_names, win='labels')
    best_map = 0
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        print('epoch=%d' % epoch)

        # reset the meters (confusion matrices etc.)
        trainer.reset_meters()
        # tqdm wraps any iterator to show a progress bar in long loops;
        # it is fast and easy to extend
        for ii, (img, sizes, bbox_, label_, scale,
                 id_) in tqdm(enumerate(dataloader)):
            if len(bbox_) == 0:
                continue
            scale = at.scalar(scale)
            # the actual training happens in train_step below; everything
            # after it is logging. Convert the data to tensors PyTorch can use.
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            if opt.is_distillation == True:
                # load the soft labels
                teacher_pred_labels = np.load('label/' + str(id_[0]) + '.npy')
                teacher_pred_bboxes = np.load('bbox/' + str(id_[0]) + '.npy')
                teacher_pred_features_ = np.load('feature/' + str(id_[0]) + '.npy')
                teacher_pred_scores = np.load('score/' + str(id_[0]) + '.npy')
                # dtype conversion
                teacher_pred_bboxes = teacher_pred_bboxes.astype(np.float32)
                teacher_pred_labels = teacher_pred_labels.astype(np.int32)
                teacher_pred_scores = teacher_pred_scores.astype(np.float32)
                # convert to PyTorch tensors
                teacher_pred_bboxes_ = at.totensor(teacher_pred_bboxes)
                teacher_pred_labels_ = at.totensor(teacher_pred_labels)
                teacher_pred_scores_ = at.totensor(teacher_pred_scores)
                teacher_pred_features_ = at.totensor(teacher_pred_features_)
                # move to the GPU
                teacher_pred_bboxes_ = teacher_pred_bboxes_.cuda()
                teacher_pred_labels_ = teacher_pred_labels_.cuda()
                teacher_pred_scores_ = teacher_pred_scores_.cuda()
                teacher_pred_features_ = teacher_pred_features_.cuda()

                # if the Transform in dataset.py applies random flipping,
                # this check tells whether the soft labels have to be
                # flipped together with the image
                if (teacher_pred_bboxes_[0][1] != bbox[0][0][1]):
                    _, o_C, o_H, o_W = img.shape
                    teacher_pred_bboxes_ = flip_bbox(
                        teacher_pred_bboxes_, (o_H, o_W), x_flip=True)

                losses = trainer.train_step(img, bbox, label, scale, epoch,
                                            teacher_pred_bboxes_,
                                            teacher_pred_labels_,
                                            teacher_pred_features_,
                                            teacher_pred_scores_)
            else:
                trainer.train_step(img, bbox, label, scale, epoch)

            # information shown in visdom
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(teacher_pred_bboxes_),
                                     at.tonumpy(teacher_pred_labels_),
                                     at.tonumpy(teacher_pred_scores_))
                trainer.vis.img('gt_img_all', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores, _ = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # confusion matrices
                # rpn confusion matrix(meter)
                trainer.vis.text(
                    str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.text(
                    str(trainer.roi_cm.value().tolist()), win='roi_cm')
                # trainer.vis.img('roi_cm', at.totensor(
                #     trainer.roi_cm.value(), False).float())

        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{},ap:{}, map:{},loss:{}'.format(str(lr_),
                                                        str(eval_result['ap']),
                                                        str(eval_result['map']),
                                                        str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        # save the best result and remember its path
        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        if epoch == 20:
            trainer.save(best_map='20')
            result = eval(test_all_dataloader, trainer.faster_rcnn, test_num=5000)
            print('20result={}'.format(str(result)))
            # trainer.load(best_path)
            # result = eval(test_all_dataloader, trainer.faster_rcnn, test_num=5000)
            # print('bestmapresult={}'.format(str(result)))
            break
        # periodically reload the best weights and decay the learning rate
        # (here: when epoch % 20 == 15)
        if epoch % 20 == 15:
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay
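# The flip check above calls flip_bbox to mirror the teacher's boxes when the
# image was horizontally flipped. A sketch, assuming chainercv-style
# (y_min, x_min, y_max, x_max) boxes; it works on either an ndarray or a
# tensor (the clone/copy branch is an assumption for illustration):
def flip_bbox(bbox, size, y_flip=False, x_flip=False):
    H, W = size
    bbox = bbox.clone() if hasattr(bbox, 'clone') else bbox.copy()
    if y_flip:
        y_max = H - bbox[:, 0]
        y_min = H - bbox[:, 2]
        bbox[:, 0] = y_min
        bbox[:, 2] = y_max
    if x_flip:
        x_max = W - bbox[:, 1]
        x_min = W - bbox[:, 3]
        bbox[:, 1] = x_min
        bbox[:, 3] = x_max
    return bbox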
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes.
            Its shape is :math:`(N, R, 4)`.
        labels (~torch.autograd.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value
            is :math:`[0, L - 1]`. :math:`L` is the number of foreground
            classes.
        scale (float): Amount of scaling applied to
            the raw image during preprocessing.

    Returns:
        namedtuple of 5 losses
    """
    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    features = self.faster_rcnn.extractor(imgs)

    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois

    # Sample RoIs and forward
    # it's fine to break the computation graph of rois,
    # consider them as constant input
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # NOTE it's all zero because only batch=1 is supported for now
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(features,
                                                   sample_roi,
                                                   sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox),
        anchor,
        img_size)
    gt_rpn_label = at.totensor(gt_rpn_label).long()
    gt_rpn_loc = at.totensor(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc,
                                       gt_rpn_loc,
                                       gt_rpn_label.data,
                                       self.rpn_sigma)

    # NOTE: default value of ignore_index is -100 ...
    rpn_cls_loss = F.cross_entropy(rpn_score,
                                   gt_rpn_label.cuda(),
                                   ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False),
                    _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                          at.totensor(gt_roi_label).long()]
    gt_roi_label = at.totensor(gt_roi_label).long()
    gt_roi_loc = at.totensor(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(),
                                       gt_roi_loc,
                                       gt_roi_label.data,
                                       self.roi_sigma)
    # weight = t.Tensor([10,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]).cuda()
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

    self.roi_cm.add(at.totensor(roi_score, False),
                    gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]
    return LossTuple(*losses)
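# The commented-out `weight` tensor above hints at per-class re-weighting of
# the RoI classification loss. nn.CrossEntropyLoss accepts a weight vector of
# length n_class for exactly this; a minimal sketch with illustrative values
# (the 0.5 for background is an assumption, not from the code above):
import torch
import torch.nn as nn

n_class = 21                     # 20 foreground classes + background
weight = torch.ones(n_class)
weight[0] = 0.5                  # e.g. down-weight background (index 0)
criterion = nn.CrossEntropyLoss(weight=weight.cuda())
# roi_cls_loss = criterion(roi_score, gt_roi_label.cuda())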
def train(**kwargs):
    # *args collects any number of positional arguments into a tuple;
    # **kwargs collects keyword arguments into a dict
    opt._parse(kwargs)  # parse the options; kwargs arrives as a dict

    dataset = Dataset(opt)  # the repo's own Dataset class
    print('loading data...')
    # DataLoader defines how one batch of data is fetched:
    # PyTorch's built-in DataLoader wraps the dataset in a multi-worker
    # iterator that yields one batch at a time
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt, split='trainval')
    # test-set loader
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True
                                       )
    faster_rcnn = FasterRCNNVGG16()  # build the network
    print('model construction finished!')
    # build a trainer that returns the losses; .cuda() moves its tensors onto the GPU
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:  # optionally load a pretrained model
        trainer.load(opt.load_path)
        print('loaded pretrained parameters from %s' % opt.load_path)
    else:
        print('no pretrained parameters given; network weights are randomly initialized')
    trainer.vis.text(dataset.db.label_names, win='labels')  # show the label names
    best_map = 0  # track the best mAP
    for epoch in range(opt.epoch):  # for each epoch
        trainer.reset_meters()  # reset all the meters
        # for each training sample
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)  # convert to a Python scalar
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()  # move to GPU
            img, bbox, label = Variable(img), Variable(bbox), Variable(label)  # wrap for autograd
            # TODO
            trainer.train_step(img, bbox, label, scale)  # one training step

            if (ii + 1) % opt.plot_every == 0:  # every plot_every iterations
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict(
                    [ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix(meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix
                trainer.vis.img('roi_cm',
                                at.totensor(trainer.roi_cm.conf, False).float())
        # evaluate the model on the test set (prediction happens inside)
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)

        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            # save the model whenever it improves, into the checkpoints folder
            best_path = trainer.save(best_map=best_map)
        if epoch == 9:  # at epoch 9, reload the best model and decay the learning rate
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)

        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)
        # if epoch == 13:  # stop training after the 14th epoch
        #     break
    trainer.save(best_map=best_map)
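# These train(**kwargs) entry points are meant to be driven from the command
# line. A minimal launcher, assuming the Fire library as in the original
# simple-faster-rcnn-pytorch repo (an assumption; any CLI parser would do):
if __name__ == '__main__':
    import fire
    fire.Fire()
# usage, e.g.:
#   python train.py train --env=fasterrcnn --plot-every=100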
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes.
            Its shape is :math:`(N, R, 4)`.
        labels (~torch.autograd.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value
            is :math:`[0, L - 1]`. :math:`L` is the number of foreground
            classes.
        scale (float): Amount of scaling applied to
            the raw image during preprocessing.

    Returns:
        namedtuple of 5 losses
    """
    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    features = self.faster_rcnn.extractor(imgs)

    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois

    # Sample RoIs and forward
    # it's fine to break the computation graph of rois,
    # consider them as constant input
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # NOTE it's all zero because only batch=1 is supported for now
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox),
        anchor,
        img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc,
        gt_rpn_loc,
        gt_rpn_label.data,
        self.rpn_sigma)

    # NOTE: default value of ignore_index is -100 ...
    rpn_cls_loss = F.cross_entropy(rpn_score,
                                   gt_rpn_label.cuda(),
                                   ignore_index=-1)
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False),
                    _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                          at.totensor(gt_roi_label).long()]
    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(),
        gt_roi_loc,
        gt_roi_label.data,
        self.roi_sigma)

    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

    self.roi_cm.add(at.totensor(roi_score, False),
                    gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]

    return LossTuple(*losses)
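# rpn_cm / roi_cm behave like torchnet's ConfusionMeter (an assumption based
# on the .add(...) / .value() / .conf usage above). Minimal standalone usage:
from torchnet import meter
import torch

rpn_cm = meter.ConfusionMeter(2)  # 2 classes: background / foreground
scores = torch.tensor([[2.0, 1.0],
                       [0.2, 3.0]])          # per-sample class scores (N x K)
targets = torch.tensor([0, 1])               # ground-truth class indices
rpn_cm.add(scores, targets)
print(rpn_cm.value())                        # 2x2 confusion matrix (ndarray)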
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format
            and the range of their value is :math:`[0, 255]`.

    Returns:
       tuple of lists:
       This method returns a tuple of three lists,
       :obj:`(bboxes, labels, scores)`.

       * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
           where :math:`R` is the number of bounding boxes in an image. \
           Each bounding box is organized by \
           :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
           in the second axis.
       * **labels** : A list of integer arrays of shape :math:`(R,)`. \
           Each value indicates the class of the bounding box. \
           Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
           number of the foreground classes.
       * **scores** : A list of float arrays of shape :math:`(R,)`. \
           Each value indicates how confident the prediction is.
    """
    self.eval()
    if visualize:
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs
    bboxes = list()
    labels = list()
    scores = list()
    for img, size in zip(prepared_imgs, sizes):
        img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
        scale = img.shape[3] / size[1]
        roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.n_class)[None]

        roi_cls_loc = (roi_cls_loc * std + mean)
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # clip bounding box
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
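# Every predict() variant above funnels its inputs through preprocess(). A
# sketch of it, assuming the usual scheme in this kind of codebase: scale the
# shorter side toward min_size, cap the longer side at max_size, then apply
# ImageNet-style normalization (the mean/std values are an assumption):
import numpy as np
from skimage import transform as sktsf


def preprocess(img, min_size=600, max_size=1000):
    # img: CHW float array in [0, 255]
    C, H, W = img.shape
    scale = min(min_size / min(H, W), max_size / max(H, W))
    img = img / 255.
    img = sktsf.resize(img, (C, int(H * scale), int(W * scale)),
                       mode='reflect', anti_aliasing=False)
    mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
    std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
    return ((img - mean) / std).astype(np.float32)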