                side_loss_list.append(total_o_reg_loss)
                total_loss = 0
                total_cls_loss = 0
                total_v_reg_loss = 0
                total_o_reg_loss = 0
                start_time = time.time()

            # Periodically evaluate model performance on the validation set
            if iteration % val_iter == 0:
                net.eval()
                with torch.no_grad():
                    logger.info('Start evaluate at {0} epoch {1} iteration.'.format(i, iteration))
                    val_loss = evaluate.val(net, criterion, val_batch_size, using_cuda, logger, val_im_list)  # validation-set evaluation
                    logger.info('End evaluate.')
                net.train()
                start_time = time.time()
                test_loss_list.append(val_loss)
                draw_loss_plot(train_loss_list, test_loss_list)
                train_loss_plot(clc_loss_list, v_loss_list, side_loss_list)

            # Periodically save the model
            # if iteration % save_iter == 0:
            #     print('Model saved at ./model/ctpn-{0}-{1}.model'.format(i, iteration))
            #     torch.save(net.state_dict(), os.path.join(MODEL_SAVE_PATH, 'ctpn-msra_ali-{0}-{1}.model'.format(i, iteration)))
            # torch.cuda.empty_cache()

        state = {
            'net': net.state_dict(),
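# --- Hypothetical sketch, not part of the original project --------------------
# draw_loss_plot() and train_loss_plot() are called above but are not defined in
# this excerpt. Assuming they simply dump the accumulated loss curves to image
# files, a minimal matplotlib implementation could look like the following; the
# file names and axis labels are illustrative guesses.
import matplotlib
matplotlib.use('Agg')  # headless backend, suitable for training servers
import matplotlib.pyplot as plt


def draw_loss_plot(train_loss_list, test_loss_list, save_path='./loss_curve.png'):
    """Plot training loss against validation loss and save the figure."""
    plt.figure()
    plt.plot(train_loss_list, label='train loss')
    plt.plot(test_loss_list, label='val loss')
    plt.xlabel('logging step')
    plt.ylabel('loss')
    plt.legend()
    plt.savefig(save_path)
    plt.close()


def train_loss_plot(clc_loss_list, v_loss_list, side_loss_list, save_path='./loss_components.png'):
    """Plot the three CTPN loss components: classification, vertical regression, side refinement."""
    plt.figure()
    plt.plot(clc_loss_list, label='cls loss')
    plt.plot(v_loss_list, label='v_reg loss')
    plt.plot(side_loss_list, label='side_refinement loss')
    plt.xlabel('logging step')
    plt.ylabel('loss')
    plt.legend()
    plt.savefig(save_path)
    plt.close()
# ------------------------------------------------------------------------------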
                writer.add_scalar('G_loss_total/inte_loss', inte_l, global_step=step)
                writer.add_scalar('G_loss_total/grad_loss', grad_l, global_step=step)
                writer.add_scalar('psnr/train_psnr', psnr, global_step=step)

            if step % int(train_cfg.iters / 100) == 0:
                writer.add_image('image/G_frame', save_G_frame, global_step=step)
                writer.add_image('image/target', save_target, global_step=step)

            if step % train_cfg.save_interval == 0:
                model_dict = {'net_g': generator.state_dict(), 'optimizer_g': optimizer_G.state_dict(),
                              'net_d': discriminator.state_dict(), 'optimizer_d': optimizer_D.state_dict()}
                torch.save(model_dict, f'weights/{train_cfg.dataset}_{step}.pth')
                print(f'\nAlready saved: \'{train_cfg.dataset}_{step}.pth\'.')

            if step % train_cfg.val_interval == 0:
                auc = val(train_cfg, model=generator)
                writer.add_scalar('results/auc', auc, global_step=step)
                generator.train()

            step += 1
            if step > train_cfg.iters:
                training = False
                model_dict = {'net_g': generator.state_dict(), 'optimizer_g': optimizer_G.state_dict(),
                              'net_d': discriminator.state_dict(), 'optimizer_d': optimizer_D.state_dict()}
                torch.save(model_dict, f'weights/latest_{train_cfg.dataset}_{step}.pth')
                break

except KeyboardInterrupt:
    print(f'\nStop early, model saved: \'latest_{train_cfg.dataset}_{step}.pth\'.\n')

    if glob(f'weights/latest*'):
def train():
    net = ctpn.CTPN()
    # Freeze the layers listed in no_grad; train everything else.
    for name, value in net.named_parameters():
        if name in no_grad:
            value.requires_grad = False
        else:
            value.requires_grad = True
    utils.init_weight(net)
    if using_cuda:
        net.cuda()
    net.train()

    criterion = LOSS.CTPN_Loss(using_cuda=using_cuda)

    # 80/20 train/validation split of the OCR dataset.
    full_data = OCRD('./data/easy/pic/', './data/easy/ocr.json')
    train_size = int(0.8 * len(full_data))
    val_size = len(full_data) - train_size
    train_data, val_data = torch.utils.data.random_split(full_data, [train_size, val_size])
    train_loader = DataLoader(train_data, 1, shuffle=True)
    val_loader = DataLoader(val_data, 1, shuffle=True)

    train_loss_list = []
    test_loss_list = []

    for i in range(epoch):
        # Switch the learning rate once i passes epoch_change.
        if i > epoch_change:
            lr = lr_behind
        else:
            lr = lr_front
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=0.0005)

        iteration = 0
        total_loss = 0
        total_cls_loss = 0
        total_v_reg_loss = 0
        total_o_reg_loss = 0
        total_iter = len(train_loader)
        start = time.time()

        for img, tag, filename in train_loader:
            # img_pil = Image.open(filename[0])
            train_logger.info("epoch {} iteration {}".format(i, iteration))
            tensor_img = img.permute((0, 3, 1, 2))  # NHWC -> NCHW
            img = torch.squeeze(img, 0)
            if using_cuda:
                tensor_img = tensor_img.to(dtype=torch.float).cuda()
            else:
                tensor_img = tensor_img.to(dtype=torch.float)

            vertical_pred, score, side_refinement = net(tensor_img)
            del tensor_img

            # Collect anchor targets for every ground-truth box.
            positive = []
            negative = []
            vertical_reg = []
            side_refinement_reg = []
            try:
                for box in tag:
                    gt_anchor = generate_anchor(img, box)
                    positive1, negative1, vertical_reg1, side_refinement_reg1 = tag_anchor(gt_anchor, score, box)
                    positive += positive1
                    negative += negative1
                    vertical_reg += vertical_reg1
                    side_refinement_reg += side_refinement_reg1
            except Exception as e:
                train_logger.warning("the error is %s" % e)
                train_logger.warning("warning: img %s raised an error" % filename)
                iteration += 1
                continue

            # Skip images that produced no usable anchors.
            if len(vertical_reg) == 0 or len(positive) == 0 or len(side_refinement_reg) == 0:
                iteration += 1
                continue

            optimizer.zero_grad()
            loss, cls_loss, v_reg_loss, o_reg_loss = criterion(score, vertical_pred, side_refinement,
                                                               positive, negative, vertical_reg,
                                                               side_refinement_reg)
            loss.backward()
            optimizer.step()
            iteration += 1

            total_loss += float(loss)
            total_cls_loss += float(cls_loss)
            total_v_reg_loss += float(v_reg_loss)
            total_o_reg_loss += float(o_reg_loss)

            if iteration % display_iter == 0:
                end = time.time()
                total_time = end - start  # elapsed time since the last display
                train_logger.info('Epoch: {2}/{3}, Iteration: {0}/{1}, loss: {4}, cls_loss: {5}, '
                                  'v_reg_loss: {6}, o_reg_loss: {7}, {8}'.format(
                                      iteration, total_iter, i, epoch,
                                      total_loss / display_iter, total_cls_loss / display_iter,
                                      total_v_reg_loss / display_iter, total_o_reg_loss / display_iter,
                                      filename))
                train_loss_list.append(total_loss)
                total_loss = 0
                total_cls_loss = 0
                total_v_reg_loss = 0
                total_o_reg_loss = 0
                start = time.time()

            if iteration % val_iter == 0:
                net.eval()
                train_logger.info("start evaluate at {} epoch {} iteration".format(i, iteration))
                val_loss = evaluate.val(net, criterion, val_batch_size, using_cuda, train_logger, val_loader)
                train_logger.info('End evaluate.')
                net.train()
                start = time.time()
                test_loss_list.append(val_loss)

        train_logger.info('Model saved at ./output/ctpn-{0}-end.model'.format(i))
        torch.save(net.state_dict(), os.path.join(MODEL_SAVE_PATH, 'ctpn-msra_ali-{0}-end.model'.format(i)))
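# --- Alternative sketch, not the original code --------------------------------
# The loop above re-creates the SGD optimizer at every epoch just to switch the
# learning rate from lr_front to lr_behind once i > epoch_change. An equivalent
# pattern keeps a single optimizer and lets a LambdaLR scheduler do the switch;
# net, lr_front, lr_behind, epoch and epoch_change refer to the same objects and
# config values used above.
import torch.optim as optim

optimizer = optim.SGD(net.parameters(), lr=lr_front, momentum=0.9, weight_decay=0.0005)
scheduler = optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda e: lr_behind / lr_front if e > epoch_change else 1.0)

for i in range(epoch):
    ...  # one epoch of training with `optimizer`, exactly as in train() above
    scheduler.step()  # advance the schedule once per epoch
# ------------------------------------------------------------------------------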
    def train(self):
        step = self.start_iter
        writer = SummaryWriter(f'tensorboard_log/{self.dataset}_bs{self.batch_size}')
        training = True
        self.generator.train()
        self.discriminator.train()

        # Keep cycling over the dataloader until self.iters steps have been run.
        while training:
            for indice, clips, flow_strs in self.train_dataloader:
                input_frames = clips[:, 0:12, :, :].cuda()   # (n, 12, 256, 256)
                target_frame = clips[:, 12:15, :, :].cuda()  # (n, 3, 256, 256)
                input_last = input_frames[:, 9:12, :, :].cuda()  # used for flow_loss

                # pop() the used frame index; this can't be done in train_dataset.__getitem__
                # because of multiprocessing.
                # for index in indice:
                #     self.train_dataset.all_seqs[index].pop()
                #     if len(self.train_dataset.all_seqs[index]) == 0:
                #         self.train_dataset.all_seqs[index] = list(range(len(train_dataset.videos[index]) - 4))
                #         random.shuffle(self.train_dataset.all_seqs[index])

                G_frame, flow_gt, flow_pred = self.model.forward(input_frames, target_frame, input_last)

                if self.show_flow:
                    flow = np.array(flow_gt.cpu().detach().numpy().transpose(0, 2, 3, 1), np.float32)  # to (n, w, h, 2)
                    for i in range(flow.shape[0]):
                        aa = flow_to_color(flow[i], convert_to_bgr=False)
                        path = self.train_data.split('/')[-3] + '_' + flow_strs[i]
                        cv2.imwrite(f'images/{path}.jpg', aa)  # e.g. images/avenue_4_574-575.jpg
                        print(f'Saved a sample optic flow image from gt frames: \'images/{path}.jpg\'.')

                # Generator losses: intensity, gradient, optical-flow and adversarial terms.
                inte_l = self.intensity_loss(G_frame, target_frame)
                grad_l = self.gradient_loss(G_frame, target_frame)
                fl_l = self.flow_loss(flow_pred, flow_gt)
                g_l = self.adversarial_loss(self.discriminator(G_frame))
                G_l_t = 1. * inte_l + 1. * grad_l + 2. * fl_l + 0.05 * g_l

                # When training the discriminator, don't train the generator, so use .detach() to cut off gradients.
                D_l = self.discriminate_loss(self.discriminator(target_frame), self.discriminator(G_frame.detach()))

                # https://github.com/pytorch/pytorch/issues/39141
                # Since PyTorch 1.5, torch.optim optimizers detect in-place modification of module parameters.
                # Doing it this way:
                # ----------------------------------------
                # optimizer_D.zero_grad()
                # D_l.backward()
                # optimizer_D.step()
                # optimizer_G.zero_grad()
                # G_l_t.backward()
                # optimizer_G.step()
                # ----------------------------------------
                # fails because optimizer_D.step() modifies the discriminator parameters in place,
                # but those parameters are still needed to compute the generator's gradients.
                # So make sure no parameters are modified before calling .step(), like this:
                # ----------------------------------------
                # optimizer_G.zero_grad()
                # G_l_t.backward()
                # optimizer_G.step()
                # optimizer_D.zero_grad()
                # D_l.backward()
                # optimizer_D.step()
                # ----------------------------------------
                # or simply call .step() only after all the gradients have been computed, as below:
                self.optimizer_D.zero_grad()
                D_l.backward()
                self.optimizer_G.zero_grad()
                G_l_t.backward()
                self.optimizer_D.step()
                self.optimizer_G.step()

                torch.cuda.synchronize()
                time_end = time.time()
                if step > self.start_iter:  # This doesn't include the testing time during training.
                    iter_t = time_end - temp
                temp = time_end

                if step != self.start_iter:
                    if step % 20 == 0:
                        time_remain = (self.iters - step) * iter_t
                        eta = str(datetime.timedelta(seconds=time_remain)).split('.')[0]
                        psnr = psnr_error(G_frame, target_frame)
                        lr_g = self.optimizer_G.param_groups[0]['lr']
                        lr_d = self.optimizer_D.param_groups[0]['lr']
                        print(f"[{step}] inte_l: {inte_l:.3f} | grad_l: {grad_l:.3f} | fl_l: {fl_l:.3f} | "
                              f"g_l: {g_l:.3f} | G_l_total: {G_l_t:.3f} | D_l: {D_l:.3f} | psnr: {psnr:.3f} | "
                              f"iter: {iter_t:.3f}s | ETA: {eta} | lr: {lr_g} {lr_d}")

                        # Denormalize from [-1, 1] to [0, 1] and reorder channels BGR -> RGB for TensorBoard.
                        save_G_frame = ((G_frame[0] + 1) / 2)
                        save_G_frame = save_G_frame.cpu().detach()[(2, 1, 0), ...]
                        save_target = ((target_frame[0] + 1) / 2)
                        save_target = save_target.cpu().detach()[(2, 1, 0), ...]

                        writer.add_scalar('psnr/train_psnr', psnr, global_step=step)
                        writer.add_scalar('total_loss/g_loss_total', G_l_t, global_step=step)
                        writer.add_scalar('total_loss/d_loss', D_l, global_step=step)
                        writer.add_scalar('G_loss_total/g_loss', g_l, global_step=step)
                        writer.add_scalar('G_loss_total/fl_loss', fl_l, global_step=step)
                        writer.add_scalar('G_loss_total/inte_loss', inte_l, global_step=step)
                        writer.add_scalar('G_loss_total/grad_loss', grad_l, global_step=step)

                    if step % int(self.iters / 100) == 0:
                        writer.add_image('image/G_frame', save_G_frame, global_step=step)
                        writer.add_image('image/target', save_target, global_step=step)

                    if step % self.save_interval == 0:
                        model_dict = {'net_g': self.generator.state_dict(),
                                      'optimizer_g': self.optimizer_G.state_dict(),
                                      'net_d': self.discriminator.state_dict(),
                                      'optimizer_d': self.optimizer_D.state_dict()}
                        torch.save(model_dict, f'weights/{self.dataset}_{step}.pth')
                        print(f'\nAlready saved: \'{self.dataset}_{step}.pth\'.')

                    if step % self.val_interval == 0:
                        auc = val(model=self.generator)
                        writer.add_scalar('results/auc', auc, global_step=step)
                        self.generator.train()

                step += 1
                if step > self.iters:
                    training = False
                    model_dict = {'net_g': self.generator.state_dict(),
                                  'optimizer_g': self.optimizer_G.state_dict(),
                                  'net_d': self.discriminator.state_dict(),
                                  'optimizer_d': self.optimizer_D.state_dict()}
                    torch.save(model_dict, f'weights/latest_{self.dataset}_{step}.pth')
                    break
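# --- Minimal standalone sketch of the deferred-.step() pattern ----------------
# This toy example (not from the project above) illustrates the ordering that the
# long comment in train() describes: run every backward pass first, then call the
# optimizers' .step(), so neither step() modifies parameters that the other
# backward pass still needs. G, D, opt_G, opt_D and the losses are hypothetical
# stand-ins for the real generator, discriminator and loss terms.
import torch
import torch.nn as nn
import torch.nn.functional as F

G = nn.Linear(8, 8)   # toy "generator"
D = nn.Linear(8, 1)   # toy "discriminator"
opt_G = torch.optim.Adam(G.parameters(), lr=1e-4)
opt_D = torch.optim.Adam(D.parameters(), lr=1e-4)

x = torch.randn(4, 8)      # toy input batch
real = torch.randn(4, 8)   # toy "real" target batch

fake = G(x)
# Generator loss: try to make D classify the fake batch as real.
g_loss = F.binary_cross_entropy_with_logits(D(fake), torch.ones(4, 1))
# Discriminator loss: detach the fake so no gradient flows back into G.
d_loss = (F.binary_cross_entropy_with_logits(D(real), torch.ones(4, 1)) +
          F.binary_cross_entropy_with_logits(D(fake.detach()), torch.zeros(4, 1)))

opt_D.zero_grad()
d_loss.backward()
opt_G.zero_grad()
g_loss.backward()   # D's parameters are still unmodified here, so this backward is valid
opt_D.step()        # step only after all gradients have been computed
opt_G.step()
# ------------------------------------------------------------------------------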