def train(train_img_path, train_gt_path, pths_path, batch_size, lr, num_workers, epoch_iter, interval, output_dir):
    """Set up training for the EAST text detector.

    NOTE(review): this block ends right after the LR scheduler is built; the
    actual epoch loop is not visible in this chunk.
    """
    # Seed CPU and current-GPU RNGs so results are reproducible.
    torch.manual_seed(970201)       # CPU seed
    torch.cuda.manual_seed(970201)  # seed for the current GPU
    logger = setup_logger("east_matrix", output_dir, get_rank())
    file_num = len(os.listdir(train_img_path))  # number of training images
    trainset = custom_dataset(train_img_path, train_gt_path)  # training set with preprocessing
    # DataLoader combines the dataset with a sampler and yields an iterable of batches.
    train_loader = data.DataLoader(trainset, batch_size=batch_size, \
                                   shuffle=True, num_workers=num_workers, drop_last=True)
    criterion = Loss()  # EAST loss
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = EAST()  # network model
    # Wrap in DataParallel when more than one GPU is available.
    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    # Place the model on GPU or CPU according to `device`.
    model.to(device)
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # LR schedule: decay to one tenth at the halfway point of training.
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[epoch_iter//2], gamma=0.1)
def train(train_img_path, train_gt_path, pths_path, batch_size, lr, num_workers, epoch_iter, interval, pretrained_pth):
    """Train the EAST detector, checkpointing every `interval` epochs.

    Args:
        train_img_path: directory of training images.
        train_gt_path: directory of ground-truth annotation files.
        pths_path: directory where checkpoints are written.
        batch_size, lr, num_workers: usual training hyper-parameters.
        epoch_iter: total number of training epochs.
        interval: checkpoint-saving period, in epochs.
        pretrained_pth: optional path to a state dict to warm-start from.
    """
    file_num = len(os.listdir(train_img_path))
    trainset = custom_dataset(train_img_path, train_gt_path)
    train_loader = data.DataLoader(trainset, batch_size=batch_size,
                                   shuffle=True, num_workers=num_workers, drop_last=True)
    criterion = Loss()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = EAST()
    # Fix: `pretrained_pth` was accepted but silently ignored — the
    # commented-out loader referenced a non-existent `pretrained_path` name.
    if pretrained_pth:
        model.load_state_dict(torch.load(pretrained_pth))
    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Decay the LR by 10x halfway through training.
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[epoch_iter // 2], gamma=0.1)

    for epoch in range(epoch_iter):
        model.train()
        epoch_loss = 0
        epoch_time = time.time()
        for i, (img, gt_score, gt_geo, ignored_map) in enumerate(train_loader):
            start_time = time.time()
            img, gt_score, gt_geo, ignored_map = img.to(device), gt_score.to(
                device), gt_geo.to(device), ignored_map.to(device)
            pred_score, pred_geo = model(img)
            loss = criterion(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
            epoch_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format(\
                epoch+1, epoch_iter, i+1, int(file_num/batch_size), time.time()-start_time, loss.item()))
        # Fix: step the scheduler AFTER the epoch's optimizer updates
        # (PyTorch >= 1.1 ordering; stepping at the top of the epoch skipped
        # the initial learning rate).
        scheduler.step()
        print('epoch_loss is {:.8f}, epoch_time is {:.8f}'.format(
            epoch_loss / int(file_num / batch_size), time.time() - epoch_time))
        print(time.asctime(time.localtime(time.time())))
        print('=' * 50)
        if (epoch + 1) % interval == 0:
            state_dict = model.module.state_dict() if data_parallel else model.state_dict()
            torch.save(state_dict,
                       os.path.join(pths_path, 'model_epoch_{}.pth'.format(epoch + 1)))
def main():
    """Entry point: build ICDAR-2015 loaders, train EAST, and save checkpoints."""
    config = Config()
    # Start from a clean output directory.
    if os.path.exists(config.SAVE_PATH):
        shutil.rmtree(config.SAVE_PATH)
    os.makedirs(config.SAVE_PATH, exist_ok=True)
    trainF = open(os.path.join(config.SAVE_PATH, "train.csv"), 'w')
    testF = open(os.path.join(config.SAVE_PATH, "test.csv"), 'w')
    try:
        train_img_path = os.path.abspath('../ICDAR_2015/train_img')
        train_gt_path = os.path.abspath('../ICDAR_2015/train_gt')
        val_img_path = os.path.abspath('../ICDAR_2015/test_img')
        val_gt_path = os.path.abspath('../ICDAR_2015/test_gt')
        kwargs = {'num_workers': 2, 'pin_memory': True} if torch.cuda.is_available() else {}
        train_dataset = custom_dataset(train_img_path, train_gt_path)
        # Batch size scales with the number of devices used by DataParallel.
        train_loader = data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH*len(device_list), \
                                       shuffle=True, drop_last=True, **kwargs)
        val_dataset = custom_dataset(val_img_path, val_gt_path)
        val_loader = data.DataLoader(val_dataset, batch_size=config.TRAIN_BATCH*len(device_list), \
                                     shuffle=True, drop_last=True, **kwargs)
        net = EAST()
        if torch.cuda.is_available():
            net = net.cuda(device=device_list[0])
            net = torch.nn.DataParallel(net, device_ids=device_list)
        optimizer = torch.optim.Adam(net.parameters(), lr=config.BASE_LR,
                                     weight_decay=config.WEIGHT_DECAY)
        for epoch in range(config.EPOCHS):
            train(net, epoch, train_loader, optimizer, trainF, config)
            test(net, epoch, val_loader, testF, config)
            if epoch != 0 and epoch % config.SAVE_INTERVAL == 0:
                torch.save({'state_dict': net.state_dict()},
                           os.path.join(os.getcwd(), config.SAVE_PATH,
                                        "laneNet{}.pth.tar".format(epoch)))
    finally:
        # Fix: guarantee the CSV logs are flushed and closed even when
        # train()/test() raises mid-run (the original leaked both handles).
        trainF.close()
        testF.close()
    torch.save({'state_dict': net.state_dict()},
               os.path.join(os.getcwd(), config.SAVE_PATH, "finalNet.pth.tar"))
def train(train_ds_path, val_ds_path, pths_path, results_path, batch_size, lr,
          num_workers, train_iter, interval, opt_level=0, checkpoint_path=None,
          val_freq=10):
    """Distributed mixed-precision (apex/amp) training loop for EAST.

    Runs `train_iter` optimizer steps organized into fixed-size virtual epochs,
    checkpoints every `interval` epochs, validates every `val_freq` epochs, and
    cooperates with a cluster auto-resume facility when available.

    Args:
        train_ds_path / val_ds_path: dataset roots containing images/ and gt/.
        pths_path: unused here (kept for interface compatibility).
        results_path: root for TensorBoard logs and checkpoints.
        batch_size, lr, num_workers: training hyper-parameters.
        train_iter: total number of optimizer steps to run.
        interval: checkpoint period in (virtual) epochs.
        opt_level: apex AMP optimization level (0-3).
        checkpoint_path: optional checkpoint to restore (overridden by auto-resume).
        val_freq: validation period in (virtual) epochs.
    """
    torch.cuda.set_device(rank)

    tensorboard_dir = os.path.join(results_path, 'logs')
    checkpoints_dir = os.path.join(results_path, 'checkpoints')
    if rank == 0:
        os.makedirs(tensorboard_dir, exist_ok=True)
        os.makedirs(checkpoints_dir, exist_ok=True)
    barrier()

    # The auto-resume library is optional cluster infrastructure.
    try:
        logger.info('Importing AutoResume lib...')
        from userlib.auto_resume import AutoResume as auto_resume
        auto_resume.init()
        logger.info('Success!')
    except Exception:  # fix: was a bare except
        logger.info('Failed!')
        auto_resume = None

    trainset = custom_dataset(
        os.path.join(train_ds_path, 'images'),
        os.path.join(train_ds_path, 'gt'),
    )
    valset = custom_dataset(os.path.join(val_ds_path, 'images'),
                            os.path.join(val_ds_path, 'gt'),
                            is_val=True)

    logger.info(f'World Size: {world_size}, Rank: {rank}')
    if world_size > 1:
        train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(valset, shuffle=False)
    else:
        train_sampler = None
        val_sampler = None

    worker_init = LoaderWorkerProcessInit(rank, 43)
    train_loader = DataLoader(trainset, batch_size=batch_size,
                              shuffle=train_sampler is None, sampler=train_sampler,
                              num_workers=num_workers, pin_memory=True,
                              drop_last=True, worker_init_fn=worker_init)
    val_loader = DataLoader(valset, batch_size=batch_size, shuffle=False,
                            sampler=val_sampler, num_workers=num_workers,
                            pin_memory=True, drop_last=True,
                            worker_init_fn=worker_init)

    criterion = Loss()
    device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    model = EAST()
    model.to(device)
    model = apex.parallel.convert_syncbn_model(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model, optimizer = amp.initialize(model, optimizer, opt_level=f'O{opt_level}')

    start_iter = 0
    if auto_resume is not None:
        auto_resume_details = auto_resume.get_resume_details()
        if auto_resume_details is not None:
            logger.info('Detected that this is a resumption of a previous job!')
            checkpoint_path = auto_resume_details['CHECKPOINT_PATH']

    if checkpoint_path:
        logger.info(f'Loading checkpoint at path "{checkpoint_path}"...')
        checkpoint = torch.load(checkpoint_path, map_location=f'cuda:{rank}')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        amp.load_state_dict(checkpoint['amp_state'])
        start_iter = checkpoint['iter']
        logger.info('Done')

    data_parallel = False
    main_model = model  # keep a handle on the unwrapped model for checkpointing
    if torch.distributed.is_initialized():
        logger.info(f'DataParallel: Using {torch.cuda.device_count()} devices!')
        model = DDP(model)
        data_parallel = True

    for param_group in optimizer.param_groups:
        param_group.setdefault('initial_lr', lr)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[train_iter // 2],
                                         gamma=0.1,
                                         last_epoch=start_iter)

    # This allows us to change dataset size without affecting things such as
    # validation frequency.
    steps_per_epoch = 1000 // (world_size * batch_size)

    step = start_iter
    start_epoch = step // steps_per_epoch
    epoch_iter = int(math.ceil(train_iter / steps_per_epoch))

    if rank == 0:
        logger.info('Initializing Tensorboard')
        writer = SummaryWriter(tensorboard_dir, purge_step=step)

    loss_meters = MeterDict(reset_on_value=True)
    val_loss_meters = MeterDict(reset_on_value=True)
    time_meters = MeterDict(reset_on_value=True)

    logger.info('Training')
    model.train()
    train_start_time = time.time()
    best_loss = 100

    # BUG FIX: the original stored this iterator in a variable named
    # `train_iter`, shadowing the integer `train_iter` parameter; both
    # `step == train_iter` stop checks below then compared an int against a
    # list and never fired, so training never stopped at the requested step.
    batch_iter = [iter(train_loader)]

    def get_batch():
        # Endlessly cycle the training loader.
        try:
            return next(batch_iter[0])
        except StopIteration:  # fix: was a bare except
            batch_iter[0] = iter(train_loader)
            return get_batch()

    for epoch in range(start_epoch, epoch_iter):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)

        epoch_loss = 0
        epoch_time = time.time()
        start_time = time.time()
        model.train()

        for i in range(steps_per_epoch):
            batch = get_batch()
            optimizer.zero_grad()
            batch = [b.cuda(rank, non_blocking=True) for b in batch]
            img, gt_score, gt_geo, ignored_map = batch
            barrier()
            time_meters['batch_time'].add_sample(time.time() - start_time)
            pred_score, pred_geo = model(img)
            loss, details = criterion(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
            epoch_loss += loss.detach().item()
            with amp.scale_loss(loss, optimizer) as loss_scaled:
                loss_scaled.backward()
            optimizer.step()
            barrier()
            time_meters['step_time'].add_sample(time.time() - start_time)
            details['global'] = loss.detach().item()
            for k, v in details.items():
                loss_meters[k].add_sample(v)
            if i % 10 == 0:
                logger.info(f'\tStep [{i+1}/{steps_per_epoch}]')
            start_time = time.time()
            step += 1
            scheduler.step()
            if step == train_iter:
                break

        term_requested = auto_resume is not None and auto_resume.termination_requested()
        checkpoint_path = None

        if rank == 0:
            times = {k: m.value() for k, m in time_meters.items()}
            losses = {k: m.value() for k, m in loss_meters.items()}
            times['epoch'] = time.time() - epoch_time
            logger.info(
                f'Epoch is [{epoch+1}/{epoch_iter}], time consumption is {times}, batch_loss is {losses}'
            )
            for k, v in times.items():
                writer.add_scalar(f'performance/{k}', v, step)
            for k, v in losses.items():
                writer.add_scalar(f'loss/{k}', v, step)
            writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], step)

            if term_requested or (epoch + 1) % interval == 0:
                state_dict = main_model.state_dict()
                optim_state = optimizer.state_dict()
                checkpoint_path = os.path.join(checkpoints_dir,
                                               'model_epoch_{}.pth'.format(epoch + 1))
                logger.info(f'Saving checkpoint to "{checkpoint_path}"...')
                torch.save(
                    {
                        'model': state_dict,
                        'optimizer': optim_state,
                        'amp_state': amp.state_dict(),
                        'epoch': epoch + 1,
                        'iter': step
                    }, checkpoint_path)
                logger.info(f'Done')

        if (epoch + 1) % val_freq == 0 or step == train_iter:
            logger.info(f'Validating epoch {epoch+1}...')
            model.eval()
            val_loader.dataset.reset_random()
            with torch.no_grad():
                for i, batch in enumerate(val_loader):
                    batch = [b.cuda(rank, non_blocking=True) for b in batch]
                    img, gt_score, gt_geo, ignored_map = batch
                    barrier()
                    pred_score, pred_geo = model(img)
                    loss, details = criterion(gt_score, pred_score, gt_geo,
                                              pred_geo, ignored_map)
                    details['global'] = loss.detach().item()
                    barrier()
                    for k, v in details.items():
                        val_loss_meters[k].add_sample(v)

            print_dict = dict()
            for k, m in val_loss_meters.items():
                t = torch.tensor(m.value(), device=f'cuda:{rank}', dtype=torch.float32)
                if world_size > 1:
                    # Average the metric across ranks onto rank 0.
                    torch.distributed.reduce(t, 0)
                    t /= world_size
                if rank == 0:
                    writer.add_scalar(f'val/loss/{k}', t.item(), step)
                # Populated on every rank so the lookup below cannot KeyError.
                print_dict[k] = t.item()
            logger.info(f'\tLoss: {print_dict}')

            val_loss = print_dict['global']
            if rank == 0 and val_loss < best_loss:
                logger.info(
                    f'This is the best model so far. New loss: {val_loss}, previous: {best_loss}'
                )
                best_loss = val_loss
                shutil.copyfile(checkpoint_path,
                                os.path.join(checkpoints_dir, 'best.pth'))
            logger.info('Training')

        if term_requested:
            logger.warning('Termination requested! Exiting...')
            if rank == 0:
                # BUG FIX: the original passed an undefined name `save_path`
                # here (NameError on the termination path).
                auto_resume.request_resume(user_dict={
                    'CHECKPOINT_PATH': checkpoint_path,
                    'EPOCH': epoch
                })
            break

    logger.info(
        f'Finished training!!! Took {time.time()-train_start_time:0.3f} seconds!'
    )
def train(train_img_path, train_gt_path, pths_path, batch_size, lr, num_workers,
          epoch_iter, interval):
    """Fine-tune EAST from a saved checkpoint with 3-batch gradient accumulation.

    Also plots the running batch loss to ./labeled.jpg every 100 mini-batches
    and checkpoints every `interval` epochs.
    """
    file_num = len(os.listdir(train_img_path))
    trainset = custom_dataset(train_img_path, train_gt_path)
    train_loader = data.DataLoader(trainset, batch_size=batch_size,
                                   shuffle=True, num_workers=num_workers, drop_last=True)
    criterion = Loss()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = EAST(pretrained=False)
    # Warm start from the epoch-14 checkpoint of a previous run.
    model.load_state_dict(
        torch.load(
            '/home/chen-ubuntu/Desktop/checks_dataset/pths/model_epoch_mode3_14.pth'
        ))
    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    optimizer.zero_grad()
    #scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[epoch_iter//2], gamma=0.1)

    for epoch in range(epoch_iter):
        model.train()
        epoch_loss = 0
        epoch_time = time.time()
        loss_plot = []
        bx = []
        for i, (img, gt_score, gt_geo, ignored_map) in enumerate(train_loader):
            start_time = time.time()
            img, gt_score, gt_geo, ignored_map = img.to(device), gt_score.to(
                device), gt_geo.to(device), ignored_map.to(device)
            pred_score, pred_geo = model(img)
            loss = criterion(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
            epoch_loss += loss.item()
            loss.backward()
            # BUG FIX: accumulate gradients over 3 mini-batches.  The original
            # condition `if (i + 1) % 3:` stepped on batches 1 and 2 of each
            # triple and SKIPPED every 3rd, leaking that batch's gradient into
            # the following update.
            if (i + 1) % 3 == 0:
                optimizer.step()
                optimizer.zero_grad()
            if (i + 1) % 100 == 0:
                print(
                    'Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'
                    .format(epoch + 1, epoch_iter, i + 1,
                            int(file_num / batch_size),
                            time.time() - start_time, loss.item()))
            if (i + 1) % 100 == 0:
                loss_plot.append(loss.item())
                bx.append(i + epoch * int(file_num / batch_size))
            plt.plot(bx, loss_plot, label='loss_mean', linewidth=1, color='b',
                     marker='o', markerfacecolor='green', markersize=2)
            plt.savefig(os.path.abspath('./labeled.jpg'))

        print('epoch_loss is {:.8f}, epoch_time is {:.8f}'.format(
            epoch_loss / int(file_num / batch_size), time.time() - epoch_time))
        print(time.asctime(time.localtime(time.time())))
        print('=' * 50)
        if (epoch + 1) % interval == 0:
            state_dict = model.module.state_dict() if data_parallel else model.state_dict()
            # Epoch numbering continues from the loaded epoch-14 checkpoint.
            torch.save(state_dict,
                       os.path.join(pths_path,
                                    'model3_epoch_{}.pth'.format(epoch + 1 + 14)))
# NOTE(review): the dict fragment below is the tail of a function whose
# definition lies outside this chunk -- left untouched.
'loss': losses.avg, 'pred': pred_meter.avg }, queue

if __name__ == '__main__':
    args = parse_option()
    # Dataset statistics for CIFAR; the commented values below are the MNIST
    # equivalents kept for reference.
    image_size, mean, std = dataset_info(name='cifar')
    # image_size = 28
    # mean = [0.1307, ]
    # std = [0.3081, ]
    # normalize = transforms.Normalize(mean=mean, std=std)
    train_transform = get_transform(image_size, mean=mean, std=std, mode='train')
    # datasets.mnist.MNIST
    train_dataset = custom_dataset(datasets.cifar.CIFAR10)(root='./',
                                                           train=True,
                                                           transform=train_transform,
                                                           download=True)
    print(len(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=0,
                                  pin_memory=False,
                                  drop_last=True)  # drop the last batch due to irregular size
    # Query / key encoder pair (MoCo-style momentum contrast setup).
    model_q, model_k = get_model(config.MODEL)
    optimizer = torch.optim.SGD(model_q.parameters(),
                                lr=0.02,
                                momentum=0.9,
                                nesterov=True,
                                weight_decay=1e-5)
    per = config.ALL_EPOCHS // 6
    # LR decays 10x at 2/6, 4/6 and 5/6 of the schedule.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[per * 2, per * 4, per * 5],
                                                     gamma=0.1)
    # copy parameters from model_q to model_k
    momentum_update(model_q, model_k, 0)
    criterion = torch.nn.CrossEntropyLoss()
def train(source_img_path, source_gt_path, target_img_path, target_gt_path,
          valid_img_path, valid_gt_path, pths_path, batch_size, lr, num_workers,
          epoch_iter, interval, pretrain_model_path=None, scheduler_path=None,
          current_epoch_num=0):
    """Domain-adaptation training for EAST with an adversarial domain classifier.

    Each step combines the detection loss on source batches with domain
    classification losses on both source (label 0) and target (label 1)
    batches.  Validates every epoch, tracks the best model, and checkpoints
    model + scheduler every `interval` epochs.
    """
    if not os.path.exists(pths_path):
        os.mkdir(pths_path)

    # source_train_set = IC13_dataset(source_img_path, source_gt_path)
    source_train_set = custom_dataset(source_img_path, source_gt_path)
    target_train_set = custom_dataset(target_img_path, target_gt_path)
    valid_train_set = valid_dataset(valid_img_path, valid_gt_path)
    source_train_loader = data.DataLoader(source_train_set, batch_size=batch_size,
                                          shuffle=True, num_workers=num_workers,
                                          drop_last=True)
    target_train_loader = data.DataLoader(target_train_set, batch_size=batch_size,
                                          shuffle=True, num_workers=num_workers,
                                          drop_last=True)
    valid_loader = data.DataLoader(valid_train_set, batch_size=batch_size,
                                   shuffle=False, num_workers=num_workers,
                                   drop_last=False)

    criterion = Loss().to(device)
    loss_domain = torch.nn.CrossEntropyLoss()
    model = EAST()
    if pretrain_model_path is not None:
        model.load_state_dict(torch.load(pretrain_model_path))
    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = lr_scheduler.MultiStepLR(
        optimizer, milestones=[epoch_iter // 3, epoch_iter * 2 // 3], gamma=0.1)
    if scheduler_path is not None:
        scheduler.load_state_dict(torch.load(scheduler_path))

    best_loss = 1000
    best_model_wts = copy.deepcopy(model.state_dict())
    best_num = 0
    train_loss = []
    valid_loss = []

    for epoch in range(current_epoch_num, epoch_iter):
        model.train()
        target_train_iter = iter(target_train_loader)
        epoch_loss = 0
        epoch_time = time.time()
        for i, (s_img, s_gt_score, s_gt_geo, s_ignored_map) in enumerate(source_train_loader):
            start_time = time.time()
            # The target set may be smaller than the source set: cycle it.
            try:
                t_img, t_gt_score, t_gt_geo, t_ignored_map = next(target_train_iter)
            except StopIteration:
                # BUG FIX: restart the *target* loader here.  The original
                # rebuilt this iterator from `source_train_loader`, silently
                # feeding source images as "target" domain samples.
                target_train_iter = iter(target_train_loader)
                t_img, t_gt_score, t_gt_geo, t_ignored_map = next(target_train_iter)

            s_img, s_gt_score, s_gt_geo, s_ignored_map = s_img.to(
                device), s_gt_score.to(device), s_gt_geo.to(
                    device), s_ignored_map.to(device)
            # BUG FIX: the target images were never moved to `device`.
            t_img = t_img.to(device)

            pred_score, pred_geo, pred_cls = model(s_img, False)
            # source domain label = 0
            domain_s = Variable(torch.zeros(pred_cls.size(0)).long().cuda())
            loss_domain_s = loss_domain(pred_cls, domain_s)

            target_cls = model(t_img, True)
            # target domain label = 1
            domain_t = Variable(torch.ones(pred_cls.size(0)).long().cuda())
            loss_domain_t = loss_domain(target_cls, domain_t)

            loss = criterion(s_gt_score, pred_score, s_gt_geo, pred_geo,
                             s_ignored_map) + loss_domain_s + loss_domain_t
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            # print('Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format( \
            #     epoch + 1, epoch_iter, i + 1, int(len(source_train_loader)), time.time() - start_time, loss.item()))

        # BUG FIX: advance the LR schedule once per epoch -- the scheduler was
        # created and checkpointed but never stepped, so the milestones never
        # took effect.
        scheduler.step()

        epoch_loss_mean = epoch_loss / len(source_train_loader)
        train_loss.append(epoch_loss_mean)
        print('Epoch[{}], Train, epoch_loss is {:.8f}, epoch_time is {:.8f}'.
              format(epoch, epoch_loss_mean, time.time() - epoch_time))

        val_epoch_loss = eval(model, valid_loader, criterion, epoch)
        val_loss_mean = val_epoch_loss / len(valid_loader)
        valid_loss.append(val_loss_mean)
        print(time.asctime(time.localtime(time.time())))
        print('=' * 50)

        if val_loss_mean < best_loss:
            best_num = epoch + 1
            best_loss = val_loss_mean
            best_model_wts = copy.deepcopy(model.state_dict())
            # save best model
            print('best model num:{}, best loss is {:.8f}'.format(best_num, best_loss))
            torch.save(best_model_wts,
                       os.path.join(pths_path, 'model_epoch_best.pth'))

        if (epoch + 1) % interval == 0:
            savePath = pths_path + 'lossImg' + str(epoch + 1) + '.jpg'
            drawLoss(train_loss, valid_loss, savePath)
            print(time.asctime(time.localtime(time.time())))
            state_dict = model.module.state_dict() if data_parallel else model.state_dict()
            lr_state = scheduler.state_dict()
            torch.save(state_dict,
                       os.path.join(pths_path, 'model_epoch_{}.pth'.format(epoch + 1)))
            torch.save(lr_state,
                       os.path.join(pths_path, 'scheduler_epoch_{}.pth'.format(epoch + 1)))
            print("save model")
            print('=' * 50)
def test():
    """Evaluate the RPN + RCNN detection pipeline on the test split and print mAP."""
    cuda = True
    test_dataset = custom_dataset(split='test')
    # batch_size=1: the per-image unbatching below relies on this.
    test_data_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    #resnet = Resnet101().eval()
    resnet = resnet101()
    rpn = RPN()
    rcnn = RCNN()
    if cuda:
        resnet = resnet.cuda()
        rpn = rpn.cuda()
        rcnn = rcnn.cuda()
    # Restore the backbone + RPN from one checkpoint, the RCNN head from another.
    rpn_check_point = torch.load(
        '/home/licheng/home/licheng/projects/cnet/data/cnet.model.state.19.pkl'
    )
    rpn.load_state_dict(rpn_check_point['rpn'])
    resnet.load_state_dict(rpn_check_point['resnet'])
    rcnn_check_point = torch.load(
        "/home/licheng/home/licheng/projects/cnet/data/rcnn/rcnn_epoch_19.params"
    )
    rcnn.load_state_dict(rcnn_check_point['rcnn'])
    """
    rpn_check_point = torch.load('/home/licheng/home/licheng/projects/cnet/data/rpn/rpn_epoch_19.params')
    #resnet.load_state_dict(check_point['resnet'])
    rpn.load_state_dict(rpn_check_point['rpn'])
    #resnet.load_state_dict(check_point['resnet'])
    rcnn_check_point = torch.load('/home/licheng/home/licheng/projects/cnet/data/rcnn/rcnn_epoch_16.params')
    rcnn.load_state_dict(rcnn_check_point['rcnn'])
    """
    # Accumulators for the mAP computation over the whole split.
    pred_bboxes = list()
    pred_labels = list()
    pred_scores = list()
    gt_boxes = list()
    gt_labels = list()
    rcnn_target_creator = RCNNTargetCreator()
    with torch.no_grad():
        for img_batch, bndboxes_batch, labels_batch in test_data_loader:
            # Peel the singleton batch dimension off boxes/labels (batch_size == 1).
            img, bndboxes, labels = img_batch, bndboxes_batch[0], labels_batch[0]
            if cuda:
                img, bndboxes, labels = img.cuda(), bndboxes.cuda(), labels.cuda()
            feature = resnet(img.float())
            #if cuda:
            #    feature = feature.cuda()
            rois, anchors, rpn_loc, rpn_score = rpn(feature, feature_stride=16)
            sample_roi, gt_roi_label, gt_roi_loc = rcnn_target_creator(
                rois, bndboxes.cpu().numpy(), labels)
            rois = at.toTensor(rois)
            roi_cls_loc, roi_score = rcnn(rois, feature)
            # NOTE(review): `look_score1` appears to be a leftover debugging
            # variable -- assigned twice, never read.
            look_score1 = np.array(roi_score.cpu().detach())
            pred_score = F.softmax(roi_score, dim=1)
            look_score1 = np.array(pred_score.cpu().detach())
            pred_score = pred_score.cpu().detach().numpy()
            # Undo the per-class loc normalization (mean 0, std 0.1/0.2).
            mean = torch.Tensor(
                (0., 0., 0., 0.)).repeat(cfg.n_class)[None].cuda()
            std = torch.Tensor(
                (0.1, 0.1, 0.2, 0.2)).repeat(cfg.n_class)[None].cuda()
            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = at.toTensor(roi_cls_loc)
            roi_cls_loc = roi_cls_loc.view(-1, cfg.n_class, 4)
            rois = rois.view(-1, 1, 4).expand_as(roi_cls_loc)  # expand dim as loc
            #rois = rois.reshape(-1, 1, 4)[:, [int(x) for x in np.zeros(cfg.n_class).tolist()], :]
            #roi_cls_loc = at.toTensor(roi_cls_loc)
            #roi_cls_loc = roi_cls_loc.view(roi_cls_loc.shape[0], -1, 4)
            #pred_box = loc2bbox(at.toNumpy(rois).reshape(-1, 4), roi_cls_loc.view(-1, 4).cpu().detach().numpy())
            # Decode per-class loc offsets relative to each ROI into boxes.
            pred_box = loc2bbox(
                at.toNumpy(rois).reshape(-1, 4),
                roi_cls_loc.view(-1, 4).cpu().detach().numpy())
            # clip box to the image bounds
            pred_box[:, 0::2] = np.clip(pred_box[:, 0::2], 0, img.shape[3])
            pred_box[:, 1::2] = np.clip(pred_box[:, 1::2], 0, img.shape[2])
            gt_box = list(bndboxes_batch.cpu().numpy())
            gt_label = list(labels_batch.cpu().numpy())
            bbox = list()
            label = list()
            score = list()
            # Per-class score thresholding + NMS; class 0 (background) skipped.
            for class_index in range(1, cfg.n_class):
                each_bbox = pred_box.reshape(
                    (-1, cfg.n_class, 4))[:, class_index, :]
                each_score = pred_score[:, class_index]
                mask = each_score > cfg.pred_score_thresh
                each_bbox = each_bbox[mask]
                each_score = each_score[mask]
                keep = nms(each_bbox, each_score, cfg.pred_nms_thresh)
                bbox.append(each_bbox[keep])
                score.append(each_score[keep])
                label.append(class_index * np.ones((len(keep), )))
            bbox = np.concatenate(bbox, axis=0).astype(np.float32)
            score = np.concatenate(score, axis=0).astype(np.float32)
            label = np.concatenate(label, axis=0).astype(np.int32)
            print('gt_info:', gt_box, gt_label)
            print('sample roi', sample_roi[0])
            print('predict info:', bbox, score, label)
            pred_bboxes += [bbox]
            pred_scores += [score]
            pred_labels += [label]
            gt_boxes += gt_box
            gt_labels += gt_label
    result = calc_map(pred_bboxes, pred_labels, pred_scores, gt_boxes, gt_labels)
    print(result)
'''
    def __init__(self, val_img_path, val_gt_path, val_num):
        # NOTE(review): the enclosing class header ("evaluater") and the
        # opening of the ''' string above are outside this chunk, so the
        # methods here may be inside a commented-out string block.
        """Collect the first `val_num` validation image and gt file paths."""
        super(evaluater).__init__()
        self.val_img_list = [
            os.path.join(val_img_path, img_file)
            for img_file in sorted(os.listdir(val_img_path))
        ][:val_num]
        self.val_gt_list = [
            os.path.join(val_gt_path, gt_file)
            for gt_file in sorted(os.listdir(val_gt_path))
        ][:val_num]

    def evaluate(self, model):
        # Placeholder: evaluation over the collected file lists is not implemented.
        for idx in range(len(self.val_img_list)):
            pass


if __name__ == "__main__":
    # Smoke test: push one validation batch through a trained EAST checkpoint.
    trainset = custom_dataset('data/val/img', 'data/val/gt')
    train_loader = data.DataLoader(trainset,
                                   batch_size=4,
                                   num_workers=8,
                                   drop_last=True)
    img, gt_score, gt_geo, ignored_map, _ = next(iter(train_loader))
    model_path = 'task1/pths/model_epoch_100.pth'
    model = EAST(pretrained=False)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    output = model(img)
    print(1)
def train(train_img_path, train_gt_path, pths_path, batch_size, lr, num_workers,
          epoch_iter, interval):
    """Train EAST with a tqdm progress bar and per-epoch CSV loss logging."""
    # Data pipeline
    #import pdb
    #pdb.set_trace()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    file_num = len(os.listdir(train_img_path))
    trainset = custom_dataset(train_img_path, train_gt_path)
    train_loader = data.DataLoader(trainset, batch_size=batch_size, \
                                   shuffle=True, num_workers=num_workers, drop_last=True)
    # Model
    model = EAST()
    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)
    # Loss
    criterion = Loss()
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # LR schedule: decay 10x at the halfway point of training.
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[epoch_iter // 2],
                                         gamma=0.1)
    for epoch in range(epoch_iter):
        model.train()
        epoch_loss = 0
        epoch_time = time.time()
        # import pdb
        # pdb.set_trace()
        train_process = tqdm(train_loader)
        for i, (img, gt_score, gt_geo, ignored_map) in enumerate(train_process):
            start_time = time.time()
            #import pdb
            # pdb.set_trace()
            # print("start_time=%s"%(start_time))
            img, gt_score, gt_geo, ignored_map = img.to(device), gt_score.to(
                device), gt_geo.to(device), ignored_map.to(device)
            # Forward pass
            pred_score, pred_geo = model(img)
            # Compute the EAST loss
            loss = criterion(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
            epoch_loss += loss.item()
            # Backprop -- gradients must be cleared first
            optimizer.zero_grad()
            loss.backward()
            # Weight update
            optimizer.step()
            train_process.set_description_str("epoch:{}".format(epoch + 1))
            train_process.set_postfix_str("batch_loss:{:.4f}".format(
                loss.item()))
            '''
            print('Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format(\
                epoch+1, epoch_iter, i+1, int(file_num/batch_size), time.time()-start_time, loss.item()))
            '''
        # Advance the LR schedule after the epoch's optimizer updates.
        scheduler.step()
        with open('train.csv', 'a') as f:
            f.write('epoch[{}]: epoch_loss is {:.8f}, epoch_time is {:.8f}\n'.
                    format(epoch + 1, epoch_loss / int(file_num / batch_size),
                           time.time() - epoch_time))
        # print('epoch_loss is {:.8f}, epoch_time is {:.8f}'.format(epoch_loss/int(file_num/batch_size), time.time()-epoch_time))
        # print(time.asctime(time.localtime(time.time())))
        # print('='*50)
        if (epoch + 1) % interval == 0:
            state_dict = model.module.state_dict(
            ) if data_parallel else model.state_dict()
            torch.save(
                state_dict,
                os.path.join(pths_path, 'model_epoch_{}.pth'.format(epoch + 1)))
def train(train_img_path, train_gt_path, pths_path, batch_size, lr, num_workers,
          epoch_iter, interval):
    """Resumable EAST training with a held-out ICDAR-2015 test pass each epoch.

    Resumes from ./pths/east.pth when present, otherwise initializes from
    ./pths/east_vgg16.pth.  Tracks per-epoch train/test losses in dicts that
    are persisted inside every checkpoint, and keeps the lowest-test-loss
    weights in east_best_loss.pth.
    """
    file_num = len(os.listdir(train_img_path))
    trainset = custom_dataset(train_img_path, train_gt_path)
    train_loader = data.DataLoader(trainset, batch_size=batch_size,
                                   shuffle=True, num_workers=num_workers, drop_last=True)

    test_img_path = os.path.abspath('../ICDAR_2015/test_img')
    test_gt_path = os.path.abspath('../ICDAR_2015/test_gt')
    file_num2 = len(os.listdir(test_img_path))
    testset = custom_dataset(test_img_path, test_gt_path)
    test_loader = data.DataLoader(testset, batch_size=batch_size,
                                  shuffle=True, num_workers=num_workers, drop_last=True)

    criterion = Loss()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = EAST()
    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Resume if a previous run's checkpoint exists; otherwise warm-start.
    try:
        print("(Continue) Loading east...")
        checkpoint = torch.load('./pths/east.pth')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch_dict = checkpoint['epoch_loss']
        test_dict = checkpoint['test_loss']
        total_epoch = checkpoint['epoch']
        best_loss = checkpoint['best_loss']
        best_acc = checkpoint['best_acc']
    except FileNotFoundError:
        print("(Initialize) Loading east_vgg16...")
        model.load_state_dict(torch.load('./pths/east_vgg16.pth'))
        epoch_dict = dict()
        test_dict = dict()
        total_epoch = 0
        best_loss = float('inf')
        best_acc = 0

    print("Continue from epoch {}".format(total_epoch))
    print("Epoch_dict", epoch_dict)
    print("Test_dict", test_dict)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[300], gamma=0.1)

    for epoch in range(epoch_iter):
        model.train()
        epoch_loss = 0
        test_loss = 0
        epoch_time = time.time()
        for i, (img, gt_score, gt_geo, ignored_map) in enumerate(train_loader):
            start_time = time.time()
            img, gt_score, gt_geo, ignored_map = img.to(device), gt_score.to(
                device), gt_geo.to(device), ignored_map.to(device)
            pred_score, pred_geo = model(img)
            loss = criterion(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
            epoch_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format(\
                epoch+1, epoch_iter, i+1, int(file_num/batch_size), time.time()-start_time, loss.item()))
        # Fix: step the scheduler after the optimizer updates (PyTorch >= 1.1
        # ordering; stepping at the epoch top skipped the initial LR).
        scheduler.step()

        epoch_dict[total_epoch + epoch + 1] = (epoch_loss / int(file_num / batch_size),
                                               epoch_loss)
        print('epoch_loss is {:.8f}, epoch_time is {:.8f}, epoch_loss: {}'.
              format(epoch_loss / int(file_num / batch_size),
                     time.time() - epoch_time, epoch_loss))
        model_state_dict = model.module.state_dict() if data_parallel else model.state_dict()

        # Fix: evaluate with BN/dropout frozen -- the original left the model
        # in train mode during the test pass.
        model.eval()
        with torch.no_grad():
            for i, (img, gt_score, gt_geo, ignored_map) in enumerate(test_loader):
                start_time = time.time()  # fix: was the last train batch's timestamp
                img, gt_score, gt_geo, ignored_map = img.to(
                    device), gt_score.to(device), gt_geo.to(
                        device), ignored_map.to(device)
                pred_score, pred_geo = model(img)
                loss = criterion(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
                test_loss += loss.item()
                print('Epoch (test) is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format(\
                    epoch+1, epoch_iter, i+1, int(file_num2/batch_size), time.time()-start_time, loss.item()))

        mean_test_loss = test_loss / int(file_num2 / batch_size)
        test_dict[total_epoch + epoch + 1] = (mean_test_loss, test_loss)
        print('test_loss is {:.8f}, epoch_time is {:.8f}, test_loss: {}'.format(
            mean_test_loss, time.time() - epoch_time, test_loss))
        print(time.asctime(time.localtime(time.time())))
        print('=' * 50)

        def _checkpoint():
            # One snapshot of everything needed to resume this run.
            return {
                'epoch': total_epoch + epoch + 1,
                'model_state_dict': model_state_dict,
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch_loss': epoch_dict,
                'test_loss': test_dict,
                'best_loss': best_loss,
                'best_acc': best_acc
            }

        if (epoch + 1) % interval == 0:
            torch.save(_checkpoint(), os.path.join(pths_path, 'east.pth'))
        if (total_epoch + epoch + 1) % 10 == 0:
            torch.save(_checkpoint(),
                       os.path.join(pths_path,
                                    'east_epoch_{}.pth'.format(total_epoch + epoch + 1)))
        if mean_test_loss < best_loss:
            # Fix: record the new best loss.  The original never updated
            # `best_loss`, so every epoch below the initial value re-saved
            # "best" and the value persisted in checkpoints stayed stale.
            best_loss = mean_test_loss
            torch.save(_checkpoint(),
                       os.path.join(pths_path, 'east_best_loss.pth'))
def train(train_img_path, train_gt_path, pths_path, batch_size, lr, num_workers,
          epoch_iter, interval):
    """Train the EAST text detector on the given image/ground-truth folders.

    Prints per-batch progress, logs the mean loss per epoch, and writes a
    weights snapshot into `pths_path` every `interval` epochs.
    """
    # --- data pipeline ---
    num_samples = len(os.listdir(train_img_path))
    dataset = custom_dataset(train_img_path, train_gt_path)
    loader = data.DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             num_workers=num_workers,
                             drop_last=True)

    # --- model (wrapped in DataParallel when several GPUs are present) ---
    model = EAST()
    use_parallel = torch.cuda.device_count() > 1
    if use_parallel:
        model = nn.DataParallel(model)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # --- loss / optimisation ---
    loss_fn = Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Drop the learning rate tenfold at the halfway milestone.
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[epoch_iter // 2],
                                         gamma=.1)

    batches_per_epoch = int(num_samples / batch_size)
    for epoch in range(epoch_iter):
        model.train()
        # Scheduler fires when the epoch index reaches epoch_iter // 2.
        scheduler.step()
        running_loss = 0
        epoch_start = time.time()
        for batch_idx, batch in enumerate(loader):
            tic = time.time()
            print("start_time=%s" % tic)
            img, gt_score, gt_geo, ignored_map = (t.to(device) for t in batch)
            # Forward pass
            pred_score, pred_geo = model(img)
            # EAST loss
            batch_loss = loss_fn(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
            running_loss += batch_loss.item()
            # Backward pass -- clear stale gradients first.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            print(
                'Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'
                .format(epoch + 1, epoch_iter, batch_idx + 1, batches_per_epoch,
                        time.time() - tic, batch_loss.item()))
        print('epoch_loss is {:.8f}, epoch_time is {:.8f}'.format(
            running_loss / batches_per_epoch, time.time() - epoch_start))
        print(time.asctime(time.localtime(time.time())))
        print('=' * 50)
        # Persist the weights every `interval` epochs (unwrap DataParallel first).
        if (epoch + 1) % interval == 0:
            weights = model.module.state_dict() if use_parallel else model.state_dict()
            torch.save(weights,
                       os.path.join(pths_path, 'model_epoch_{}.pth'.format(epoch + 1)))
def train(train_img_path, train_gt_path, pths_path, batch_size, lr, num_workers,
          epoch_iter, interval, writer):
    """Train the SENet-backbone EAST variant, logging batch loss to TensorBoard.

    Args:
        writer: an open TensorBoard SummaryWriter; closed before returning.
    """
    file_num = len(os.listdir(train_img_path))
    trainset = custom_dataset(train_img_path, train_gt_path, args.min_len,
                              args.crop_size)
    train_loader = data.DataLoader(trainset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=num_workers,
                                   drop_last=True)
    criterion = Loss()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # if args.resume is None:  ### train from scratch
    #     model = EAST_MobileV2(args.crop_size, True)
    # else:  ### resume training from a checkpoint
    #     model = EAST_MobileV2(args.crop_size, False)
    #     print('Resuming training, loading {}...'.format(args.resume))
    #     model.load_state_dict(torch.load(args.resume))
    model = EAST_SENet()
    data_parallel = False
    #if torch.cuda.device_count() > 10:
    #model = nn.DataParallel(model, device_ids=[0, 1])
    #data_parallel = True
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Decay LR tenfold at epoch 200.
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[200], gamma=0.1)

    for epoch in range(args.start_epoch, epoch_iter):
        model.train()
        epoch_loss = 0
        epoch_time = time.time()
        for i, (img, gt_score, gt_geo, ignored_map) in enumerate(train_loader):
            start_time = time.time()
            img, gt_score, gt_geo, ignored_map = img.to(device), gt_score.to(
                device), gt_geo.to(device), ignored_map.to(device)
            pred_score, pred_geo = model(img)
            loss = criterion(gt_score, pred_score, gt_geo, pred_geo, ignored_map)
            epoch_loss += float(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            steps = epoch * len(train_loader) + i
            # Fix: log a plain float rather than the live loss tensor so the
            # writer does not hold a reference into the autograd graph.
            writer.add_scalar('loss', loss.item(), steps)
            print('Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format(\
                epoch+1, epoch_iter, i+1, int(file_num/batch_size), time.time()-start_time, loss.item()))
        # Fix: step the scheduler AFTER the epoch's optimizer updates
        # (PyTorch >= 1.1 ordering; stepping first skipped the initial LR).
        scheduler.step()

        print('epoch_loss is {:.8f}, epoch_time is {:.8f}'.format(
            epoch_loss / int(file_num / batch_size), time.time() - epoch_time))
        print(time.asctime(time.localtime(time.time())))
        print('=' * 50)
        if (epoch + 1) % interval == 0:
            state_dict = model.module.state_dict() if data_parallel else model.state_dict()
            torch.save(state_dict,
                       os.path.join(pths_path,
                                    'model_SE_epoch_{}.pth'.format(epoch + 1)))
    writer.close()