def train_infinite_collect_stats(args, model, device, source_train_loader,
                                 target_train_loader, optimizer,
                                 lambda_mec_loss, target_test_loader):
    """Domain-adaptation training loop with a Min-Entropy Consensus loss.

    Draws mini-batches endlessly from the source and target loaders for
    ``args.num_iters`` iterations, optimizing cross-entropy on labeled source
    data plus a MEC consistency loss between two views of the same target
    batch.  Afterwards runs extra forward passes to re-estimate target
    population statistics and reports test accuracy.

    Parameters
    ----------
    args : namespace providing num_iters, num_classes, log_interval,
        check_acc_step
    model : network mapping a batch of images to class scores
    device : torch.device all tensors are moved to
    source_train_loader : yields (source_data, source_labels)
    target_train_loader : yields (target_data, target_data_dup, _) — two
        augmented views of the same target images (assumed — confirm loader)
    optimizer : optimizer over the model parameters
    lambda_mec_loss : float, weight of the MEC loss term
    target_test_loader : loader used for periodic and final evaluation
    """
    source_iter = iter(source_train_loader)
    target_iter = iter(target_train_loader)
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[6000],
                                                gamma=0.1)

    # FIX: build the criterion once instead of re-instantiating it on every
    # iteration inside the loop (it is stateless across iterations).
    mec_criterion = consensus_loss.MinEntropyConsensusLoss(
        num_classes=args.num_classes, device=device)

    for i in range(args.num_iters):
        model.train()

        # FIX: catch only StopIteration when a loader is exhausted; the
        # original bare `except:` also swallowed KeyboardInterrupt and any
        # real data-loading error.
        try:
            source_data, source_y = next(source_iter)
        except StopIteration:
            source_iter = iter(source_train_loader)
            source_data, source_y = next(source_iter)
        try:
            target_data, target_data_dup, _ = next(target_iter)
        except StopIteration:
            target_iter = iter(target_train_loader)
            target_data, target_data_dup, _ = next(target_iter)

        # concat the source and target mini-batches so a single forward pass
        # (and a single set of BN statistics) covers all three chunks
        data = torch.cat((source_data, target_data, target_data_dup), dim=0)
        data, source_y = data.to(device), source_y.to(device)

        optimizer.zero_grad()
        output = model(data)
        source_output, target_output, target_output_dup = torch.split(
            output, split_size_or_sections=output.shape[0] // 3, dim=0)

        # FIX: pass dim=1 explicitly — implicit-dim log_softmax is deprecated
        # and emits a warning on modern PyTorch.
        cls_loss = F.nll_loss(F.log_softmax(source_output, dim=1), source_y)
        mec_loss = lambda_mec_loss * mec_criterion(target_output,
                                                   target_output_dup)

        loss = cls_loss + mec_loss
        loss.backward()
        optimizer.step()
        # FIX: step the LR scheduler after optimizer.step(); stepping it
        # first is the deprecated pre-1.1 ordering and skips the first LR.
        exp_lr_scheduler.step()

        if i % args.log_interval == 0:
            print(
                'Train Iter: [{}/{}]\tClassification Loss: {:.6f} \t MEC Loss: {:.6f}'
                .format(i, args.num_iters, cls_loss.item(), mec_loss.item()))
        if (i + 1) % args.check_acc_step == 0:
            test(args, model, device, target_test_loader)

    print("Training is complete...")
    print(
        "Running a bunch of forward passes to estimate the population statistics of target..."
    )
    eval_pass_collect_stats(args, model, device, target_test_loader)
    print("Finally computing the precision on the test set...")
    test(args, model, device, target_test_loader)
def train_meshnet(opt):
    """Train a PointsToSurfModel on point-cloud patches.

    Sets up run directories (asking before overwriting an existing run),
    derives per-output bookkeeping from ``opt.outputs``, builds train/test
    datasets, samplers and loaders, optionally resumes from ``opt.refine``,
    then runs the epoch loop with test batches interleaved into training and
    everything logged to TensorBoard.
    """
    device = torch.device("cpu" if opt.gpu_idx < 0 else "cuda:%d" % opt.gpu_idx)
    print('Training on {} GPUs'.format(torch.cuda.device_count()))
    print('Training on ' + (
        'cpu' if opt.gpu_idx < 0 else torch.cuda.get_device_name(opt.gpu_idx)))

    # colored console output
    green = lambda x: '\033[92m' + x + '\033[0m'
    blue = lambda x: '\033[94m' + x + '\033[0m'

    log_dirname = os.path.join(opt.logdir, opt.name)
    params_filename = os.path.join(opt.outdir, '%s_params.pth' % opt.name)
    model_filename = os.path.join(opt.outdir, '%s_model.pth' % opt.name)
    desc_filename = os.path.join(opt.outdir, '%s_description.txt' % opt.name)

    # ask before clobbering an existing run; runs named 'test' are always
    # overwritten without prompting
    if os.path.exists(log_dirname) or os.path.exists(model_filename):
        if opt.name != 'test':
            response = input(
                'A training run named "%s" already exists, overwrite? (y/n) '
                % opt.name)
            if response == 'y':
                del_log = True
            else:
                return
        else:
            del_log = True

        if del_log:
            if os.path.exists(log_dirname):
                try:
                    shutil.rmtree(log_dirname)
                except OSError:
                    # best-effort cleanup; a stale log dir is not fatal
                    print("Can't delete " + log_dirname)

    # get indices in targets and predictions corresponding to each output
    target_features = []
    output_target_ind = []
    output_pred_ind = []
    output_names = []
    output_loss_weights = dict()
    pred_dim = 0
    for o in opt.outputs:
        if o == 'imp_surf':
            if o not in target_features:
                target_features.append(o)
            output_names.append(o)
            output_target_ind.append(target_features.index(o))
            output_pred_ind.append(pred_dim)
            output_loss_weights[o] = 1.0
            pred_dim += 1
        elif o == 'imp_surf_magnitude':
            if o not in target_features:
                target_features.append(o)
            output_names.append(o)
            output_target_ind.append(target_features.index(o))
            output_pred_ind.append(pred_dim)
            # output_loss_weights[o] = 10.0
            output_loss_weights[o] = 1.0
            pred_dim += 1
        elif o == 'imp_surf_sign':
            if o not in target_features:
                target_features.append(o)
            output_names.append(o)
            output_target_ind.append(target_features.index(o))
            output_pred_ind.append(pred_dim)
            output_loss_weights[o] = 1.0
            pred_dim += 1
        elif o == 'p_index':
            # auxiliary target only; contributes no prediction channel
            if o not in target_features:
                target_features.append(o)
            output_target_ind.append(target_features.index(o))
        elif o == 'patch_pts_ids':
            # auxiliary target only; contributes no prediction channel
            if o not in target_features:
                target_features.append(o)
            output_target_ind.append(target_features.index(o))
        else:
            raise ValueError('Unknown output: %s' % o)

    if pred_dim <= 0:
        raise ValueError('Prediction is empty for the given outputs.')

    # create model; a query point is only needed for implicit-surface outputs
    use_query_point = any([
        f in opt.outputs
        for f in ['imp_surf', 'imp_surf_magnitude', 'imp_surf_sign']
    ])
    meshnet = PointsToSurfModel(
        net_size_max=opt.net_size,
        num_points=opt.points_per_patch,
        output_dim=pred_dim,
        use_point_stn=opt.use_point_stn,
        use_feat_stn=opt.use_feat_stn,
        sym_op=opt.sym_op,
        use_query_point=use_query_point,
        sub_sample_size=opt.sub_sample_size,
        do_augmentation=True,
        single_transformer=opt.single_transformer,
        shared_transformer=opt.shared_transformer,
    )

    start_epoch = 0
    if opt.refine != '':
        # resume: the checkpoint was saved from a DataParallel-wrapped model,
        # so wrap in the same order before loading the state dict
        print(f'Refining weights from {opt.refine}')
        meshnet.cuda(device=device)  # same order as in training
        meshnet = torch.nn.DataParallel(meshnet)
        meshnet.load_state_dict(torch.load(opt.refine))
        try:
            # expecting a file name like 'vanilla_model_50.pth'
            model_file = str(opt.refine)
            last_underscore_pos = model_file.rfind('_')
            last_dot_pos = model_file.rfind('.')
            start_epoch = int(
                model_file[last_underscore_pos + 1:last_dot_pos]) + 1
            print(f'Continuing training from epoch {start_epoch}')
        except:
            print(
                f'Warning: {opt.refine} has no epoch in the name. The Tensorboard log will continue at '
                f'epoch 0 and might be messed up!')

    if opt.seed < 0:
        opt.seed = random.randint(1, 10000)
    print("Random Seed: %d" % opt.seed)
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    # create train and test dataset loaders
    train_dataset = data_loader.PointcloudPatchDataset(
        root=opt.indir,
        shape_list_filename=opt.trainset,
        points_per_patch=opt.points_per_patch,
        patch_features=target_features,
        point_count_std=opt.patch_point_count_std,
        seed=opt.seed,
        identical_epochs=opt.identical_epochs,
        center=opt.patch_center,
        cache_capacity=opt.cache_capacity,
        pre_processed_patches=True,
        sub_sample_size=opt.sub_sample_size,
        num_workers=int(opt.workers),
        patch_radius=opt.patch_radius,
        epsilon=-1,  # not necessary for training
        uniform_subsample=opt.uniform_subsample,
    )
    if opt.training_order == 'random':
        train_datasampler = data_loader.RandomPointcloudPatchSampler(
            train_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    elif opt.training_order == 'random_shape_consecutive':
        train_datasampler = data_loader.SequentialShapeRandomPointcloudPatchSampler(
            train_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    else:
        raise ValueError('Unknown training order: %s' % opt.training_order)
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   sampler=train_datasampler,
                                                   batch_size=opt.batchSize,
                                                   num_workers=int(
                                                       opt.workers))

    test_dataset = data_loader.PointcloudPatchDataset(
        root=opt.indir,
        shape_list_filename=opt.testset,
        points_per_patch=opt.points_per_patch,
        patch_features=target_features,
        point_count_std=opt.patch_point_count_std,
        seed=opt.seed,
        identical_epochs=opt.identical_epochs,
        center=opt.patch_center,
        cache_capacity=opt.cache_capacity,
        pre_processed_patches=True,
        sub_sample_size=opt.sub_sample_size,
        patch_radius=opt.patch_radius,
        num_workers=int(opt.workers),
        epsilon=-1,  # not necessary for training
        uniform_subsample=opt.uniform_subsample,
    )
    if opt.training_order == 'random':
        test_datasampler = data_loader.RandomPointcloudPatchSampler(
            test_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    elif opt.training_order == 'random_shape_consecutive':
        test_datasampler = data_loader.SequentialShapeRandomPointcloudPatchSampler(
            test_dataset,
            patches_per_shape=opt.patches_per_shape,
            seed=opt.seed,
            identical_epochs=opt.identical_epochs)
    else:
        raise ValueError('Unknown training order: %s' % opt.training_order)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  sampler=test_datasampler,
                                                  batch_size=opt.batchSize,
                                                  num_workers=int(opt.workers))

    # keep the exact training shape names for later reference
    opt.train_shapes = train_dataset.shape_names
    opt.test_shapes = test_dataset.shape_names

    print(
        'training set: %d patches (in %d batches) - test set: %d patches (in %d batches)'
        % (len(train_datasampler), len(train_dataloader),
           len(test_datasampler), len(test_dataloader)))

    try:
        os.makedirs(opt.outdir)
    except OSError:
        pass  # output dir already exists

    train_fraction_done = 0.0

    log_writer = SummaryWriter(log_dirname, comment=opt.name)
    log_writer.add_scalar('LR', opt.lr, 0)

    # milestones in number of optimizer iterations
    optimizer = optim.SGD(meshnet.parameters(), lr=opt.lr,
                          momentum=opt.momentum)
    # SGD changes lr depending on training progress
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=0.1)  # constant lr
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=opt.scheduler_steps,
                                         gamma=0.1)

    # when not refining, the model has not been moved to GPU / wrapped yet
    if opt.refine == '':
        meshnet.cuda(device=device)
        meshnet = torch.nn.DataParallel(meshnet)

    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)

    # save parameters
    torch.save(opt, params_filename)

    # save description
    with open(desc_filename, 'w+') as text_file:
        print(opt.desc, file=text_file)

    for epoch in range(start_epoch, opt.nepoch, 1):

        train_enum = enumerate(train_dataloader, 0)

        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader, 0)

        for train_batchind, batch_data_train in train_enum:

            # batch data to GPU
            for key in batch_data_train.keys():
                batch_data_train[key] = batch_data_train[key].cuda(
                    non_blocking=True)

            # set to training mode
            meshnet.train()

            # zero gradients
            optimizer.zero_grad()

            pred_train = meshnet(batch_data_train)
            loss_train = compute_loss(pred=pred_train,
                                      batch_data=batch_data_train,
                                      outputs=opt.outputs,
                                      output_loss_weights=output_loss_weights,
                                      fixed_radius=opt.patch_radius > 0.0)
            # compute_loss apparently returns one loss per output head;
            # sum them for a single backward pass — TODO confirm
            loss_total = sum(loss_train)

            # back-propagate through entire network to compute gradients of loss w.r.t. parameters
            loss_total.backward()

            # parameter optimization step
            optimizer.step()

            # update and log lr
            # NOTE(review): scheduler.get_lr() and scheduler.step(epoch) are
            # deprecated on modern PyTorch (use get_last_lr() / step());
            # also the scheduler is stepped once per batch with the epoch
            # index, which repeats the same epoch value — verify intent.
            lr_before_update = scheduler.get_lr()
            if isinstance(lr_before_update, list):
                lr_before_update = lr_before_update[0]
            scheduler.step(epoch)
            lr_after_update = scheduler.get_lr()
            if isinstance(lr_after_update, list):
                lr_after_update = lr_after_update[0]
            if lr_before_update != lr_after_update:
                print('LR changed from {} to {} in epoch {}'.format(
                    lr_before_update, lr_after_update, epoch))
                current_step = (
                    epoch + train_fraction_done) * train_num_batch * opt.batchSize
                log_writer.add_scalar('LR', lr_after_update, current_step)

            train_fraction_done = (train_batchind + 1) / train_num_batch

            # `debug` is a free name (presumably a module-level flag) — it is
            # not defined inside this function; confirm it exists at import.
            if debug:
                from source import evaluation
                evaluation.visualize_patch(
                    patch_pts_ps=batch_data_train['patch_pts_ps'][0].cpu(),
                    query_point_ps=batch_data_train['imp_surf_query_point_ps']
                    [0].cpu(),
                    pts_sub_sample_ms=batch_data_train['pts_sub_sample_ms']
                    [0].cpu(),
                    query_point_ms=batch_data_train['imp_surf_query_point_ms']
                    [0].cpu(),
                    file_path='debug/patch_train.off')

            metrics_dict = calc_metrics(outputs=opt.outputs,
                                        pred=pred_train,
                                        gt_data=batch_data_train)

            do_logging(writer=log_writer,
                       log_prefix=green('train'),
                       epoch=epoch,
                       opt=opt,
                       loss=loss_train,
                       batchind=train_batchind,
                       fraction_done=train_fraction_done,
                       num_batch=train_num_batch,
                       train=True,
                       output_names=output_names,
                       metrics_dict=metrics_dict)

            # interleave test batches so testing keeps pace with training
            while test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:

                # set to evaluation mode, no auto-diff
                meshnet.eval()

                test_batchind, batch_data_test = next(test_enum)

                # batch data to GPU
                for key in batch_data_test.keys():
                    batch_data_test[key] = batch_data_test[key].cuda(
                        non_blocking=True)

                # forward pass
                with torch.no_grad():
                    pred_test = meshnet(batch_data_test)

                loss_test = compute_loss(
                    pred=pred_test,
                    batch_data=batch_data_test,
                    outputs=opt.outputs,
                    output_loss_weights=output_loss_weights,
                    fixed_radius=opt.patch_radius > 0.0)

                metrics_dict = calc_metrics(outputs=opt.outputs,
                                            pred=pred_test,
                                            gt_data=batch_data_test)

                test_fraction_done = (test_batchind + 1) / test_num_batch
                # NOTE(review): num_batch here is train_num_batch, not
                # test_num_batch — looks suspicious; confirm it is intended.
                do_logging(writer=log_writer,
                           log_prefix=blue('test'),
                           epoch=epoch,
                           opt=opt,
                           loss=loss_test,
                           batchind=test_batchind,
                           fraction_done=test_fraction_done,
                           num_batch=train_num_batch,
                           train=False,
                           output_names=output_names,
                           metrics_dict=metrics_dict)

        # end of epoch save model, overwriting the old model
        if epoch % opt.saveinterval == 0 or epoch == opt.nepoch - 1:
            torch.save(meshnet.state_dict(), model_filename)

        # save model in a separate file in epochs 0,5,10,50,100,500,1000, ...
        if epoch % (5 * 10**math.floor(math.log10(max(2, epoch - 1)))
                    ) == 0 or epoch % 100 == 0 or epoch == opt.nepoch - 1:
            torch.save(
                meshnet.state_dict(),
                os.path.join(opt.outdir,
                             '%s_model_%d.pth' % (opt.name, epoch)))

    log_writer.flush()
    log_writer.close()
def main():
    """Train or evaluate a person re-identification model (cross-entropy).

    Driven entirely by the module-level ``args`` namespace: builds dataset,
    transforms and loaders, the model, criterion, optimizer and scheduler,
    optionally loads/resumes weights, then either evaluates once or runs the
    training loop with periodic rank-1 evaluation and checkpointing.
    """
    torch.manual_seed(args.seed)
    if not args.use_avai_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    # tee stdout into a per-mode log file
    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_imgreid_dataset(
        root=args.root,
        name=args.dataset,
        split_id=args.split_id,
        cuhk03_labeled=args.cuhk03_labeled,
        cuhk03_classic_split=args.cuhk03_classic_split,
    )

    # train-time augmentation; ImageNet normalization statistics
    transform_train = T.Compose([
        T.Random2DTranslation(args.height, args.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    transform_test = T.Compose([
        T.Resize((args.height, args.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    pin_memory = True if use_gpu else False

    # drop_last=True keeps every training batch full-sized
    trainloader = DataLoader(
        ImageDataset(dataset.train, transform=transform_train),
        batch_size=args.train_batch,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )
    queryloader = DataLoader(
        ImageDataset(dataset.query, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )
    galleryloader = DataLoader(
        ImageDataset(dataset.gallery, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids,
                              loss={'xent'},
                              use_gpu=use_gpu)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    if args.label_smooth:
        criterion = CrossEntropyLabelSmooth(num_classes=dataset.num_train_pids,
                                            use_gpu=use_gpu)
    else:
        criterion = nn.CrossEntropyLoss()
    optimizer = init_optim(args.optim, model.parameters(), args.lr,
                           args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)

    # warm-up: optimizer restricted to the classifier for the first epochs
    if args.fixbase_epoch > 0:
        if hasattr(model, 'classifier') and isinstance(model.classifier,
                                                       nn.Module):
            optimizer_tmp = init_optim(args.optim,
                                       model.classifier.parameters(),
                                       args.fixbase_lr, args.weight_decay)
        else:
            print(
                "Warn: model has no attribute 'classifier' and fixbase_epoch is reset to 0"
            )
            args.fixbase_epoch = 0

    if args.load_weights:
        # load pretrained weights but ignore layers that don't match in size
        if check_isfile(args.load_weights):
            checkpoint = torch.load(args.load_weights)
            pretrain_dict = checkpoint['state_dict']
            model_dict = model.state_dict()
            pretrain_dict = {
                k: v
                for k, v in pretrain_dict.items()
                if k in model_dict and model_dict[k].size() == v.size()
            }
            model_dict.update(pretrain_dict)
            model.load_state_dict(model_dict)
            print("Loaded pretrained weights from '{}'".format(
                args.load_weights))

    if args.resume:
        # full resume: weights plus start epoch and last rank-1
        if check_isfile(args.resume):
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            args.start_epoch = checkpoint['epoch']
            rank1 = checkpoint['rank1']
            print("Loaded checkpoint from '{}'".format(args.resume))
            print("- start_epoch: {}\n- rank1: {}".format(
                args.start_epoch, rank1))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        distmat = test(model,
                       queryloader,
                       galleryloader,
                       use_gpu,
                       return_distmat=True)
        if args.vis_ranked_res:
            visualize_ranked_results(
                distmat,
                dataset,
                save_dir=osp.join(args.save_dir, 'ranked_results'),
                topk=20,
            )
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    if args.fixbase_epoch > 0:
        print(
            "Train classifier for {} epochs while keeping base network frozen".
            format(args.fixbase_epoch))
        for epoch in range(args.fixbase_epoch):
            start_train_time = time.time()
            train(epoch,
                  model,
                  criterion,
                  optimizer_tmp,
                  trainloader,
                  use_gpu,
                  freeze_bn=True)
            train_time += round(time.time() - start_train_time)
        del optimizer_tmp
        print("Now open all layers for training")

    for epoch in range(args.start_epoch, args.max_epoch):
        start_train_time = time.time()
        train(epoch, model, criterion, optimizer, trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)
        scheduler.step()
        # NOTE: `and` binds tighter than `or` — the last epoch is always
        # evaluated regardless of start_eval/eval_step.
        if (epoch + 1) > args.start_eval and args.eval_step > 0 and (
                epoch + 1) % args.eval_step == 0 or (epoch +
                                                     1) == args.max_epoch:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
# Single SGD optimizer over both the backbone and the metric head; when
# starting from pretrained backbone weights the metric head trains with a
# 10x larger learning rate.
metric_lr = 10 * args.lr if args.pretrained else args.lr
trainable = lambda module: filter(lambda p: p.requires_grad,
                                  module.parameters())
optimizer = torch.optim.SGD(
    [{'params': trainable(model), 'lr': args.lr},
     {'params': trainable(metric), 'lr': metric_lr}],
    weight_decay=args.weight_decay,
    momentum=args.momentum)
print('init_lr={}, weight_decay={}, momentum={}'.format(
    args.lr, args.weight_decay, args.momentum))

# learning-rate schedule
if args.scheduler == 'step':
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.lr_step,
                                    gamma=args.lr_gamma,
                                    last_epoch=-1)
elif args.scheduler == 'multi':
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[150, 225],
                                         gamma=args.lr_gamma,
                                         last_epoch=-1)

# savepath
name_parts = [os.path.join(args.savepath, args.model_name),
              args.metric,
              args.loss,
              str(args.inp_size),
              args.scheduler]
savepath = '_'.join(name_parts)
if args.seed is not None:
    savepath += '_s' + str(args.seed)
if not args.pretrained:
    savepath += '_' + str(args.eps)
print('savepath:', savepath)
def create_optimizer(optimizer_config, model):
    """Creates optimizer and schedule from configuration

    Parameters
    ----------
    optimizer_config : dict
        Dictionary containing the configuration options for the optimizer.
        Keys used: "classifier_lr" (-1 disables the classifier/base split),
        "type" ("SGD" or "Adam"), "learning_rate", "weight_decay",
        "momentum"/"nesterov" (SGD only), and "schedule" with "type"
        ("step", "multistep", "exponential", "constant" or "linear") and
        "params" forwarded to the scheduler constructor.
    model : Model
        The network model.

    Returns
    -------
    optimizer : Optimizer
        The optimizer.
    scheduler : LRScheduler
        The learning rate scheduler.

    Raises
    ------
    KeyError
        If the optimizer type or the schedule type is not recognized.
    """
    if optimizer_config["classifier_lr"] != -1:
        # Separate classifier parameters from all others; any parameter whose
        # name contains "fc" is treated as part of the classifier head.
        net_params = []
        classifier_params = []
        for k, v in model.named_parameters():
            if "fc" in k:
                classifier_params.append(v)
            else:
                net_params.append(v)
        params = [
            {"params": net_params},
            {"params": classifier_params,
             "lr": optimizer_config["classifier_lr"]},
        ]
    else:
        params = model.parameters()

    if optimizer_config["type"] == "SGD":
        optimizer = optim.SGD(params,
                              lr=optimizer_config["learning_rate"],
                              momentum=optimizer_config["momentum"],
                              weight_decay=optimizer_config["weight_decay"],
                              nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "Adam":
        optimizer = optim.Adam(params,
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    else:
        raise KeyError("unrecognized optimizer {}".format(
            optimizer_config["type"]))

    schedule_type = optimizer_config["schedule"]["type"]
    if schedule_type == "step":
        scheduler = lr_scheduler.StepLR(
            optimizer, **optimizer_config["schedule"]["params"])
    elif schedule_type == "multistep":
        scheduler = lr_scheduler.MultiStepLR(
            optimizer, **optimizer_config["schedule"]["params"])
    elif schedule_type == "exponential":
        scheduler = lr_scheduler.ExponentialLR(
            optimizer, **optimizer_config["schedule"]["params"])
    elif schedule_type == "constant":
        scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0)
    elif schedule_type == "linear":
        # LR multiplier grows linearly with the iteration count
        def linear_lr(it):
            return it * optimizer_config["schedule"]["params"]["alpha"] + \
                optimizer_config["schedule"]["params"]["beta"]

        scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr)
    else:
        # FIX: the original fell through silently for unknown schedule types
        # and raised a confusing NameError on `scheduler` at the return below.
        raise KeyError("unrecognized schedule {}".format(schedule_type))

    return optimizer, scheduler
def main():
    """Train or evaluate a re-id model through ImageDataManager.

    Reads the module-level ``args``: builds data loaders, model, criterion,
    regularizer, optimizer and scheduler; optionally loads/resumes weights;
    then evaluates once or runs the training loop (optional fixbase warm-up),
    saving a checkpoint every epoch and the best-rank-1 model separately.
    """
    global args
    torch.manual_seed(args.seed)
    if not args.use_avai_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False
    # tee both stdout and stderr into a per-mode log file
    log_name = 'log_test.txt' if args.evaluate else 'log_train.txt'
    sys.stderr = sys.stdout = Logger(osp.join(args.save_dir, log_name))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU, however, GPU is highly recommended")

    print("Initializing image data manager")
    dm = ImageDataManager(use_gpu, **image_dataset_kwargs(args))
    trainloader, testloader_dict = dm.return_dataloaders()

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dm.num_train_pids,
                              loss={'xent'},
                              use_gpu=use_gpu,
                              args=vars(args))
    print(model)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    criterion = get_criterion(dm.num_train_pids, use_gpu, args)
    regularizer = get_regularizer(vars(args))
    optimizer = init_optimizer(model.parameters(), **optimizer_kwargs(args))
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)

    if args.load_weights and check_isfile(args.load_weights):
        # load pretrained weights but ignore layers that don't match in size
        try:
            checkpoint = torch.load(args.load_weights)
        except Exception as e:
            # fall back to CPU mapping when the checkpoint was saved on GPU
            print(e)
            checkpoint = torch.load(args.load_weights,
                                    map_location={'cuda:0': 'cpu'})
        pretrain_dict = checkpoint['state_dict']
        model_dict = model.state_dict()
        pretrain_dict = {
            k: v
            for k, v in pretrain_dict.items()
            if k in model_dict and model_dict[k].size() == v.size()
        }
        model_dict.update(pretrain_dict)
        model.load_state_dict(model_dict)
        print("Loaded pretrained weights from '{}'".format(args.load_weights))

    if args.resume and check_isfile(args.resume):
        # resume loads weights only; the start-epoch restore is disabled below
        checkpoint = torch.load(args.resume)
        state = model.state_dict()
        state.update(checkpoint['state_dict'])
        model.load_state_dict(state)
        # args.start_epoch = checkpoint['epoch'] + 1
        print("Loaded checkpoint from '{}'".format(args.resume))
        print("- start_epoch: {}\n- rank1: {}".format(args.start_epoch,
                                                      checkpoint['rank1']))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        for name in args.target_names:
            print("Evaluating {} ...".format(name))
            # loaders are (normal, horizontally-flipped) pairs
            queryloader = testloader_dict[name]['query'], testloader_dict[
                name]['query_flip']
            galleryloader = testloader_dict[name]['gallery'], testloader_dict[
                name]['gallery_flip']
            distmat = test(model,
                           queryloader,
                           galleryloader,
                           use_gpu,
                           return_distmat=True)
            if args.visualize_ranks:
                visualize_ranked_results(distmat,
                                         dm.return_testdataset_by_name(name),
                                         save_dir=osp.join(
                                             args.save_dir, 'ranked_results',
                                             name),
                                         topk=20)
        return

    start_time = time.time()
    ranklogger = RankLogger(args.source_names, args.target_names)
    train_time = 0
    print("==> Start training")

    if args.fixbase_epoch > 0:
        # warm-up phase: temporarily clear the 'sa' env var (restored after)
        # and train with most layers frozen; optimizer state is snapshotted
        # and restored so the warm-up does not pollute the main run.
        oldenv = os.environ.get('sa', '')
        os.environ['sa'] = ''
        print(
            "Train {} for {} epochs while keeping other layers frozen".format(
                args.open_layers, args.fixbase_epoch))
        initial_optim_state = optimizer.state_dict()

        for epoch in range(args.fixbase_epoch):
            start_train_time = time.time()
            train(epoch,
                  model,
                  criterion,
                  regularizer,
                  optimizer,
                  trainloader,
                  use_gpu,
                  fixbase=True)
            train_time += round(time.time() - start_train_time)

        print("Done. All layers are open to train for {} epochs".format(
            args.max_epoch))
        optimizer.load_state_dict(initial_optim_state)
        os.environ['sa'] = oldenv

    max_r1 = 0

    for epoch in range(args.start_epoch, args.max_epoch):
        start_train_time = time.time()
        print(epoch)
        print(criterion)
        train(epoch,
              model,
              criterion,
              regularizer,
              optimizer,
              trainloader,
              use_gpu,
              fixbase=False)
        train_time += round(time.time() - start_train_time)

        # unconditional per-epoch checkpoint (rank1 not evaluated here)
        if use_gpu:
            state_dict = model.module.state_dict()
        else:
            state_dict = model.state_dict()
        save_checkpoint(
            {
                'state_dict': state_dict,
                'rank1': 0,
                'epoch': epoch,
            }, False,
            osp.join(args.save_dir,
                     'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

        scheduler.step()

        # NOTE: `and` binds tighter than `or` — the last epoch is always
        # evaluated regardless of start_eval/eval_freq.
        if (epoch + 1) > args.start_eval and args.eval_freq > 0 and (
                epoch + 1) % args.eval_freq == 0 or (epoch +
                                                     1) == args.max_epoch:
            print("==> Test")
            for name in args.target_names:
                print("Evaluating {} ...".format(name))
                queryloader = testloader_dict[name]['query'], testloader_dict[
                    name]['query_flip']
                galleryloader = testloader_dict[name][
                    'gallery'], testloader_dict[name]['gallery_flip']
                rank1 = test(model, queryloader, galleryloader, use_gpu)
                ranklogger.write(name, epoch + 1, rank1)

                if use_gpu:
                    state_dict = model.module.state_dict()
                else:
                    state_dict = model.state_dict()

                # keep a separate best-so-far checkpoint
                if max_r1 < rank1:
                    print('Save!', max_r1, rank1)
                    save_checkpoint(
                        {
                            'state_dict': state_dict,
                            'rank1': rank1,
                            'epoch': epoch,
                        }, False,
                        osp.join(args.save_dir, 'checkpoint_best.pth.tar'))
                    max_r1 = rank1

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
    ranklogger.show_summary()
def __init__(self, opt):
    """Build the ICPR model: networks, losses, optimizers and schedulers.

    ``opt`` is a nested options dict; ``opt['train']`` holds training
    hyper-parameters. Discriminator/auxiliary nets, losses and optimizers
    are only created when ``self.is_train`` is set by the base class.
    """
    super(ICPR_model, self).__init__(opt)
    train_opt = opt['train']

    # define networks and load pretrained models
    self.netG = networks.define_G1(opt).to(self.device)  # G1
    if self.is_train:
        self.netV = networks.define_D(opt).to(self.device)  # G1
        self.netD = networks.define_D2(opt).to(self.device)
        # NOTE(review): netQ is created but, unlike netG/netV/netD, never
        # switched to train mode here — confirm that is intentional.
        self.netQ = networks.define_Q(opt).to(self.device)
        self.netG.train()
        self.netV.train()
        self.netD.train()
    self.load()  # load G and D if needed

    # define losses, optimizer and scheduler
    if self.is_train:
        # G pixel loss
        if train_opt['pixel_weight'] > 0:
            l_pix_type = train_opt['pixel_criterion']
            if l_pix_type == 'l1':
                self.cri_pix = nn.L1Loss().to(self.device)
            elif l_pix_type == 'l2':
                self.cri_pix = nn.MSELoss().to(self.device)
            else:
                raise NotImplementedError(
                    'Loss type [{:s}] not recognized.'.format(l_pix_type))
            self.l_pix_w = train_opt['pixel_weight']
        else:
            logger.info('Remove pixel loss.')
            self.cri_pix = None

        # fixed loss weights (hard-coded, not read from opt)
        self.weight_kl = 1e-2
        self.weight_D = 1e-4
        self.l_gan_w = 1e-4

        # G feature loss
        if train_opt['feature_weight'] > 0:
            l_fea_type = train_opt['feature_criterion']
            if l_fea_type == 'l1':
                self.cri_fea = nn.L1Loss().to(self.device)
            elif l_fea_type == 'l2':
                self.cri_fea = nn.MSELoss().to(self.device)
            else:
                raise NotImplementedError(
                    'Loss type [{:s}] not recognized.'.format(l_fea_type))
            self.l_fea_w = train_opt['feature_weight']
        else:
            logger.info('Remove feature loss.')
            self.cri_fea = None
        if self.cri_fea:  # load VGG perceptual loss
            self.netF = networks.define_F(opt, use_bn=False, Rlu=True).to(
                self.device)  # Rlu=True if feature taken before relu, else false

        self.cri_gan = GANLoss(train_opt['gan_type'], 1.0, 0.0).to(self.device)

        # optimizers
        # G
        wd_G = train_opt['weight_decay_G'] if train_opt['weight_decay_G'] else 0
        optim_params = []
        for k, v in self.netG.named_parameters():  # can optimize for a part of the model
            if v.requires_grad:
                optim_params.append(v)
            else:
                logger.warning('Params [{:s}] will not optimize.'.format(k))
        self.optimizer_G = torch.optim.Adam(optim_params, lr=train_opt['lr_G'], \
            weight_decay=wd_G, betas=(train_opt['beta1_G'], 0.999))
        self.optimizers.append(self.optimizer_G)
        # D — note netV deliberately shares the D learning-rate/decay settings
        wd_D = train_opt['weight_decay_D'] if train_opt['weight_decay_D'] else 0
        self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=train_opt['lr_D'], \
            weight_decay=wd_D, betas=(train_opt['beta1_D'], 0.999))
        self.optimizers.append(self.optimizer_D)
        self.optimizer_V = torch.optim.Adam(self.netV.parameters(), lr=train_opt['lr_D'], \
            weight_decay=wd_D, betas=(train_opt['beta1_D'], 0.999))
        self.optimizers.append(self.optimizer_V)

        # schedulers — one MultiStepLR per optimizer
        if train_opt['lr_scheme'] == 'MultiStepLR':
            for optimizer in self.optimizers:
                self.schedulers.append(lr_scheduler.MultiStepLR(optimizer, \
                    train_opt['lr_steps'], train_opt['lr_gamma']))
        else:
            raise NotImplementedError(
                'MultiStepLR learning rate scheme is enough.')

        self.log_dict = OrderedDict()

    # print network
    self.print_network()
if __name__ == '__main__':
    # resolve the experiment workspace and set up file logging
    args.workspace = os.path.join(args.workspace, args.exp_name)
    os.makedirs(args.workspace, exist_ok=True)
    logger = setup_logger(os.path.join(args.workspace, 'train_icdar15_log'))

    criterion = Loss()
    device = torch.device("cuda")
    model = EAST()
    data_parallel = False
    if torch.cuda.device_count() > 1:
        # wrap for multi-GPU; flag remembered so checkpoints can be unwrapped
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # decay the LR by 10x halfway through training
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[args.epoch_iter // 2],
                                         gamma=0.1)

    # generate the first round of pseudo-labels
    # FIX: logger.info is not print() — the original passed args.resume as a
    # %-format argument with no placeholder in the message, so the checkpoint
    # path was silently dropped from the log.
    logger.info("loading pretrained model from %s", args.resume)
    # model.load_state_dict(torch.load(args.resume))

    # target domain
    trainset = ICDAR15(args.train_data, args.train_gt)
    train_loader_target = data.DataLoader(trainset,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          num_workers=args.num_workers,
                                          drop_last=True)
def main(opt):
    """Train an ImageNet-style classifier (VGG19 or a ResNet) on an ImageFolder
    dataset, interleaving fractional validation passes with training.

    opt carries: lr, lr_steps, gpu_idx, batch_size, arch, logdir, n_epochs,
    dataset_path, data_sampler, refine, refine_epoch.

    FIX: the original had a bare `model_ft.train` (attribute access, no call) at
    the top of the batch loop. Because validation switches the model to eval()
    mid-epoch, the missing parentheses meant all training after the first
    validation step ran with BatchNorm/Dropout in eval mode. It is now called.
    """
    lr1 = opt.lr
    lr_steps = opt.lr_steps
    gpu_idx = opt.gpu_idx
    batch_size = opt.batch_size
    arch = opt.arch
    logdir = os.path.join(opt.logdir, arch)
    os.makedirs(logdir, exist_ok=True)
    n_epochs = opt.n_epochs
    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))

    if not gpu_idx == 999:
        # NOTE(review): setting CUDA_VISIBLE_DEVICES after torch import is
        # typically non-functional (CUDA may already be initialized).
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_idx)
        torch.cuda.set_device(0)
    device = torch.device("cuda:0")
    best_prec1 = 0

    # Input pipeline: resize then center-crop to 224.
    res = 256
    center_crop = 224
    train_transform = transforms.Compose([
        transforms.Resize(res),
        transforms.CenterCrop(center_crop),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),  # input dtype is PIL Image
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    val_transform = transforms.Compose([
        transforms.Resize(res),
        transforms.CenterCrop(center_crop),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # Data loaders
    dataset_path = opt.dataset_path
    training_set = torchvision.datasets.ImageFolder(
        os.path.join(dataset_path, 'train'), transform=train_transform)
    if opt.data_sampler == 'weighted':
        # Oversample minority classes with a weighted sampler.
        train_sampler_weights = make_weights_for_balanced_classes(
            training_set.imgs, len(training_set.classes))
        train_sampler_weights = torch.DoubleTensor(train_sampler_weights)
        train_sampler = torch.utils.data.sampler.WeightedRandomSampler(
            train_sampler_weights, len(train_sampler_weights))
        train_loader = data.DataLoader(training_set, sampler=train_sampler,
                                       batch_size=batch_size, num_workers=8,
                                       pin_memory=True)
    else:
        train_loader = data.DataLoader(training_set, batch_size=batch_size,
                                       num_workers=5, pin_memory=True)
    val_set = torchvision.datasets.ImageFolder(
        os.path.join(dataset_path, 'test'), transform=val_transform)
    val_loader = data.DataLoader(val_set, batch_size=batch_size, shuffle=True,
                                 num_workers=5, pin_memory=True)

    num_classes = 33

    # Model: replace the final layer to match num_classes.
    if opt.arch == 'vgg':
        model_ft = models.__dict__['vgg19'](pretrained=True)
        model_ft.classifier[6] = nn.Linear(4096, num_classes)
    elif 'resnet' in opt.arch:
        model_ft = models.__dict__[opt.arch](pretrained=True)
        if opt.arch == 'resnet18' or opt.arch == 'resnet34':
            model_ft.fc = nn.Linear(512, num_classes)
        elif opt.arch == 'resnet50' or opt.arch == 'resnet101' or opt.arch == 'resnet152':
            # (removed a redundant second construction of the same pretrained model)
            model_ft.fc = nn.Linear(2048, num_classes)
    else:
        raise ValueError("unsupported architecture")

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model_ft = nn.DataParallel(model_ft)

    # Optionally resume ("refine") from a saved epoch checkpoint.
    if opt.refine:
        if opt.refine_epoch == 0:
            raise ValueError(
                "You set the refine epoch to 0. No need to refine, just retrain."
            )
        refine_model_filename = os.path.join(
            logdir, 'classifier{}.pth'.format(opt.refine_epoch))
        model_ft.load_state_dict(torch.load(refine_model_filename))

    model_ft.to(device)
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model_ft.parameters(), lr=lr1, momentum=0.9,
                          weight_decay=5e-4)
    scheduler = lr_scheduler.MultiStepLR(
        optimizer, milestones=opt.lr_steps,
        gamma=0.1)  # milestones in number of optimizer iterations

    refine_flag = True
    for epoch in range(1, n_epochs):
        # When refining, fast-forward the scheduler through already-trained epochs.
        if epoch <= opt.refine_epoch and opt.refine and refine_flag:
            scheduler.step()
            continue
        else:
            refine_flag = False

        train_fraction_done = 0.0
        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(val_loader, 0)

        # train for one epoch
        losses = utils.AverageMeter()
        top1 = utils.AverageMeter()
        top3 = utils.AverageMeter()
        model_ft.train()
        optimizer.zero_grad()
        train_num_batch = len(train_loader)
        test_num_batch = len(val_loader)

        for train_batchind, (im_data, im_class) in enumerate(train_loader):
            # FIX: was `model_ft.train` (no-op). Needed because the validation
            # branch below leaves the model in eval mode.
            model_ft.train()
            im_data = im_data.to(device)
            im_class = im_class.to(device)
            optimizer.zero_grad()
            output = model_ft(im_data)
            # measure accuracy and record loss
            prec1, prec3 = utils.accuracy(output.data.detach(), im_class,
                                          topk=(1, 3))
            loss = criterion(output, im_class)
            loss.backward()  # compute gradient and do SGD step
            optimizer.step()
            losses.update(loss.item(), im_data.size(0))
            top1.update(prec1.item(), im_data.size(0))
            top3.update(prec3.item(), im_data.size(0))
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@3 {top3.val:.3f} ({top3.avg:.3f})'.format(
                      epoch, train_batchind + 1, len(train_loader) + 1,
                      loss=losses, top1=top1, top3=top3))

            train_fraction_done = (train_batchind + 1) / train_num_batch
            step = (epoch + train_fraction_done) * train_num_batch * batch_size
            train_writer.add_scalar('loss', losses.val, step)
            train_writer.add_scalar('top1', top1.val, step)
            train_writer.add_scalar('top3', top3.val, step)

            # evaluate on a fraction of the validation set, paced so validation
            # progress tracks training progress within the epoch
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                test_losses = utils.AverageMeter()
                test_top1 = utils.AverageMeter()
                test_top3 = utils.AverageMeter()
                # switch to evaluate mode
                model_ft.eval()
                test_batchind, (im_data, im_class) = next(test_enum)
                with torch.no_grad():
                    im_data = im_data.to(device)
                    im_class = im_class.to(device)
                    # compute output
                    output = model_ft(im_data)
                    test_loss = criterion(output, im_class)
                    # measure accuracy and record loss
                    prec1, prec3 = utils.accuracy(output.data, im_class,
                                                  topk=(1, 3))
                    test_losses.update(test_loss.item(), im_data.size(0))
                    test_top1.update(prec1.item(), im_data.size(0))
                    test_top3.update(prec3.item(), im_data.size(0))
                print('Test: [{0}/{1}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@3 {top3.val:.3f} ({top3.avg:.3f})\t'.format(
                          test_batchind, len(val_loader), loss=test_losses,
                          top1=test_top1, top3=test_top3))
                test_writer.add_scalar('loss', test_losses.val, step)
                test_writer.add_scalar('top1', test_top1.val, step)
                test_writer.add_scalar('top3', test_top3.val, step)
                test_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], step)
                test_fraction_done = (test_batchind + 1) / test_num_batch

        scheduler.step()

        # remember best prec@1 and save checkpoint
        # NOTE(review): prec1 here is whichever accuracy was computed last in
        # the loop (train or validation batch) — a per-batch value, not an
        # epoch average; kept as-is to preserve behavior.
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        if (epoch + 1) % 2 == 0:
            # save model every other epoch
            torch.save(model_ft.state_dict(),
                       os.path.join(logdir, 'classifier' + str(epoch) + '.pth'))
        if is_best:
            torch.save(model_ft.state_dict(),
                       os.path.join(logdir, 'best_classifier.pth'))
# NOTE(review): the next line is the tail of a DataLoader(...) call whose
# opening falls outside this chunk — kept exactly as found.
batch_size=BATCH_SIZE, shuffle=True, num_workers=1)

## Initializing r, theta — pole magnitudes/angles for the dictionary.
P, Pall = gridRing(N)
Drr = abs(P)
Drr = torch.from_numpy(Drr).float()
Dtheta = np.angle(P)
Dtheta = torch.from_numpy(Dtheta).float()
# What and where is gamma

## Create the model
model = OFModel(Drr, Dtheta, T, PRE, gpu_id)
model.cuda(gpu_id)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# if Kitti: milestones=[100,150], UCF [50,100]
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)
loss_mse = nn.MSELoss()
start_epoch = 1

## If want to continue training from a checkpoint
# Restores epoch counter, model weights and optimizer state.
if (load_ckpt):
    loadedcheckpoint = torch.load(ckpt_file)
    start_epoch = loadedcheckpoint['epoch']
    model.load_state_dict(loadedcheckpoint['state_dict'])
    optimizer.load_state_dict(loadedcheckpoint['optimizer'])

print("Training from epoch: ", start_epoch)
print('-' * 25)

start = time.time()
count = 0
print(nets)
gpus = [int(gpu) for gpu in args.gpus.split(',')]
if len(gpus) > 1:
    print("Using GPUs {}.".format(gpus))
    # NOTE(review): rebinding the loop variable does NOT replace the entries of
    # `nets` — the DataParallel wrappers are discarded and the nets stay
    # unwrapped. A real fix (e.g. a list comprehension) would also change the
    # state-dict key layout used by resume() below, so it is only flagged here.
    for net in nets:
        net = nn.DataParallel(net, device_ids=gpus)

# One Adam optimizer over the parameters of every net.
params = [{'params': net.parameters()} for net in nets]
solver = optim.Adam(params, lr=args.lr)

# LR schedule from a comma-separated milestone list.
milestones = [int(s) for s in args.schedule.split(',')]
scheduler = LS.MultiStepLR(solver, milestones=milestones, gamma=args.gamma)

if not os.path.exists(args.model_dir):
    print("Creating directory %s." % args.model_dir)
    os.makedirs(args.model_dir)

############### Checkpoints ###############

def resume(model_name, index):
    # Load per-net checkpoints named <model>_<net-name>_<8-digit index>.pth.
    # NOTE(review): the body visible here ends after building checkpoint_path —
    # the load itself is outside this chunk.
    names = ['encoder', 'binarizer', 'decoder', 'unet', 'd2']
    for net_idx, net in enumerate(nets):
        if net is not None:
            name = names[net_idx]
            checkpoint_path = '{}/{}_{}_{:08d}.pth'.format(
                args.model_dir, model_name, name, index)
def main(args):
    """Train and evaluate EnsembleRCNet for single-category part segmentation.

    args.dir selects the data projection direction; args.cat selects the
    ShapeNet-part category. Returns the last computed mean test IoU.
    """
    # ***** parameters *****
    batch_size = 8
    num_workers = 1
    num_epoch = 200
    resume_epoch = 0
    resume = False
    epoch_samples = 4421
    NUM_POINTS = 2048
    lr = 0.001
    weigh_decay = 1e-4
    milestones = [60]  # [30, 60]
    which_dir = args.dir
    OBJ_CLASS = [args.cat]

    # load data
    train_dataset = PartDataset(num_ptrs=NUM_POINTS, plane_num=32,
                                class_choice=OBJ_CLASS, random_selection=True,
                                random_jitter=True, random_scale=True,
                                random_translation=False, which_dir=which_dir,
                                split='trainval')
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=num_workers)
    test_dataset = PartDataset(num_ptrs=NUM_POINTS, plane_num=32,
                               class_choice=OBJ_CLASS, split='test',
                               which_dir=which_dir)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8,
                                                  shuffle=False,
                                                  num_workers=num_workers)
    print('Training set size:', len(train_dataset))
    print('Test set size:', len(test_dataset))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Map each global part label back to its category name.
    seg_classes = train_dataset.seg_classes
    seg_label_to_cat = {}  # {0:Airplane, 1:Airplane, ...49:Table}
    for cat in seg_classes.keys():
        for label in seg_classes[cat]:
            seg_label_to_cat[label] = cat
    NUM_CLASS = len(seg_classes[OBJ_CLASS[0]])  # parts in the chosen category

    # ***** specify model and log output directory *****
    time_stamp = time.strftime("%Y%m%d-%H%M%S")
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_out_dir = '/media/pwu/Data/saved_models/point_cloud/shapepart/RCNet/'
    log_out_dir = os.path.join(curr_dir, 'results')
    try:
        os.makedirs(log_out_dir)
    except OSError:
        pass  # directory already exists
    save_model_dir_root = check_dir(
        os.path.join(
            '/media/pwu/Data/saved_models/point_cloud/shapepart/RCNet/',
            'save_' + str(which_dir)))
    save_model_dir_class = check_dir(
        os.path.join(save_model_dir_root, OBJ_CLASS[0]))
    save_model_dir = check_dir(os.path.join(save_model_dir_class, time_stamp))

    # ***** specify logger *****
    # log_dir = os.path.join(log_out_dir, 'log-' + time_stamp + '.txt')
    # logging.basicConfig(level=logging.INFO,
    #                     format='%(asctime)s %(message)s',
    #                     filename=log_dir,
    #                     filemode='w')
    save_weights_name = time_stamp

    # ***** build model *****
    classifier = EnsembleRCNet(device, which_dir, NUM_CLASS, NUM_POINTS)
    print(classifier)
    temp = sum(p.numel() for p in classifier.parameters() if p.requires_grad)
    print("num_parameter", temp)

    # ***** load existing model *****
    model_path = os.path.join(model_out_dir,
                              'cls_model_' + str(resume_epoch) + '.pth')
    if model_path != '' and resume is True:
        classifier.load_state_dict(torch.load(model_path))

    # ***** define optimizer *****
    optimizer = optim.Adam(classifier.parameters(), lr=lr,
                           weight_decay=weigh_decay, amsgrad=False)
    classifier.to(device)

    # ***** scheduler *****
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer,
                                                milestones=milestones,
                                                gamma=0.1)
    # exp_lr_scheduler = CosineLRWithRestarts(optimizer, batch_size, epoch_samples, restart_period=5, t_mult=2)

    num_batch = len(train_dataset) / batch_size
    if resume:
        start_epoch = resume_epoch + 1
    else:
        start_epoch = 0
    curr_shape_ious = None

    for epoch in range(start_epoch, num_epoch):
        # NOTE(review): stepping the scheduler before training each epoch is
        # the pre-PyTorch-1.1 calling convention; kept as-is.
        exp_lr_scheduler.step()
        classifier.train()

        # statistic data
        single_shape_ious = []
        for b, data in enumerate(train_dataloader):
            target, points, quantiles, ori_points_num, gather_idx, ori_point_idx = data
            # shift labels so the category's first part label becomes 0
            target = target - seg_classes[OBJ_CLASS[0]][0]
            target = target.to(device)
            points = points.to(device)
            # ***************************************************************
            # first, prepare the input to rnn
            seq_data, seq_len, inverse_index = prepare_input_first_level(
                points, quantiles)
            seq_data = torch.from_numpy(seq_data.astype(np.float32))
            seq_data = seq_data.to(device)
            # next, prepare for the data index for convolution:
            # flat indices (into the batch x plane x plane grid) of non-empty cells
            batch_num = quantiles.shape[0]
            plane_num = quantiles.shape[1]
            items_indices = np.array([], dtype=np.int32)
            cnt = 0
            for i in range(batch_num):
                plane_slice = []  # NOTE(review): built but never used
                for j in range(plane_num):
                    item = []  # NOTE(review): built but never used
                    for k in range(plane_num):
                        num = quantiles[i, j, k]
                        if num != 0:
                            items_indices = np.append(items_indices, cnt)
                        cnt = cnt + 1
            # ***************************************************************
            optimizer.zero_grad()
            pred = classifier(points, quantiles, seq_data, seq_len,
                              inverse_index, items_indices, gather_idx,
                              ori_point_idx)
            loss = F.cross_entropy(
                pred.view(-1, NUM_CLASS), target.view(-1)
            )  # should use nll_loss, but seems like there is no difference?
            loss.backward()
            optimizer.step()

            # compute ious (per shape, averaged over the category's parts)
            cur_pred_val_logits = pred.data.cpu().numpy()
            cur_pred_val = np.zeros(
                (pred.size(0), NUM_POINTS)).astype(np.int32)
            # NOTE(review): squeeze().tolist() assumes batch size > 1; a final
            # partial batch of size 1 would yield a scalar — confirm drop_last.
            ori_points_num = ori_points_num.numpy().squeeze().tolist()
            target = target.data.cpu().numpy()
            for i in range(pred.size(0)):
                logits = cur_pred_val_logits[i, :, :]
                cur_pred_val[i, 0:ori_points_num[i]] = np.argmax(
                    logits, 1)[0:ori_points_num[i]]
            for i in range(pred.size(0)):
                segp = cur_pred_val[i, 0:ori_points_num[i]]
                segl = target[i, 0:ori_points_num[i]]
                cat = OBJ_CLASS[0]
                part_ious = [0.0 for _ in range(NUM_CLASS)]
                for l in range(NUM_CLASS):
                    if (np.sum(segl == l) == 0) and (
                            np.sum(segp == l) == 0
                    ):  # part is not present, no prediction as well
                        part_ious[l] = 1.0
                    else:
                        part_ious[l] = np.sum(
                            (segl == l) & (segp == l)) / float(
                                np.sum((segl == l) | (segp == l)))
                single_shape_ious.append(np.mean(part_ious))
            curr_shape_ious = np.mean(single_shape_ious)
            msg = '[{0:d}: {1:d}/{2:d}] mean IoUs: {3:f}'.format(
                epoch, b, trunc(num_batch), curr_shape_ious)
            print(msg)

        curr_shape_ious = np.mean(single_shape_ious)
        msg = '*** train epoch {}, mean IoUs: {}'.format(
            epoch, curr_shape_ious)
        # logging.info(msg)
        print(msg)

        # evaluate (same preprocessing and IoU bookkeeping as training)
        single_shape_ious = []
        classifier.eval()
        ttime = []  # per-batch forward-pass timings
        for b, data in enumerate(test_dataloader):
            target, points, quantiles, ori_points_num, gather_idx, ori_point_idx = data
            target = target - seg_classes[OBJ_CLASS[0]][0]
            target = target.to(device)
            points = points.to(device)
            # ***************************************************************
            # first, prepare the input to rnn
            seq_data, seq_len, inverse_index = prepare_input_first_level(
                points, quantiles)
            seq_data = torch.from_numpy(seq_data.astype(np.float32))
            seq_data = seq_data.to(device)
            # next, prepare for the data index for convolution
            batch_num = quantiles.shape[0]
            plane_num = quantiles.shape[1]
            items_indices = np.array([], dtype=np.int32)
            cnt = 0
            for i in range(batch_num):
                plane_slice = []
                for j in range(plane_num):
                    item = []
                    for k in range(plane_num):
                        num = quantiles[i, j, k]
                        if num != 0:
                            items_indices = np.append(items_indices, cnt)
                        cnt = cnt + 1
            # ***************************************************************
            start = timeit.default_timer()
            pred = classifier(points, quantiles, seq_data, seq_len,
                              inverse_index, items_indices, gather_idx,
                              ori_point_idx)
            stop = timeit.default_timer()
            print("time >>", stop - start)
            ttime.append(stop - start)

            # compute ious
            cur_pred_val_logits = pred.data.cpu().numpy()
            cur_pred_val = np.zeros(
                (pred.size(0), NUM_POINTS)).astype(np.int32)
            ori_points_num = ori_points_num.numpy().squeeze().tolist()
            target = target.data.cpu().numpy()
            for i in range(pred.size(0)):
                logits = cur_pred_val_logits[i, :, :]
                cur_pred_val[i, 0:ori_points_num[i]] = np.argmax(
                    logits, 1)[0:ori_points_num[i]]
            for i in range(pred.size(0)):
                segp = cur_pred_val[i, 0:ori_points_num[i]]
                segl = target[i, 0:ori_points_num[i]]
                cat = OBJ_CLASS[0]
                part_ious = [0.0 for _ in range(NUM_CLASS)]
                for l in range(NUM_CLASS):
                    if (np.sum(segl == l) == 0) and (
                            np.sum(segp == l) == 0
                    ):  # part is not present, no prediction as well
                        part_ious[l] = 1.0
                    else:
                        part_ious[l] = np.sum(
                            (segl == l) & (segp == l)) / float(
                                np.sum((segl == l) | (segp == l)))
                single_shape_ious.append(np.mean(part_ious))

        curr_shape_ious = np.mean(single_shape_ious)
        msg = '*** Test mean IoUs: {0:f}'.format(curr_shape_ious)
        # logging.info(msg)
        print(msg)
        # if epoch % 10 == 0:
        #     torch.save(classifier.state_dict(), '{}/{}.pth'.format(save_model_dir, curr_shape_ious))
        #     logging.info(msg)
        # torch.save(classifier.state_dict(), '%s/cls_model_%d.pth' % (model_out_dir, epoch))

    return curr_shape_ious
# {'params': model.classifierD0.parameters(), 'lr': opt.lr}, # {'params': model.classifierB3.parameters(), 'lr': opt.lr}, # {'params': model.classifierB4.parameters(), 'lr': opt.lr}, # {'params': model.classifierB5.parameters(), 'lr': opt.lr}, # {'params': model.classifierC2.parameters(), 'lr': opt.lr}, # {'params': model.classifierC3.parameters(), 'lr': opt.lr}, # {'params': model.classifier.parameters(), 'lr': opt.lr}, # ], weight_decay=5e-4, momentum=0.9, nesterov=True) # Decay LR by a factor of 0.1 every 40 epochs exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[40, 70], gamma=0.1) ###################################################################### # Train and evaluate # -------- # dir_name = os.path.join('./logs', opt.name) if not os.path.isdir(dir_name): os.mkdir(dir_name) # record every run copyfile('./train_irid.py', dir_name + '/train_irid.py') copyfile('models/base_model.py', dir_name + '/base_model.py') if opt.LSTM: copyfile('models/lstm_model.py', dir_name + '/lstm_model.py')
# Fine-tune an ImageNet-pretrained ResNet-18 for 6 classes.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 6)

if use_gpu:
    model_ft = model_ft.cuda()

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.01, momentum=0.9)

# (Alternative: decay LR by 0.1 every 7 epochs.)
# exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
# Active schedule: halve the LR at epochs 55 and 75.
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft, [55, 75],
                                            gamma=0.5, last_epoch=-1)
'''
Result recording
'''
result_dir = 'result/1234'  # folder
if not os.path.exists(result_dir):
    os.makedirs(result_dir)
# image sample display

model_ft = train_model(model_ft, optimizer_ft, exp_lr_scheduler, num_epochs=100)
def __init__(self, opt):
    """Build the SRFeat model: generator, two discriminators, losses,
    optimizers and LR schedulers.

    opt is a nested option dict; only opt['train'] is read here. Training-only
    members are created when self.is_train is set by the base class.
    """
    super(SRFeatModel, self).__init__(opt)
    train_opt = opt['train']
    # define networks and load pretrained models
    self.netG = networks.define_G(opt).to(self.device)  # G
    if self.is_train:
        self.netD1 = networks.define_D(opt).to(self.device)   # image discriminator
        self.netD2 = networks.define_DF(opt).to(self.device)  # feature discriminator
        self.netG.train()
        self.netD1.train()
        self.netD2.train()
    self.load()  # load pretrained G and D weights if paths are configured

    # define losses, optimizer and scheduler
    if self.is_train:
        # G pixel loss: 'l1' or 'l2' selected by config; weight of 0 disables it.
        if train_opt['pixel_weight'] > 0:
            l_pix_type = train_opt['pixel_criterion']
            if l_pix_type == 'l1':
                self.cri_pix = nn.L1Loss().to(self.device)
            elif l_pix_type == 'l2':
                self.cri_pix = nn.MSELoss().to(self.device)
            else:
                raise NotImplementedError('Loss type [{:s}] not recognized.'.format(l_pix_type))
            self.l_pix_w = train_opt['pixel_weight']
        else:
            logger.info('Remove pixel loss.')
            self.cri_pix = None
        # G feature (perceptual) loss, same selection scheme.
        if train_opt['feature_weight'] > 0:
            l_fea_type = train_opt['feature_criterion']
            if l_fea_type == 'l1':
                self.cri_fea = nn.L1Loss().to(self.device)
            elif l_fea_type == 'l2':
                self.cri_fea = nn.MSELoss().to(self.device)
            else:
                raise NotImplementedError('Loss type [{:s}] not recognized.'.format(l_fea_type))
            self.l_fea_w = train_opt['feature_weight']
        else:
            logger.info('Remove feature loss.')
            self.cri_fea = None
        if self.cri_fea:  # load VGG feature extractor for the perceptual loss
            self.netF = networks.define_F(opt, use_bn=False).to(self.device)
        # GD gan loss
        self.cri_gan = GANLoss(train_opt['gan_type'], 1.0, 0.0).to(self.device)
        self.l_gan_w = train_opt['gan_weight']
        # D_update_ratio and D_init_iters are for WGAN
        self.D_update_ratio = train_opt['D_update_ratio'] if train_opt['D_update_ratio'] else 1
        self.D_init_iters = train_opt['D_init_iters'] if train_opt['D_init_iters'] else 0
        if train_opt['gan_type'] == 'wgan-gp':
            self.random_pt = torch.Tensor(1, 1, 1, 1).to(self.device)
            # gradient penalty loss
            self.cri_gp = GradientPenaltyLoss(device=self.device).to(self.device)
            # NOTE(review): 'gp_weigth' looks like a typo for 'gp_weight' but
            # must match the config key — verify against the options file.
            self.l_gp_w = train_opt['gp_weigth']

        # optimizers
        # G — only parameters with requires_grad are optimized
        wd_G = train_opt['weight_decay_G'] if train_opt['weight_decay_G'] else 0
        optim_params = []
        for k, v in self.netG.named_parameters():  # can optimize for a part of the model
            if v.requires_grad:
                optim_params.append(v)
            else:
                logger.warning('Params [{:s}] will not optimize.'.format(k))
        self.optimizer_G = torch.optim.Adam(optim_params, lr=train_opt['lr_G'],
                                            weight_decay=wd_G,
                                            betas=(train_opt['beta1_G'], 0.999))
        self.optimizers.append(self.optimizer_G)
        # D1 and D2 share the D learning-rate/weight-decay settings
        wd_D = train_opt['weight_decay_D'] if train_opt['weight_decay_D'] else 0
        self.optimizer_D1 = torch.optim.Adam(self.netD1.parameters(), lr=train_opt['lr_D'],
                                             weight_decay=wd_D,
                                             betas=(train_opt['beta1_D'], 0.999))
        self.optimizers.append(self.optimizer_D1)
        self.optimizer_D2 = torch.optim.Adam(self.netD2.parameters(), lr=train_opt['lr_D'],
                                             weight_decay=wd_D,
                                             betas=(train_opt['beta1_D'], 0.999))
        self.optimizers.append(self.optimizer_D2)

        # schedulers — one MultiStepLR per optimizer; only this scheme is supported
        if train_opt['lr_scheme'] == 'MultiStepLR':
            for optimizer in self.optimizers:
                self.schedulers.append(lr_scheduler.MultiStepLR(optimizer,
                                       train_opt['lr_steps'], train_opt['lr_gamma']))
        else:
            raise NotImplementedError('MultiStepLR learning rate scheme is enough.')

        self.log_dict = OrderedDict()
    # print network
    self.print_network()
#checkpoint = torch.load('./model/model_mulstep.pth') net = googlenet() net = nn.DataParallel(net) #net.load_state_dict(checkpoint) #net.load_state_dict(torch.load('./model/model_mulstep.pth')) #net = torch.load('./model/model_mulstep.pth') #net = nn.DataParallel(net) net.cuda() # 定义损失函数和优化方式 criterion = nn.CrossEntropyLoss() #损失函数为交叉熵,多用于多分类问题 optimizer = optim.SGD( net.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4) #优化方式为mini-batch momentum-SGD,并采用L2正则化(权重衰减) lr_schedule = lr_scheduler.MultiStepLR(optimizer, [60, 80], 0.1) #lr_schedule = lr_scheduler.ExponentialLR(optimizer,gamma=1) # 训练 if __name__ == "__main__": #if not os.path.exists(args.outf): #os.makedirs(args.outf) best_acc = 0 #2 初始化best test accuracy print("Start Training, googlenet!") # 定义遍历数据集的次数 with open("ci-acc16.txt", "w") as f: with open("ci-log16.txt", "w") as f2: for epoch in range(0, EPOCH): print('\nEpoch: %d' % (epoch + 1)) net.train() sum_loss = 0.0 correct = 0.0
def experiment(args):
    """Run the COVID-19 sensor-image experiment: patient-level split, balanced
    sampling, CNN+STN training and per-epoch testing.

    FIX: `iterator.next()` is Python-2-only syntax; Python 3 iterators expose
    `__next__` and must be advanced with the builtin `next()`. The original
    raised AttributeError on the fixed-sample fetches.
    """
    # load data
    data = pd.read_pickle(os.path.join(args.dataset_root, 'dataset.pkl'))
    data = data[data.sensor.str.contains('|'.join(args.sensors))]  # filter sensors

    # load splits
    splits = pd.read_csv(os.path.join(args.dataset_root, 'train_test_split.csv'))
    train_patients = splits[splits.split.str.contains('train')].patient_hash.tolist()
    test_patients = splits[splits.split.str.contains('test')].patient_hash.tolist()

    # get data according to patient split
    train_data = data[data.patient_hash.str.contains('|'.join(train_patients))]
    test_data = data[data.patient_hash.str.contains('|'.join(test_patients))]

    # subset the dataset
    train_dataset = COVID19Dataset(args, train_data, get_transforms(args, 'train'))
    test_dataset = COVID19Dataset(args, test_data, get_transforms(args, 'test'))

    # For unbalanced dataset we create a weighted sampler
    train_labels = [sum(l) for l in train_data.label.tolist()]
    weights = get_weights_for_balanced_classes(train_labels,
                                               len(list(set(train_labels))))
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights=weights, num_samples=len(weights))
    nclasses = len(list(set(train_labels)))

    # dataloaders from subsets (shuffle must be False when a sampler is given)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=False,
        sampler=sampler, num_workers=args.num_workers, drop_last=True)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False, sampler=None,
        num_workers=args.num_workers, drop_last=False)

    # create directories
    args.weights_dir = os.path.join('logs', args.run_name, 'weights')
    os.makedirs(args.weights_dir, exist_ok=True)
    args.train_viz_dir = os.path.join('logs', args.run_name, 'viz_train')
    os.makedirs(args.train_viz_dir, exist_ok=True)
    args.test_viz_dir = os.path.join('logs', args.run_name, 'viz_test')
    os.makedirs(args.test_viz_dir, exist_ok=True)

    model = CNNConStn(args.img_size, nclasses, args.fixed_scale)
    print(model)
    print('Number of params in the model: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))
    model = model.cuda()

    # fixed samples for stn visualization
    fixed_samples_iter = iter(train_loader)
    fixed_samples_train, fixed_y_train = next(fixed_samples_iter)  # FIX: was .next()
    fixed_samples_iter = iter(test_loader)
    fixed_samples_test, _ = next(fixed_samples_iter)  # FIX: was .next()

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[70],
                                                gamma=0.1)  # 10, 50

    # running bests, updated in-place by test()
    state_dict = {'best_f1': 0., 'precision': 0., 'recall': 0., 'accuracy': 0.}
    for epoch in range(args.epochs):
        model = train(args, model, train_loader, len(list(set(train_labels))),
                      optimizer, epoch, fixed_samples_train, fixed_y_train)
        test(args, model, test_loader, len(list(set(train_labels))), epoch,
             state_dict, args.weights_dir, fixed_samples_test)
        exp_lr_scheduler.step()
def train(opt):
    """Train a scene-text recognition model (CTC / Attention / Bert / SRN heads)
    on either the 'baidu' CSV dataset or a batch-balanced lmdb dataset.

    FIXES:
    * `train_iter.next()` is Python-2-only; under Python 3 it raises
      AttributeError, which the bare `except:` silently caught before the
      recovery path crashed on the same call. Replaced with the builtin
      `next()` and the except narrowed to StopIteration (end of epoch).
    * `step_per_epoch` is a float (len / batch_size), so
      `i % step_per_epoch == 0` was almost never true when batch size does not
      divide the dataset — the LR scheduler could never step. The epoch
      boundary test now uses an integer step count.
    """
    """ dataset preparation """
    if opt.select_data == 'baidu':
        train_set = BAIDUset(opt, opt.train_csv)
        train_loader = torch.utils.data.DataLoader(
            train_set, batch_size=opt.batch_size, shuffle=True,
            num_workers=int(opt.workers),
            collate_fn=BaiduCollate(opt.imgH, opt.imgW, keep_ratio=False))
        val_set = BAIDUset(opt, opt.val_csv)
        valid_loader = torch.utils.data.DataLoader(
            val_set, batch_size=opt.batch_size, shuffle=True,
            num_workers=int(opt.workers),
            collate_fn=BaiduCollate(opt.imgH, opt.imgW, keep_ratio=False),
            pin_memory=True)
    else:
        opt.select_data = opt.select_data.split('-')
        opt.batch_ratio = opt.batch_ratio.split('-')
        train_dataset = Batch_Balanced_Dataset(opt)
        AlignCollate_valid = AlignCollate(imgH=opt.imgH, imgW=opt.imgW,
                                          keep_ratio_with_pad=opt.PAD)
        valid_dataset = hierarchical_dataset(root=opt.valid_data, opt=opt)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=opt.batch_size,
            shuffle=True,  # 'True' to check training progress with validation function.
            num_workers=int(opt.workers),
            collate_fn=AlignCollate_valid, pin_memory=True)
    print('-' * 80)

    """ model configuration """
    # Pick the label converter matching the prediction head.
    if 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    elif 'Bert' in opt.Prediction:
        converter = TransformerConverter(opt.character, opt.max_seq)
    elif 'SRN' in opt.Prediction:
        converter = SRNConverter(opt.character, opt.SRN_PAD)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    model = Model(opt)
    print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial,
          opt.input_channel, opt.output_channel, opt.hidden_size,
          opt.num_class, opt.batch_max_length, opt.Transformation,
          opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction)

    # weight initialization
    for name, param in model.named_parameters():
        if 'localization_fc2' in name:
            print(f'Skip {name} as it is already initialized')
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception:  # for batchnorm (1-D weights reject kaiming init)
            if 'weight' in name:
                param.data.fill_(1)
            continue

    # data parallel for multi-GPU
    model = torch.nn.DataParallel(model).cuda()
    model.train()
    if opt.continue_model != '':
        print(f'loading pretrained model from {opt.continue_model}')
        model.load_state_dict(torch.load(opt.continue_model))
    print("Model:")
    print(model)

    """ setup loss """
    if 'CTC' in opt.Prediction:
        criterion = torch.nn.CTCLoss(zero_infinity=True).cuda()
    elif 'Bert' in opt.Prediction:
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda()
    elif 'SRN' in opt.Prediction:
        criterion = cal_performance
    else:
        criterion = torch.nn.CrossEntropyLoss(
            ignore_index=0).cuda()  # ignore [GO] token = ignore index 0
    # loss averager
    loss_avg = Averager()

    # filter that only require gradient descent
    filtered_parameters = []
    params_num = []
    for p in filter(lambda p: p.requires_grad, model.parameters()):
        filtered_parameters.append(p)
        params_num.append(np.prod(p.size()))
    print('Trainable params num : ', sum(params_num))

    # setup optimizer
    if opt.adam:
        optimizer = optim.Adam(filtered_parameters, lr=opt.lr,
                               betas=(opt.beta1, 0.999))
    elif opt.ranger:
        optimizer = Ranger(filtered_parameters, lr=opt.lr)
    else:
        optimizer = optim.Adadelta(filtered_parameters, lr=opt.lr,
                                   rho=opt.rho, eps=opt.eps)
    print("Optimizer:")
    print(optimizer)
    # halve the learning rate at epochs 5, 20 and 30
    lrScheduler = lr_scheduler.MultiStepLR(optimizer, [5, 20, 30], gamma=0.5)

    """ final options """
    with open(f'./saved_models/{opt.experiment_name}/opt.txt', 'a') as opt_file:
        opt_log = '------------ Options -------------\n'
        args = vars(opt)
        for k, v in args.items():
            opt_log += f'{str(k)}: {str(v)}\n'
        opt_log += '---------------------------------------\n'
        print(opt_log)
        opt_file.write(opt_log)

    """ start training """
    start_iter = 0
    if opt.continue_model != '':
        # iteration count is encoded in the checkpoint filename
        start_iter = int(opt.continue_model.split('_')[-1].split('.')[0])
        print(f'continue to train, start_iter: {start_iter}')
    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = 1e+6
    i = start_iter

    if opt.select_data == 'baidu':
        train_iter = iter(train_loader)
        step_per_epoch = len(train_set) / opt.batch_size
        print('一代有多少step:', step_per_epoch)
    else:
        step_per_epoch = train_dataset.nums_samples / opt.batch_size
        print('一代有多少step:', step_per_epoch)
    # FIX: integer step count for the epoch-boundary test below.
    steps_per_epoch = max(1, int(step_per_epoch))

    while (True):
        # train part
        for p in model.parameters():
            p.requires_grad = True
        if opt.select_data == 'baidu':
            try:
                image_tensors, labels = next(train_iter)  # FIX: was .next()
            except StopIteration:  # FIX: was a bare except masking real errors
                train_iter = iter(train_loader)
                image_tensors, labels = next(train_iter)
        else:
            image_tensors, labels = train_dataset.get_batch()
        image = image_tensors.cuda()
        text, length = converter.encode(labels)
        batch_size = image.size(0)

        # head-specific loss computation
        if 'CTC' in opt.Prediction:
            preds = model(image, text).log_softmax(2)
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCLoss format
            cost = criterion(preds, text, preds_size, length)
        elif 'Bert' in opt.Prediction:
            pad_mask = None
            preds = model(image, pad_mask)
            cost = criterion(preds[0].view(-1, preds[0].shape[-1]),
                             text.contiguous().view(-1)) + \
                   criterion(preds[1].view(-1, preds[1].shape[-1]),
                             text.contiguous().view(-1))
        elif 'SRN' in opt.Prediction:
            preds = model(image, None)
            cost, n_correct = criterion(preds, text)
        else:
            preds = model(image, text[:, :-1])  # align with Attention.forward
            target = text[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

        model.zero_grad()
        cost.backward()
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), opt.grad_clip)  # gradient clipping with 5 (Default)
        optimizer.step()
        loss_avg.add(cost)

        if i % opt.disInterval == 0:
            elapsed_time = time.time() - start_time
            print(
                f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}'
            )
            start_time = time.time()

        # validation part
        if i % opt.valInterval == 0 and i > start_iter:
            elapsed_time = time.time() - start_time
            print(
                f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}'
            )
            # for log
            with open(f'./saved_models/{opt.experiment_name}/log_train.txt',
                      'a') as log:
                log.write(
                    f'[{i}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n'
                )
                loss_avg.reset()
                model.eval()
                valid_loss, current_accuracy, current_norm_ED, preds, labels, infer_time, length_of_data = validation(
                    model, criterion, valid_loader, converter, opt)
                model.train()

                # show a few predictions next to their ground truths
                for pred, gt in zip(preds[:5], labels[:5]):
                    if 'Attn' in opt.Prediction:
                        pred = pred[:pred.find('[s]')]
                        gt = gt[:gt.find('[s]')]
                    print(
                        f'pred: {pred:20s}, gt: {gt:20s}, {str(pred == gt)}')
                    log.write(
                        f'pred: {pred:20s}, gt: {gt:20s}, {str(pred == gt)}\n'
                    )
                valid_log = f'[{i}/{opt.num_iter}] valid loss: {valid_loss:0.5f}'
                valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}'
                print(valid_log)
                log.write(valid_log + '\n')

                # keep best accuracy model
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    torch.save(
                        model.state_dict(),
                        f'./saved_models/{opt.experiment_name}/best_accuracy.pth'
                    )
                if current_norm_ED < best_norm_ED:
                    best_norm_ED = current_norm_ED
                    torch.save(
                        model.state_dict(),
                        f'./saved_models/{opt.experiment_name}/best_norm_ED.pth'
                    )
                best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}'
                print(best_model_log)
                log.write(best_model_log + '\n')

        # save model per saveInterval iterations
        if (i + 1) % opt.saveInterval == 0:
            torch.save(model.state_dict(),
                       f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')

        if i == opt.num_iter:
            print('end the training')
            sys.exit()

        # step the LR scheduler once per (approximate) epoch
        if i > 0 and i % steps_per_epoch == 0:  # FIX: was `i % step_per_epoch` (float)
            lrScheduler.step()
        i += 1
# Resume from the newest checkpoint in save_root.
# Checkpoint files sort lexicographically; "optimizer.pt" is removed first so
# the last entry is the latest model weight file.
model_weights = os.listdir(save_root)
model_weights.remove("optimizer.pt")
model_weights.sort()
last_model_name = model_weights[-1]
model.load_state_dict(torch.load(PJ(save_root, last_model_name)))
optimizer.load_state_dict(torch.load(PJ(save_root, "optimizer.pt")))
print(f"Loading model {last_model_name} successed!")
print(f"Loading optimizer successed!")
# NOTE(review): assumes the filename layout "<4-digit epoch>...<8-digit
# iteration>.pt" — confirm against the saving code.
iteration, last_epoch = int(last_model_name[-11:-3]), int(
    last_model_name[:4])
# When not resuming, start from scratch regardless of what was loaded above.
iteration, last_epoch = (iteration, last_epoch) if resume else (0, 1)

# Learning rate decay scheduler
# NOTE(review): last_epoch - 2 compensates for the scheduler.step() call at the
# top of each epoch below (pre-1.1 ordering) — verify off-by-one on resume.
scheduler = lr_scheduler.MultiStepLR(optimizer, config["step_size"],
                                     config["gamma"],
                                     last_epoch=last_epoch - 2)

########################################
# Start training
########################################
print("\n> Training")
for epoch in range(last_epoch, config["max_epoch"] + 1):
    # scheduler step in each epoch
    scheduler.step()
    for it, (labels, images) in enumerate(trainloader):
        optimizer.zero_grad()
        # Drop images and labels into GPU
        images = images.cuda().detach()
        labels = labels.cuda().detach()
model = NormalizedModel(model=m, mean=image_mean, std=image_std).to( DEVICE) # keep images in the [0, 1] range if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.adv == 0: scheduler = lr_scheduler.StepLR(optimizer, step_size=args.lr_step, gamma=args.lr_decay) else: scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[60, 120, 160], gamma=0.2) attacker = DDN(steps=args.steps, device=DEVICE) max_loss = torch.log(torch.tensor(10.)).item() # for callback best_acc = 0 best_epoch = 0 for epoch in range(args.epochs): scheduler.step() cudnn.benchmark = True model.train() requires_grad_(m, True) accs = AverageMeter() losses = AverageMeter()
def train(cfg):
    """Build data loaders, model, optimizer and scheduler from ``cfg``, then train.

    Only ``cfg.MODEL.IF_WITH_CENTER == 'no'`` is supported by this entry point;
    any other value is reported and nothing is trained.  The heavy lifting is
    delegated to the project helpers ``make_data_loader`` / ``build_model`` /
    ``do_train``.
    """
    # Prepare dataset.
    train_loader, val_loader, num_query, num_classes, num_classes2, image_map_label2 = make_data_loader(
        cfg)

    # Prepare model.
    model = build_model(cfg, num_classes, num_classes2)

    # Optionally resume / distill weights from a second pretrained checkpoint.
    # A path shorter than 6 characters is treated as "not configured".
    if len(cfg.MODEL.PRETRAIN_PATH2) > 5:
        print('--- resume from ', cfg.MODEL.PRETRAIN_PATH2)
        if cfg.MODEL.ONCE_LOAD == 'yes':
            # Load the full state dict in one shot, mapped onto CPU.
            print('\n---ONCE_LOAD...\n')
            model.load_state_dict(
                torch.load(cfg.MODEL.PRETRAIN_PATH2,
                           map_location=lambda storage, loc: storage))
        else:
            # Partial / distillation-style loading handled by a project helper.
            functions.load_state_dict_distill(model, cfg.MODEL.PRETRAIN_PATH2,
                                              cfg.MODEL.ONLY_BASE,
                                              cfg.MODEL.WITHOUT_FC)
        print('**** Successfully load ', cfg.MODEL.PRETRAIN_PATH2)

    if cfg.MODEL.FREEZE_BASE:
        # Freeze the global (base) branch of the model.
        functions.freeze_global_model(model, False)

    if cfg.MODEL.IF_WITH_CENTER == 'no':
        print('Train without center loss, the loss type is',
              cfg.MODEL.METRIC_LOSS_TYPE)

        if cfg.SOLVER.MY_OPTIMIZER == "yes":
            # Custom SGD: backbone ('base') parameters train at LR/10,
            # everything else at the full LR.
            print('---* my optimizer:', cfg.SOLVER.MY_OPTIMIZER_NAME)
            other_params = [
                p for n, p in model.named_parameters()
                if not n.startswith('base')
            ]
            optimizer = optim.SGD([{
                'params': model.base.parameters(),
                'lr': cfg.SOLVER.LR / 10
            }, {
                'params': other_params,
                'lr': cfg.SOLVER.LR
            }],
                                  momentum=0.9,
                                  weight_decay=5e-4,
                                  nesterov=True)
        else:
            print('---* not my optimizer')
            optimizer = make_optimizer(cfg, model)

        loss_func = make_loss(cfg, num_classes)  # modified by gu

        # Add for using self-trained model.
        if cfg.MODEL.PRETRAIN_CHOICE == 'self':
            # The resume epoch is encoded in the checkpoint filename as
            # ``..._<epoch>.pth``.  FIX: the original used eval() on this
            # filename fragment; int() parses the same values safely.
            start_epoch = int(
                cfg.MODEL.PRETRAIN_PATH.split('/')[-1].split('.')[0].split('_')
                [-1])
            print('Start epoch:', start_epoch)
            path_to_optimizer = cfg.MODEL.PRETRAIN_PATH.replace(
                'model', 'optimizer')
            print('Path to the checkpoint of optimizer:', path_to_optimizer)
            model.load_state_dict(torch.load(cfg.MODEL.PRETRAIN_PATH))
            optimizer.load_state_dict(torch.load(path_to_optimizer))
            scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                          cfg.SOLVER.GAMMA,
                                          cfg.SOLVER.WARMUP_FACTOR,
                                          cfg.SOLVER.WARMUP_ITERS,
                                          cfg.SOLVER.WARMUP_METHOD,
                                          start_epoch)
        elif cfg.MODEL.PRETRAIN_CHOICE == 'imagenet':
            start_epoch = 0
            if cfg.SOLVER.MY_SCHEDULER == "yes":
                print('cfg.SOLVER.MY_SCHEDULER_STEP:',
                      cfg.SOLVER.MY_SCHEDULER_STEP)
                print('---* my scheduler: ', cfg.SOLVER.MY_SCHEDULER_NAME)
                if cfg.SOLVER.MY_SCHEDULER_NAME == 'SL':
                    scheduler = lr_scheduler.StepLR(
                        optimizer,
                        step_size=cfg.SOLVER.MY_SCHEDULER_STEP[0],
                        gamma=0.1)
                elif cfg.SOLVER.MY_SCHEDULER_NAME == 'MSL':
                    scheduler = lr_scheduler.MultiStepLR(
                        optimizer, cfg.SOLVER.MY_SCHEDULER_STEP, gamma=0.1)
                else:
                    print(cfg.SOLVER.MY_SCHEDULER_NAME, ' not found!')
                    # FIX: original called eixt(0) — a NameError.  Exit
                    # cleanly instead (SystemExit needs no import).
                    raise SystemExit(0)
            else:
                print('---* not my scheduler')
                scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                              cfg.SOLVER.GAMMA,
                                              cfg.SOLVER.WARMUP_FACTOR,
                                              cfg.SOLVER.WARMUP_ITERS,
                                              cfg.SOLVER.WARMUP_METHOD)
        else:
            # NOTE: start_epoch / scheduler remain undefined on this path and
            # do_train below would raise NameError — pre-existing behavior.
            print(
                'Only support pretrain_choice for imagenet and self, but got {}'
                .format(cfg.MODEL.PRETRAIN_CHOICE))

        arguments = {}

        print('************ do_train')
        do_train(
            cfg,
            model,
            train_loader,
            val_loader,
            optimizer,
            scheduler,  # modify for using self trained model
            loss_func,
            num_query,
            start_epoch,  # add for using self trained model
            image_map_label2,
            num_classes2)
    # NOTE: the center-loss branch (IF_WITH_CENTER == 'yes') was commented out
    # upstream; see make_loss_with_center / do_train_with_center to revive it.
    else:
        print(
            "Unsupported value for cfg.MODEL.IF_WITH_CENTER {}, only support yes or no!\n"
            .format(cfg.MODEL.IF_WITH_CENTER))
def train(self, train_queue, val_queue=None):
    """Given data queues, train the network.

    Pulls (image, voxel) mini-batches from ``train_queue``, applies one
    gradient step per iteration via ``self.train_loss``, and periodically
    prints status, validates on ``val_queue``, checks for NaN parameters,
    and saves checkpoints — all at frequencies taken from the global ``cfg``.
    """
    # Parameter directory
    save_dir = os.path.join(cfg.DIR.OUT_PATH)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Timer for the training op and parallel data loading op.
    train_timer = Timer()
    data_timer = Timer()
    training_losses = []

    # Setup learning rates: the keys of cfg.TRAIN.LEARNING_RATES are the
    # iteration numbers at which the rate changes.
    lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

    # Setup the lr_scheduler (decays by 0.1 at each milestone iteration).
    self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer,
                                                 lr_steps,
                                                 gamma=0.1)

    start_iter = 0
    # Resume training
    if cfg.TRAIN.RESUME_TRAIN:
        self.load(cfg.CONST.WEIGHTS)
        start_iter = cfg.TRAIN.INITIAL_ITERATION

    # Main training loop
    for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
        # NOTE(review): stepping the scheduler at the top of each iteration is
        # the legacy (pre-1.1) PyTorch ordering this code was written for.
        self.lr_scheduler.step()

        data_timer.tic()
        batch_img, batch_voxel = train_queue.get()
        data_timer.toc()

        if self.net.is_x_tensor4:
            batch_img = batch_img[0]

        # Apply one gradient step
        train_timer.tic()
        loss = self.train_loss(batch_img, batch_voxel)
        train_timer.toc()

        # loss.data[0]: 0-dim tensor access in pre-0.4 PyTorch style.
        training_losses.append(loss.data[0])

        # Decrease learning rate at certain points
        if train_ind in lr_steps:
            # for pytorch optimizer, learning rate can only be set when the
            # optimizer is created or using torch.optim.lr_scheduler
            print('Learing rate decreased to %f: ' %
                  cfg.TRAIN.LEARNING_RATES[str(train_ind)])

        # Debugging modules
        #
        # Print status, run validation, check divergence, and save model.
        if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
            # Print the current loss
            print('%s Iter: %d Loss: %f' % (datetime.now(), train_ind, loss))

        if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None:
            # Print test loss and params to check convergence every N iterations
            val_losses = 0
            for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS):
                batch_img, batch_voxel = val_queue.get()
                val_loss = self.train_loss(batch_img, batch_voxel)
                val_losses += val_loss
            var_losses_mean = val_losses / cfg.TRAIN.NUM_VALIDATION_ITERATIONS
            print('%s Test loss: %f' % (datetime.now(), var_losses_mean))

        if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
            # Check that the network parameters are all valid
            nan_or_max_param = max_or_nan(self.net.parameters())
            if has_nan(nan_or_max_param):
                print('NAN detected')
                break

        if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
            self.save(training_losses, save_dir, train_ind)

        # loss is a Variable containing torch.FloatTensor of size 1;
        # abort when the cost diverges past the configured ceiling.
        if loss.data[0] > cfg.TRAIN.LOSS_LIMIT:
            print("Cost exceeds the threshold. Stop training")
            break
) base_params = filter(lambda p: id(p) not in ignored_params, model.parameters()) optimizer_ft = optim.SGD([ {'params': base_params, 'lr': 0.1*opt.lr}, {'params': model.classifier0.parameters(), 'lr': opt.lr}, {'params': model.classifier1.parameters(), 'lr': opt.lr}, {'params': model.classifier2.parameters(), 'lr': opt.lr}, {'params': model.classifier3.parameters(), 'lr': opt.lr}, ], weight_decay=5e-4, momentum=0.9, nesterov=True) if opt.adam: optimizer_ft = optim.Adam(model.parameters(), opt.lr, weight_decay=5e-4) # Decay LR by a factor of 0.1 every 40 epochs #exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=40, gamma=0.1) exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft, milestones=[60-start_epoch, 75-start_epoch], gamma=0.1) ###################################################################### # Train and evaluate # ^^^^^^^^^^^^^^^^^^ # # It should take around 1-2 hours on GPU. # dir_name = os.path.join('./data/outputs',name) if not opt.resume: if not os.path.isdir(dir_name): os.mkdir(dir_name) #record every run copyfile('./train.py', dir_name+'/train.py') copyfile('./model.py', dir_name+'/model.py')
num_params = sum(p.numel() for p in net.parameters() if p.requires_grad) print('The number of parameters of model is', num_params) if args.resume is not None: checkpoint = torch.load('./save_model/' + args.resume) net.load_state_dict(checkpoint['net']) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) decay_epoch = [150, 225] scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=decay_epoch, gamma=0.1) writer = SummaryWriter(args.logdir) def train(epoch, global_steps): net.train() train_loss = 0 correct = 0 total = 0 for batch_idx, (inputs, targets) in enumerate(train_loader): global_steps += 1 inputs = inputs.to(device)
def train():
    """Train a Darknet/YOLO model according to the global ``opt`` and ``hyp``.

    Builds the model, optimizer and MultiStepLR schedule, optionally resumes
    from a ``.pt``/darknet checkpoint, runs the epoch/batch training loop with
    gradient accumulation, optional multi-scale resizing, mixed precision and
    DDP, evaluates after each epoch, and checkpoints to ``last``/``best``.

    Returns the final ``results`` tuple:
    (P, R, mAP, F1, val GIoU, val Objectness, val Classification).
    """
    cfg = opt.cfg
    data = opt.data
    img_size = opt.img_size
    # prebias runs a single warm-up epoch on the yolo-layer biases only.
    epochs = 1 if opt.prebias else opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights

    if 'pw' not in opt.arc:  # remove BCELoss positive weights
        hyp['cls_pw'] = 1.
        hyp['obj_pw'] = 1.

    # Initialize
    init_seeds()
    if opt.multi_scale:
        img_sz_min = round(img_size / 32 / 1.5)
        img_sz_max = round(img_size / 32 * 1.5)
        img_size = img_sz_max * 32  # initiate with maximum multi_scale size
        print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size))

    # Configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = int(data_dict['classes'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg, arc=opt.arc).to(device)

    # Optimizer: conv weights get weight decay (pg1), everything else does not.
    pg0, pg1 = [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if 'Conv2d.weight' in k:
            pg1 += [v]  # parameter group 1 (apply weight_decay)
        else:
            pg0 += [v]  # parameter group 0

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    del pg0, pg1

    # https://github.com/alphadl/lookahead.pytorch
    # optimizer = torch_utils.Lookahead(optimizer, k=5, alpha=0.5)

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    # Fitness here is a summed validation loss, so LOWER is better — hence inf.
    best_fitness = float('inf')
    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        chkpt = torch.load(weights, map_location=device)

        # load model — keep only tensors whose element count matches this cfg.
        try:
            chkpt['model'] = {
                k: v
                for k, v in chkpt['model'].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(chkpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        start_epoch = chkpt['epoch'] + 1
        del chkpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc.
        cutoff = load_darknet_weights(model, weights)

    if opt.transfer or opt.prebias:  # transfer learning edge (yolo) layers
        nf = int(model.module_defs[model.yolo_layers[0] - 1]
                 ['filters'])  # yolo layer size (i.e. 255)

        if opt.prebias:
            for p in optimizer.param_groups:
                # lower param count allows more aggressive training settings:
                # i.e. SGD ~0.1 lr0, ~0.9 momentum
                p['lr'] *= 100  # lr gain
                if p.get('momentum') is not None:  # for SGD but not Adam
                    p['momentum'] *= 0.9

        # Freeze everything except yolo-layer params (matched by size/shape).
        for p in model.parameters():
            if opt.prebias and p.numel() == nf:  # train (yolo biases)
                p.requires_grad = True
            elif opt.transfer and p.shape[0] == nf:  # train (yolo biases+weights)
                p.requires_grad = True
            else:  # freeze layer
                p.requires_grad = False

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8)  # gradual fall to 0.1*lr0
    # Decay LR by 10x at 80% and 90% of the full training schedule.
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[round(opt.epochs * x) for x in [0.8, 0.9]],
        gamma=0.1)
    scheduler.last_epoch = start_epoch - 1

    # # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Initialize distributed training
    if device.type != 'cpu' and torch.cuda.device_count() > 1:
        dist.init_process_group(
            backend='nccl',  # 'distributed backend'
            init_method='tcp://127.0.0.1:9999',  # distributed training init method
            world_size=1,  # number of nodes for distributed training
            rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(
        train_path,
        img_size,
        batch_size,
        augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        image_weights=opt.img_weights,
        cache_labels=epochs > 10,
        cache_images=opt.cache_images and not opt.prebias)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Test Dataloader
    if not opt.prebias:
        testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(
            test_path,
            img_size,
            batch_size,
            hyp=hyp,
            rect=True,
            cache_labels=True,
            cache_images=opt.cache_images),
                                                 batch_size=batch_size,
                                                 num_workers=nw,
                                                 pin_memory=True,
                                                 collate_fn=dataset.collate_fn)

    # Start training
    nb = len(dataloader)
    model.nc = nc  # attach number of classes to model
    model.arc = opt.arc  # attach yolo architecture
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    torch_utils.model_info(model, report='summary')  # 'full' or 'summary'
    print('Using %g dataloader workers' % nw)
    print('Starting %s for %g epochs...' %
          ('prebias' if opt.prebias else 'training', epochs))

    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()
        print(('\n' + '%10s' * 8) %
              ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets',
               'img_size'))

        # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional)
        freeze_backbone = False
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # Update image weights (optional): oversample images containing
        # classes with currently-low mAP.
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 - maps)**2  # class weights
            image_weights = labels_to_image_weights(dataset.labels,
                                                    nc=nc,
                                                    class_weights=w)
            dataset.indices = random.choices(range(dataset.n),
                                             weights=image_weights,
                                             k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Multi-Scale training
            if opt.multi_scale:
                if ni / accumulate % 10 == 0:  # adjust (67% - 150%) every 10 batches
                    img_size = random.randrange(img_sz_min,
                                                img_sz_max + 1) * 32
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [
                        math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]
                    ]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Plot images with bounding boxes
            if ni == 0:
                fname = 'train_batch%g.jpg' % i
                plot_images(imgs=imgs,
                            targets=targets,
                            paths=paths,
                            fname=fname)
                if tb_writer:
                    tb_writer.add_image(fname,
                                        cv2.imread(fname)[:, :, ::-1],
                                        dataformats='HWC')

            # Hyperparameter burn-in
            # n_burn = nb - 1  # min(nb // 5 + 1, 1000)  # number of burn-in batches
            # if ni <= n_burn:
            #     for m in model.named_modules():
            #         if m[0].endswith('BatchNorm2d'):
            #             m[1].momentum = 1 - i / n_burn * 0.99  # BatchNorm2d momentum falls from 1 - 0.01
            #     g = (i / n_burn) ** 4  # gain rises from 0 - 1
            #     for x in optimizer.param_groups:
            #         x['lr'] = hyp['lr0'] * g
            #         x['weight_decay'] = hyp['weight_decay'] * g

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available(
            ) else 0  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' %
                                               (epoch, epochs - 1), '%.3gG' %
                                               mem, *mloss, len(targets),
                                               img_size)
            pbar.set_description(s)

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        final_epoch = epoch + 1 == epochs
        if opt.prebias:
            print_model_biases(model)
        else:
            # Calculate mAP
            if not opt.notest or final_epoch:
                with torch.no_grad():
                    results, maps = test.test(
                        cfg,
                        data,
                        batch_size=batch_size,
                        img_size=opt.img_size,
                        model=model,
                        conf_thres=0.001 if final_epoch and epoch > 0 else
                        0.1,  # 0.1 for speed
                        save_json=final_epoch and epoch > 0
                        and 'coco.data' in data,
                        dataloader=testloader)

        # Write epoch results
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results +
                    '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
        if len(opt.name) and opt.bucket and not opt.prebias:
            os.system('gsutil cp results.txt gs://%s/results%s.txt' %
                      (opt.bucket, opt.name))

        # Write Tensorboard results
        if tb_writer:
            x = list(mloss) + list(results)
            titles = [
                'GIoU', 'Objectness', 'Classification', 'Train loss',
                'Precision', 'Recall', 'mAP', 'F1', 'val GIoU',
                'val Objectness', 'val Classification'
            ]
            for xi, title in zip(x, titles):
                tb_writer.add_scalar(title, xi, epoch)

        # Update best mAP (here "fitness" is summed validation loss; smaller wins)
        fitness = sum(results[4:])  # total loss
        if fitness < best_fitness:
            best_fitness = fitness

        # Save training results
        save = (not opt.nosave) or (final_epoch
                                    and not opt.evolve) or opt.prebias
        if save:
            with open(results_file, 'r') as f:
                # Create checkpoint
                chkpt = {
                    'epoch':
                    epoch,
                    'best_fitness':
                    best_fitness,
                    'training_results':
                    f.read(),
                    'model':
                    model.module.state_dict()
                    if type(model) is nn.parallel.DistributedDataParallel else
                    model.state_dict(),
                    'optimizer':
                    None if final_epoch else optimizer.state_dict()
                }

            # Save last checkpoint
            torch.save(chkpt, last)

            # Save best checkpoint
            if best_fitness == fitness:
                torch.save(chkpt, best)

            # # Save backup every 10 epochs (optional)
            # if epoch > 0 and epoch % 10 == 0:
            #     torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

        # end epoch ----------------------------------------------------------------------------------------------------

    # end training
    if len(opt.name) and not opt.prebias:
        fresults, flast, fbest = 'results%s.txt' % opt.name, 'last%s.pt' % opt.name, 'best%s.pt' % opt.name
        os.rename('results.txt', fresults)
        os.rename(wdir + 'last.pt', wdir +
                  flast) if os.path.exists(wdir + 'last.pt') else None
        os.rename(wdir + 'best.pt', wdir +
                  fbest) if os.path.exists(wdir + 'best.pt') else None

        # save to cloud
        if opt.bucket:
            os.system('gsutil cp %s %s gs://%s' %
                      (fresults, wdir + flast, opt.bucket))

    plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' %
          (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
def main(arg_seed, arg_timestamp):
    """Train a VGG19-BN classifier on Animal-10 with periodic label correction.

    Seeds all RNGs for reproducibility, trains for ``args.epoch`` epochs, and
    after ``args.warm_up`` epochs applies likelihood-ratio-test label
    correction (``lrt_correction``) using a rolling window of recorded
    softmax outputs.  Progress is printed and appended to a log file.

    :param arg_seed: RNG seed applied to numpy / random / torch (CPU and CUDA).
    :param arg_timestamp: if truthy, embed a timestamp in the log filename.
    """
    random_seed = arg_seed
    np.random.seed(random_seed)
    random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True  # need to set to True as well

    print('Random Seed {}\n'.format(arg_seed))

    # -- training parameters
    num_epoch = args.epoch
    milestone = [50, 75]  # LR decay epochs
    batch_size = args.batch
    num_workers = 2
    weight_decay = 1e-3
    gamma = 0.2  # LR decay factor
    current_delta = args.delta  # label-correction threshold, adapted over time
    lr = args.lr
    start_epoch = 0

    # -- specify dataset
    # data augmentation
    transform_train = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    trainset = Animal10(split='train', transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=num_workers,
                                              worker_init_fn=_init_fn,
                                              drop_last=True)

    testset = Animal10(split='test', transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=batch_size * 4,
                                             shuffle=False,
                                             num_workers=num_workers)

    num_class = 10

    print('train data size:', len(trainset))
    print('test data size:', len(testset))

    # -- create log file
    if arg_timestamp:
        time_stamp = time.strftime("%Y%m%d-%H%M%S")
        file_name = 'Ours(' + time_stamp + ').txt'
    else:
        file_name = 'Ours.txt'
    log_dir = check_folder('logs')
    file_name = os.path.join(log_dir, file_name)
    saver = open(file_name, "w")

    saver.write(args.__repr__() + "\n\n")
    saver.flush()

    # -- set network, optimizer, scheduler, etc
    net = vgg19_bn(num_classes=num_class, pretrained=False)
    net = nn.DataParallel(net)

    optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=weight_decay)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = net.to(device)
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer,
                                                milestones=milestone,
                                                gamma=gamma)
    criterion = torch.nn.CrossEntropyLoss()

    # -- misc
    iterations = 0
    # Rolling record of per-sample softmax outputs over the last
    # args.rollWindow epochs; averaged later to drive label correction.
    f_record = torch.zeros([args.rollWindow, len(trainset), num_class])

    for epoch in range(start_epoch, num_epoch):
        train_correct = 0
        train_loss = 0
        train_total = 0

        net.train()
        for i, (images, labels, indices) in enumerate(trainloader):
            if images.size(0) == 1:  # when batch size equals 1, skip, due to batch normalization
                continue

            images, labels = images.to(device), labels.to(device)

            outputs = net(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_total += images.size(0)
            _, predicted = outputs.max(1)
            train_correct += predicted.eq(labels).sum().item()

            # Record this batch's softmax outputs in the rolling window slot.
            f_record[epoch % args.rollWindow, indices] = F.softmax(
                outputs.detach().cpu(), dim=1)

            iterations += 1
            if iterations % 100 == 0:
                cur_train_acc = train_correct / train_total * 100.
                cur_train_loss = train_loss / train_total
                cprint(
                    'epoch: {}\titerations: {}\tcurrent train accuracy: {:.4f}\ttrain loss:{:.4f}'.format(
                        epoch, iterations, cur_train_acc, cur_train_loss),
                    'yellow')

                if iterations % 5000 == 0:
                    saver.write(
                        'epoch: {}\titerations: {}\ttrain accuracy: {}\ttrain loss: {}\n'.format(
                            epoch, iterations, cur_train_acc, cur_train_loss))
                    saver.flush()

        train_acc = train_correct / train_total * 100.

        cprint('epoch: {}'.format(epoch), 'yellow')
        # NOTE(review): train_loss here is the accumulated sum for the epoch,
        # not an average.
        cprint(
            'train accuracy: {:.4f}\ntrain loss: {:.4f}'.format(
                train_acc, train_loss), 'yellow')
        saver.write('epoch: {}\ntrain accuracy: {}\ntrain loss: {}\n'.format(
            epoch, train_acc, train_loss))
        saver.flush()

        exp_lr_scheduler.step()

        # After warm-up, correct suspected noisy labels from the averaged
        # rolling softmax record.
        if epoch >= args.warm_up:
            f_x = f_record.mean(0)
            y_tilde = trainset.targets
            y_corrected, current_delta = lrt_correction(
                y_tilde, f_x, current_delta=current_delta, delta_increment=0.1)
            logging.info('Current delta:\t{}\n'.format(current_delta))
            trainset.update_corrupted_label(y_corrected)

        # testing
        net.eval()
        test_total = 0
        test_correct = 0
        with torch.no_grad():
            for i, (images, labels, _) in enumerate(testloader):
                images, labels = images.to(device), labels.to(device)
                outputs = net(images)
                test_total += images.size(0)
                _, predicted = outputs.max(1)
                test_correct += predicted.eq(labels).sum().item()
        test_acc = test_correct / test_total * 100.

        cprint('>> current test accuracy: {:.4f}'.format(test_acc), 'cyan')
        saver.write('>> current test accuracy: {}\n'.format(test_acc))
        saver.flush()

    saver.close()
def main():
    """Entry point: train or evaluate a person re-identification model.

    Reads everything from the global ``args``: builds the dataset and three
    dataloaders (train / query / gallery), the model, the xent + triplet +
    mask losses, optimizer and MultiStepLR scheduler; optionally resumes from
    a checkpoint; then either evaluates only (``args.evaluate``) or trains,
    testing every ``args.eval_step`` epochs and checkpointing the best Rank-1.
    All console output is redirected into a log file.
    """
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()

    # Tee stdout to a log file (append mode when only evaluating).
    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_imgreid_dataset(
        name=args.dataset,
        split_id=args.split_id,
        cuhk03_labeled=args.cuhk03_labeled,
        cuhk03_classic_split=args.cuhk03_classic_split,
    )

    # Train-time augmentation (project transform module ST).
    transform_train = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ST.RandomErasing(0.5),
    ])

    transform_test = T.Compose([
        T.Resize((args.height, args.width), interpolation=3),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    pin_memory = True if use_gpu else False

    # Identity-balanced sampling for the triplet loss.
    trainloader = DataLoader(
        ImageDataset_seg(dataset.train, transform=transform_train),
        sampler=RandomIdentitySampler(dataset.train,
                                      num_instances=args.num_instances),
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    queryloader = DataLoader(
        ImageDataset(dataset.query, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    galleryloader = DataLoader(
        ImageDataset(dataset.gallery, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids)
    print(model)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    criterion_xent = CrossEntropyLabelSmooth(
        num_classes=dataset.num_train_pids, use_gpu=use_gpu)
    criterion_htri = TripletLoss(margin=args.margin, distance=args.distance)
    criterion_mask = MaskLoss(mode=args.mode)

    optimizer = init_optim(args.optim, model.parameters(), args.lr,
                           args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)

    # Optionally resume model weights / start epoch from a checkpoint.
    if args.resume:
        if check_isfile(args.resume):
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            args.start_epoch = checkpoint['epoch']
            rank1 = checkpoint['rank1']
            print("Loaded checkpoint from '{}'".format(args.resume))
            print("- start_epoch: {}\n- rank1: {}".format(
                args.start_epoch, rank1))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        test(model, queryloader, galleryloader, use_gpu)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(args.start_epoch, args.max_epoch):
        # NOTE(review): scheduler stepped before the epoch — legacy (pre-1.1)
        # PyTorch ordering this code was written for.
        scheduler.step()
        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, criterion_mask,
              optimizer, trainloader)
        train_time += round(time.time() - start_train_time)

        if (epoch + 1) % args.eval_step == 0 or epoch == 0:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
    print("==========\nArgs:{}\n==========".format(args))
def main():
    """Train a semantic-segmentation model driven by a JSON config file.

    Reads the config path from the CLI arguments, builds the model,
    optimizer and (optionally) LR scheduler, then runs the
    train / validate / test loop, logging to TensorBoard and a CSV file
    and checkpointing the model with the best validation IoU.

    Bug fix vs. previous revision: validation now runs *before* the
    scheduler step, so the ReduceLROnPlateau branch no longer reads
    ``val_log`` before it is assigned (a NameError on the first epoch)
    and steps on the current epoch's validation loss as intended.
    """
    args = vars(parse_args_func())
    config_file = args['config']  # e.g. "../configs/config_v1.json"
    config_dict = json.loads(open(config_file, 'rt').read())
    file_dict = config_dict['file_path']
    config = config_dict['opt_config']
    input_folder = file_dict['input_path']            # e.g. '../inputs'
    checkpoint_folder = file_dict['checkpoint_path']  # e.g. '../checkpoint'
    model_folder = file_dict['model_path']            # e.g. '../models'

    # Booleans arrive as *strings* in the JSON config; normalize them here.
    config['deep_supervision'] = 'False' not in config['deep_supervision']
    config['nesterov'] = 'False' not in config['nesterov']
    if 'None' in config['name']:
        config['name'] = None
    if config['name'] is None:
        config['name'] = '%s_%s_segmodel' % (config['dataset'], config['arch'])

    os.makedirs(os.path.join(model_folder, '%s' % config['name']),
                exist_ok=True)
    if not os.path.isdir(checkpoint_folder):
        os.mkdir(checkpoint_folder)
    log_name = config['name']
    log_dir = os.path.join(checkpoint_folder, log_name)
    writer = SummaryWriter(logdir=log_dir)

    print('-' * 20)
    for key in config:
        print('%s: %s' % (key, config[key]))
    print('-' * 20)

    # Persist the resolved config next to the model for reproducibility.
    with open(os.path.join(model_folder, '%s/config.yml' % config['name']),
              'w') as f:
        yaml.dump(config, f)

    # Loss function (criterion).
    if config['loss'] == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[config['loss']]().cuda()

    cudnn.benchmark = True

    # Create model.
    print("=> creating model %s" % config['arch'])
    model = archs.__dict__[config['arch']](config['num_classes'],
                                           config['input_channels'],
                                           config['deep_supervision'])

    config['resume'] = 'False' not in config['resume']
    # NOTE(review): resume is hard-disabled here (resume_flag is always
    # False, ignoring config['resume']); branch kept for behavioral parity.
    resume_flag = False
    if resume_flag:
        save_path = os.path.join(model_folder, config['name'], 'model.pth')
        weights = torch.load(save_path)
        model.load_state_dict(weights)
        name_yaml = config['name']
        with open(os.path.join(model_folder, '%s/config.yml' % name_yaml),
                  'r') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        start_epoch = 0
    else:
        start_epoch = 0

    model = model.cuda()

    # EfficientNet encoders train their CNN trunk with a separate optimizer.
    eff_flag = 'effnet' in config['arch']
    if eff_flag:
        cnn_subs = list(model.encoder.eff_conv.children())[1:]

    summary(model,
            (config['input_channels'], config['input_w'], config['input_h']))

    params = filter(lambda p: p.requires_grad, model.parameters())
    if eff_flag:
        params = list(params) + list(model.encoder.conv_a.parameters())

    model = torch.nn.DataParallel(model)

    if config['optimizer'] == 'Adam':
        optimizer = optim.Adam(params,
                               lr=config['lr'],
                               weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'SGD':
        optimizer = optim.SGD(params,
                              lr=config['lr'],
                              momentum=config['momentum'],
                              nesterov=config['nesterov'],
                              weight_decay=config['weight_decay'])
    else:
        raise NotImplementedError

    if eff_flag:
        cnn_params = [p for sub_module in cnn_subs
                      for p in sub_module.parameters()]
        cnn_optimizer = torch.optim.Adam(cnn_params,
                                         lr=0.001,
                                         weight_decay=config['weight_decay'])
    else:
        cnn_optimizer = None

    # A learning-rate scheduler is only used with SGD.
    scheduler = None
    if config['optimizer'] == 'SGD':
        if config['scheduler'] == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=config['epochs'], eta_min=config['min_lr'])
        elif config['scheduler'] == 'ReduceLROnPlateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(
                optimizer,
                factor=config['factor'],
                patience=config['patience'],
                verbose=1,
                min_lr=config['min_lr'])
        elif config['scheduler'] == 'MultiStepLR':
            scheduler = lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[int(e) for e in config['milestones'].split(',')],
                gamma=config['gamma'])
        elif config['scheduler'] == 'ConstantLR':
            scheduler = None
        else:
            raise NotImplementedError

    # Data loading: sample ids are file basenames without extension.
    img_ids = glob(
        os.path.join(input_folder, config['dataset'], 'images', 'training',
                     '*' + config['img_ext']))
    train_img_ids = [os.path.splitext(os.path.basename(p))[0]
                     for p in img_ids]
    img_ids = glob(
        os.path.join(input_folder, config['val_dataset'], 'images',
                     'validation', '*' + config['img_ext']))
    val_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]
    img_ids = glob(
        os.path.join(input_folder, config['val_dataset'], 'images', 'test',
                     '*' + config['img_ext']))
    test_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    # ImageNet channel statistics.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    train_transform = Compose([
        transforms.Rotate([config['rotate_min'], config['rotate_max']],
                          value=mean,
                          mask_value=0),
        transforms.Flip(),
        transforms.HueSaturationValue(hue_shift_limit=10,
                                      sat_shift_limit=10,
                                      val_shift_limit=10),
        transforms.RandomBrightnessContrast(brightness_limit=0.10,
                                            contrast_limit=0.10,
                                            brightness_by_max=True),
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])
    val_transform = Compose([
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])

    train_dataset = Dataset(
        img_ids=train_img_ids,
        img_dir=os.path.join(input_folder, config['dataset'], 'images',
                             'training'),
        mask_dir=os.path.join(input_folder, config['dataset'], 'annotations',
                              'training'),
        img_ext=config['img_ext'],
        mask_ext=config['mask_ext'],
        num_classes=config['num_classes'],
        input_channels=config['input_channels'],
        transform=train_transform,
        from_file=None)
    val_dataset = Dataset(
        img_ids=val_img_ids,
        img_dir=os.path.join(input_folder, config['val_dataset'], 'images',
                             'validation'),
        mask_dir=os.path.join(input_folder, config['val_dataset'],
                              'annotations', 'validation'),
        img_ext=config['img_ext'],
        mask_ext=config['mask_ext'],
        num_classes=config['num_classes'],
        input_channels=config['input_channels'],
        transform=val_transform,
        from_file=None)
    test_dataset = Dataset(
        img_ids=test_img_ids,
        img_dir=os.path.join(input_folder, config['val_dataset'], 'images',
                             'test'),
        mask_dir=os.path.join(input_folder, config['val_dataset'],
                              'annotations', 'test'),
        img_ext=config['img_ext'],
        mask_ext=config['mask_ext'],
        num_classes=config['num_classes'],
        input_channels=config['input_channels'],
        transform=val_transform,
        from_file=None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_workers'],
        drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)

    log = OrderedDict([
        ('epoch', []),
        ('lr', []),
        ('loss', []),
        ('iou', []),
        ('dice', []),
        ('val_loss', []),
        ('val_iou', []),
        ('val_dice', []),
    ])

    best_iou = 0
    trigger = 0  # epochs since the last val-IoU improvement (early stopping)
    Best_dice = 0
    iou_AtBestDice = 0
    for epoch in range(start_epoch, config['epochs']):
        print('{:s} Epoch [{:d}/{:d}]'.format(config['arch'], epoch,
                                              config['epochs']))
        # Train for one epoch.
        train_log = train(epoch, config, train_loader, model, criterion,
                          optimizer, cnn_optimizer)

        # Evaluate on validation and test sets BEFORE stepping the scheduler:
        # ReduceLROnPlateau consumes the current validation loss (previously
        # val_log was read here before ever being assigned).
        val_log = validate(config, val_loader, model, criterion)
        test_log = validate(config, test_loader, model, criterion)

        if config['optimizer'] == 'SGD':
            if config['scheduler'] == 'CosineAnnealingLR':
                scheduler.step()
            elif config['scheduler'] == 'ReduceLROnPlateau':
                scheduler.step(val_log['loss'])
            elif config['scheduler'] == 'MultiStepLR':
                scheduler.step()

        # Track the best test dice (and its IoU) seen so far, for reporting.
        if Best_dice < test_log['dice']:
            Best_dice = test_log['dice']
            iou_AtBestDice = test_log['iou']
        print(
            'loss %.4f - iou %.4f - dice %.4f - val_loss %.4f - val_iou %.4f - val_dice %.4f - test_iou %.4f - test_dice %.4f - Best_dice %.4f - iou_AtBestDice %.4f'
            % (train_log['loss'], train_log['iou'], train_log['dice'],
               val_log['loss'], val_log['iou'], val_log['dice'],
               test_log['iou'], test_log['dice'], Best_dice, iou_AtBestDice))

        save_tensorboard(writer, train_log, val_log, test_log, epoch)
        log['epoch'].append(epoch)
        log['lr'].append(config['lr'])
        log['loss'].append(train_log['loss'])
        log['iou'].append(train_log['iou'])
        log['dice'].append(train_log['dice'])
        log['val_loss'].append(val_log['loss'])
        log['val_iou'].append(val_log['iou'])
        log['val_dice'].append(val_log['dice'])
        pd.DataFrame(log).to_csv(os.path.join(model_folder,
                                              '%s/log.csv' % config['name']),
                                 index=False)

        trigger += 1
        # Checkpoint on validation-IoU improvement.
        if val_log['iou'] > best_iou:
            torch.save(
                model.state_dict(),
                os.path.join(model_folder, '%s/model.pth' % config['name']))
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # Early stopping.
        if config['early_stopping'] >= 0 and trigger >= config[
                'early_stopping']:
            print("=> early stopping")
            break

        torch.cuda.empty_cache()
genusnet.to(device) discriminator_family.to(device) familynet.to(device) loss_fn = torch.nn.CrossEntropyLoss() loss_fn2 = ContrastiveLoss() ##quitar loss_fn3 = SpecLoss() ##quitar # ----------------------------------------------------------------------------- ## etapa 1: entrenar g y h print("||||| Stage 1 |||||") optimizer = torch.optim.Adam( list(encoder.parameters()) + list(classifier.parameters()) + list(ssnet.parameters()) + list(genusnet.parameters()) + list(familynet.parameters()), lr=0.0001) scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[30], gamma=0.1) #herb std-mean #tensor([0.0808, 0.0895, 0.1141]) #tensor([0.7410, 0.7141, 0.6500]) #photo std-mean #tensor([0.1399, 0.1464, 0.1392]) #tensor([0.2974, 0.3233, 0.2370]) data_transforms = { 'train': transforms.Compose([ #transforms.Resize((img_size, img_size)), transforms.RandomRotation(15), #transforms.RandomCrop((img_size, img_size)), #transforms.RandomResizedCrop((img_size, img_size)),
lr=config['learning_rate'], weight_decay=0, betas=(0.9, 0.999)) init_weights(netG, init_type='kaiming', scale=0.1) global_step = 0 # G pixel loss cri_pix = nn.L1Loss().to(device) # G feature loss cri_fea = nn.L1Loss().to(device) # load VGG perceptual loss netF = VGGFeatureExtractor(feature_layer=34, use_bn=False).to(device) print('# perceptual parameters:', sum(param.numel() for param in netF.parameters())) scheduler = lr_scheduler.MultiStepLR(optimizer_G, [50000, 100000, 200000, 300000], 0.5) log_dict = OrderedDict() netG.train() for epoch in trange(config['number_epochs']): train_bar = tqdm(train_loader) train_bar.set_description_str(desc=f"N epochs - {epoch}") scheduler.step() for step, (lr, hr) in enumerate(train_bar): global_step += 1 lr = torch.autograd.Variable(lr, requires_grad=True).to(device) hr = torch.autograd.Variable(hr, requires_grad=True).to(device) sr = netG(lr)