def test_loss_fixed_size(batch):
    device = torch.device('cpu')
    loss_func_mean = losses.get_criterion(device, 'mean')
    loss_func_mean_of_mean = losses.get_criterion(device, 'mean_of_mean')
    loss_mean = loss_func_mean(batch)
    loss_mean_of_mean = loss_func_mean_of_mean(batch)
    assert loss_mean.size() == loss_mean_of_mean.size()
    assert torch.allclose(loss_mean, loss_mean_of_mean), loss_mean - loss_mean_of_mean

def test_hierarchy(batch):
    correct, total = metrics.accuracy_max(batch)
    acc = correct / total
    loss_func = losses.get_criterion(DEVICE, 'mean')
    if OPT_SCALE:
        res = scipy.optimize.minimize_scalar(
            lambda x: loss_func(torch.mul(batch, x)), bracket=(1e-1, 1e2))
        scale = res.x
        if scale <= 0:
            raise RuntimeError(
                "Something went wrong during the optimization process")
    else:
        scale = 216
    loss = loss_func(torch.mul(batch, scale))
    assert loss >= (1 - acc) * math.log(2)

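# Hypothetical sketch (not from the original test suite): the two tests above
# rely on a pytest `batch` fixture, plus module-level `losses`, `metrics`,
# `DEVICE` and `OPT_SCALE` defined elsewhere. A minimal stand-in fixture
# (shape and distribution are assumptions) could look like this.
import pytest
import torch


@pytest.fixture
def batch():
    # Deterministic scores so the mean vs. mean-of-mean comparison is reproducible.
    torch.manual_seed(0)
    return torch.randn(8, 16)
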
def __init__(self, args, **kwargs):
    super(DecTrainer, self).__init__(args, **kwargs)

    # dataloader
    self.trainloader = get_dataloader(args, cfg, 'train')
    # self.trainloader_val = get_dataloader(args, cfg, 'train_voc')
    self.valloader = get_dataloader(args, cfg, 'val')
    self.denorm = self.trainloader.dataset.denorm

    self.use_triplet = args.use_triplet
    self.loss_3d = args.loss_3d
    self.normalize_feature = args.normalize_feature

    self.nclass = get_num_classes(args)
    self.classNames = get_class_names(args)
    assert self.nclass == len(self.classNames) - 1

    self.classIndex = {}
    for i, cname in enumerate(self.classNames):
        self.classIndex[cname] = i

    # model
    self.enc = get_model(cfg.NET, num_classes=self.nclass)
    self.criterion_cls = get_criterion(cfg.NET.LOSS)

    # optimizer using different LR
    enc_params = self.enc.parameter_groups(cfg.NET.LR, cfg.NET.WEIGHT_DECAY)
    self.optim_enc = self.get_optim(enc_params, cfg.NET)

    # checkpoint management
    self._define_checkpoint('enc', self.enc, self.optim_enc)
    self._load_checkpoint(args.resume)

    self.fixed_batch = None
    self.fixed_batch_path = args.fixed_batch_path
    if os.path.isfile(self.fixed_batch_path):
        print("Loading fixed batch from {}".format(self.fixed_batch_path))
        self.fixed_batch = torch.load(self.fixed_batch_path)

    # using cuda
    if cfg.NUM_GPUS != 0:
        self.enc = nn.DataParallel(self.enc)
        self.criterion_cls = nn.DataParallel(self.criterion_cls)
        self.enc = self.enc.cuda()
        self.criterion_cls = self.criterion_cls.cuda()

    # CHANGE: visual
    self.visual_times = 0
    self.dataset = args.dataset.lower()

def __init__(self, backbone: nn.Module, criterion=None, device: str = 'cpu',
             num_classes: int = 30, alpha: float = .2):
    super(FacialKeypointsDetector, self).__init__()
    self.alpha = alpha
    self.backbone = backbone
    self.pool = nn.AdaptiveAvgPool2d(output_size=1)
    self.head = DoubleStageRegressor(backbone.out_features, num_classes)
    self.criterion = criterion
    self.rmse = get_criterion("RMSE")
    self.device = device
    self.name = f"fkd_{backbone.name}"
    self.to(device)

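# Hedged usage sketch (assumption, not from the original project): a detector
# instance is typically built from the project's `get_backbone` / `get_criterion`
# factories, as in the training entry point further below; the backbone name
# here is purely illustrative.
#
#     backbone = get_backbone("resnet18", pretrained=True)
#     model = FacialKeypointsDetector(backbone,
#                                     criterion=get_criterion("RMSE"),
#                                     device="cpu",
#                                     num_classes=30)
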
def __init__(self, args, **kwargs):
    super(DecTrainer, self).__init__(args, **kwargs)

    # dataloader
    self.trainloader = get_dataloader(args, cfg, 'train')
    self.trainloader_val = get_dataloader(args, cfg, 'train_voc')
    self.valloader = get_dataloader(args, cfg, 'val')
    self.denorm = self.trainloader.dataset.denorm

    self.nclass = get_num_classes(args)
    self.classNames = get_class_names(args)[:-1]
    assert self.nclass == len(self.classNames)

    self.classIndex = {}
    for i, cname in enumerate(self.classNames):
        self.classIndex[cname] = i

    # model
    self.enc = get_model(cfg.GENERATOR, num_classes=self.nclass)
    self.criterion_cls = get_criterion(cfg.GENERATOR.LOSS)
    print(self.enc)

    # optimizer using different LR
    enc_params = self.enc.parameter_groups(cfg.GENERATOR.LR,
                                           cfg.GENERATOR.WEIGHT_DECAY)
    self.optim_enc = self.get_optim(enc_params, cfg.GENERATOR)

    # checkpoint management
    self._define_checkpoint('enc', self.enc, self.optim_enc)
    self._load_checkpoint(args.resume)

    self.fixed_batch = None
    self.fixed_batch_path = args.fixed_batch_path
    if os.path.isfile(self.fixed_batch_path):
        print("Loading fixed batch from {}".format(self.fixed_batch_path))
        self.fixed_batch = torch.load(self.fixed_batch_path)

    # using cuda
    self.enc = nn.DataParallel(self.enc).cuda()
    self.criterion_cls = nn.DataParallel(self.criterion_cls).cuda()

@pytest.mark.parametrize('func_data', TEST_FUNCS,
                         ids=lambda func_data: func_data[1])
def test_torch_func(tensor_list, func_data):
    """ Test torch function """
    func, _ = func_data
    masked_tensor = maskedtensor.from_list(tensor_list, dims=(0, 1))
    res_mt = list(func(masked_tensor))
    res_lst = apply_list_tensors(tensor_list, func)
    for t_mt, t_lst in zip(res_mt, res_lst):
        assert t_mt.size() == t_lst.size()
        assert torch.allclose(t_mt, t_lst, atol=ATOL), \
            torch.norm(t_mt - t_lst, p=float('inf'))


TEST_SCORE_FUNCS = [(get_criterion(DEVICE, 'mean'), 'loss')]


@pytest.mark.parametrize('func_data', TEST_SCORE_FUNCS,
                         ids=lambda func_data: func_data[1])
def test_score_func(score_list, func_data):
    """ Test score function """
    func, _ = func_data
    masked_tensor = maskedtensor.from_list(score_list, dims=(0, 1))
    res_mt = func(masked_tensor)
    res_lst = func(torch.stack(score_list))
    assert torch.allclose(res_mt, res_lst, atol=ATOL), \
        torch.norm(res_mt - res_lst, p=float('inf'))

def main():
    # args = parse_args()
    IMAGE_PATH = 'data/images/'
    num_classes_1 = 168
    num_classes_2 = 11
    num_classes_3 = 7
    stats = (0.0692, 0.2051)
    train_df = pd.read_csv('data/train_with_folds.csv')
    # train_df = train_df.set_index(['image_id'])
    # train_df = train_df.drop(['grapheme'], axis=1)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Data Loaders
    # df_train, df_val = train_test_split(train_df, test_size=0.2, random_state=2021)
    # train_transform = get_transform(128)
    train_transform = A.Compose([
        A.CoarseDropout(max_holes=1, max_width=64, max_height=64, p=0.9),
        A.ShiftScaleRotate(rotate_limit=5, p=0.9),
        A.Normalize(mean=stats[0], std=stats[1], always_apply=True)
    ])
    val_transform = A.Compose(
        [A.Normalize(mean=stats[0], std=stats[1], always_apply=True)])

    BATCH_SIZE = 50
    folds = [{
        'train': [1, 2, 3, 4],
        'val': [0]
    }, {
        'train': [0, 2, 3, 4],
        'val': [1]
    }, {
        'train': [1, 0, 3, 4],
        'val': [2]
    }, {
        'train': [1, 2, 0, 4],
        'val': [3]
    }, {
        'train': [1, 2, 3, 0],
        'val': [4]
    }]

    # Loop over folds
    for fld in range(1):
        fld = 4
        print(f'Train fold: {fld}')
        train_loader = get_loader(train_df,
                                  IMAGE_PATH,
                                  folds=folds[fld]['train'],
                                  batch_size=BATCH_SIZE,
                                  workers=4,
                                  shuffle=True,
                                  transform=train_transform)
        val_loader = get_loader(train_df,
                                IMAGE_PATH,
                                folds=folds[fld]['val'],
                                batch_size=BATCH_SIZE,
                                workers=4,
                                shuffle=False,
                                transform=val_transform)

        # Build Model
        model = load_model('seresnext50_32x4d', pretrained=True)
        model = model.cuda()

        # Optimizer
        optimizer = get_optimizer(model, lr=.00016)

        # Loss
        criterion1 = get_criterion()

        # Training
        history = pd.DataFrame()
        history2 = pd.DataFrame()
        torch.cuda.empty_cache()
        gc.collect()
        best = 0
        best2 = 1e10
        n_epochs = 100
        early_epoch = 0

        # Scheduler
        scheduler = get_scheduler(optimizer, train_loader=train_loader,
                                  epochs=n_epochs)

        # print('Loading previous training...')
        # state = torch.load('model.pth')
        # model.load_state_dict(state['model_state'])
        # best = state['kaggle']
        # best2 = state['loss']
        # print(f'Loaded model with kaggle score: {best}, loss: {best2}')
        # optimizer.load_state_dict(state['opt_state'])
        # scheduler.load_state_dict(state['scheduler_state'])
        # early_epoch = state['epoch'] + 1
        # print(f'Beginning at epoch {early_epoch}')
        # print('')

        for epoch in range(n_epochs - early_epoch):
            epoch += early_epoch
            torch.cuda.empty_cache()
            gc.collect()

            # ###############################################################
            # ############## TRAINING #######################################
            # ###############################################################
            model.train()
            total_loss = 0
            total_loss_1 = 0
            total_loss_2 = 0
            total_loss_3 = 0
            # ratio = pow(.5, epoch / 50)
            # ratio = 0.7
            ratio = 1.0

            t = tqdm(train_loader)
            for batch_idx, (img_batch, y_batch) in enumerate(t):
                img_batch = img_batch.cuda().float()
                y_batch = y_batch.cuda().long()
                optimizer.zero_grad()

                label1 = y_batch[:, 0]
                label2 = y_batch[:, 1]
                label3 = y_batch[:, 2]

                rand = np.random.rand()
                if rand < 0.5:
                    images, targets = mixup(img_batch, label1, label2, label3, 0.4)
                    output1, output2, output3 = model(images)
                    l1, l2, l3 = mixup_criterion(output1, output2, output3,
                                                 targets, rate=ratio)
                elif rand < 1:
                    images, targets = cutmix(img_batch, label1, label2, label3, 0.4)
                    output1, output2, output3 = model(images)
                    l1, l2, l3 = cutmix_criterion(output1, output2, output3,
                                                  targets, rate=ratio)
                # else:
                #     output1, output2, output3 = model(img_batch)
                #     l1, l2, l3 = criterion1(output1, output2, output3, y_batch)

                loss = l1 * .4 + l2 * .3 + l3 * .3
                total_loss += loss
                total_loss_1 += l1 * .4
                total_loss_2 += l2 * .3
                total_loss_3 += l3 * .3
                t.set_description(
                    f'Epoch {epoch+1}/{n_epochs}, LR: %6f, Ratio: %.4f, Loss: %.4f, Root loss: %.4f, Vowel loss: %.4f, Consonant loss: %.4f' %
                    (optimizer.state_dict()['param_groups'][0]['lr'], ratio,
                     total_loss / (batch_idx + 1), total_loss_1 / (batch_idx + 1),
                     total_loss_2 / (batch_idx + 1), total_loss_3 / (batch_idx + 1)))
                # t.set_description(f'Epoch {epoch}/{n_epochs}, LR: %6f, Loss: %.4f' %
                #                   (optimizer.state_dict()['param_groups'][0]['lr'],
                #                    total_loss / (batch_idx + 1)))

                if history is not None:
                    history.loc[epoch + batch_idx / len(train_loader),
                                'train_loss'] = loss.data.cpu().numpy()
                    history.loc[epoch + batch_idx / len(train_loader),
                                'lr'] = optimizer.state_dict()['param_groups'][0]['lr']

                loss.backward()
                optimizer.step()
                # if scheduler is not None:
                #     scheduler.step()

            # ###############################################################
            # ############## VALIDATION #####################################
            # ###############################################################
            model.eval()
            loss = 0
            preds_1 = []
            preds_2 = []
            preds_3 = []
            tars_1 = []
            tars_2 = []
            tars_3 = []
            with torch.no_grad():
                for img_batch, y_batch in val_loader:
                    img_batch = img_batch.cuda().float()
                    y_batch = y_batch.cuda().long()

                    o1, o2, o3 = model(img_batch)
                    l1, l2, l3 = criterion1(o1, o2, o3, y_batch)
                    loss += l1 * .4 + l2 * .3 + l3 * .3

                    for j in range(len(o1)):
                        preds_1.append(torch.argmax(F.softmax(o1[j]), -1))
                        preds_2.append(torch.argmax(F.softmax(o2[j]), -1))
                        preds_3.append(torch.argmax(F.softmax(o3[j]), -1))
                    for i in y_batch:
                        tars_1.append(i[0].data.cpu().numpy())
                        tars_2.append(i[1].data.cpu().numpy())
                        tars_3.append(i[2].data.cpu().numpy())

            preds_1 = [p.data.cpu().numpy() for p in preds_1]
            preds_2 = [p.data.cpu().numpy() for p in preds_2]
            preds_3 = [p.data.cpu().numpy() for p in preds_3]
            preds_1 = np.array(preds_1).T.reshape(-1)
            preds_2 = np.array(preds_2).T.reshape(-1)
            preds_3 = np.array(preds_3).T.reshape(-1)

            scores = []
            scores.append(
                sklearn.metrics.recall_score(tars_1, preds_1, average='macro'))
            scores.append(
                sklearn.metrics.recall_score(tars_2, preds_2, average='macro'))
            scores.append(
                sklearn.metrics.recall_score(tars_3, preds_3, average='macro'))
            final_score = np.average(scores, weights=[2, 1, 1])

            loss /= len(val_loader)
            if history2 is not None:
                history2.loc[epoch, 'val_loss'] = loss.cpu().numpy()
                history2.loc[epoch, 'acc'] = final_score
                history2.loc[epoch, 'root_acc'] = scores[0]
                history2.loc[epoch, 'vowel_acc'] = scores[1]
                history2.loc[epoch, 'consonant_acc'] = scores[2]

            if scheduler is not None:
                scheduler.step(final_score)

            print(
                f'Dev loss: %.4f, Kaggle: {final_score}, Root acc: {scores[0]}, Vowel acc: {scores[1]}, Consonant acc: {scores[2]}' % (loss))

            if epoch > 0:
                history2['acc'].plot()
                plt.savefig(f'epoch%03d_{fld}_acc.png' % (epoch + 1))
                plt.clf()

            if loss < best2:
                best2 = loss
                print('Saving best model... (loss)')
                torch.save(
                    {
                        'epoch': epoch,
                        'loss': loss,
                        'kaggle': final_score,
                        'model_state': model.state_dict(),
                        'opt_state': optimizer.state_dict(),
                        'scheduler_state': scheduler.state_dict()
                    }, f'model-1_{fld}.pth')

            if final_score > best:
                best = final_score
                print('Saving best model... (acc)')
                torch.save(
                    {
                        'epoch': epoch,
                        'loss': loss,
                        'kaggle': final_score,
                        'model_state': model.state_dict(),
                        'opt_state': optimizer.state_dict(),
                        'scheduler_state': scheduler.state_dict()
                    }, f'model_{fld}.pth')

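# Hedged sketch (assumption, not the original implementation): `mixup` and
# `mixup_criterion` used in the training loop above are project helpers that
# are not shown here. A standard three-head mixup, in the spirit of the public
# Bengali.AI kernels, could look like the sketch below; the exact target
# packing and the meaning of `rate` in the real code may differ.
# (Imports duplicated so the sketch is self-contained.)
import numpy as np
import torch
import torch.nn.functional as F


def mixup_sketch(data, targets1, targets2, targets3, alpha):
    # Sample a mixing coefficient and a random permutation of the batch,
    # then blend each image with a randomly paired one.
    lam = np.random.beta(alpha, alpha)
    indices = torch.randperm(data.size(0), device=data.device)
    mixed = lam * data + (1 - lam) * data[indices]
    targets = (targets1, targets1[indices],
               targets2, targets2[indices],
               targets3, targets3[indices], lam)
    return mixed, targets


def mixup_criterion_sketch(out1, out2, out3, targets, rate=1.0):
    # Interpolate the cross-entropy between the original and shuffled labels
    # for each of the three heads (root, vowel, consonant).
    t1, s1, t2, s2, t3, s3, lam = targets
    l1 = rate * (lam * F.cross_entropy(out1, t1) + (1 - lam) * F.cross_entropy(out1, s1))
    l2 = rate * (lam * F.cross_entropy(out2, t2) + (1 - lam) * F.cross_entropy(out2, s2))
    l3 = rate * (lam * F.cross_entropy(out3, t3) + (1 - lam) * F.cross_entropy(out3, s3))
    return l1, l2, l3
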
def main(**kwargs):
    training_path = kwargs.get('training_data_path')
    checkpoint_path = kwargs.get('checkpoint_path')
    tensorboard_log_dir = kwargs.get('tensorboard_log_dir')

    if not os.path.isdir(checkpoint_path):
        os.mkdir(checkpoint_path)

    backbone_name = kwargs.get('backbone')
    criterion_name = kwargs.get('criterion').upper()
    optimizer_name = kwargs.get('optimizer').upper()
    scheduler = kwargs.get('scheduler', None)
    pretrained = kwargs.get('pretrained')
    num_classes = kwargs.get('num_classes')
    device = kwargs.get('device')
    batch_size = kwargs.get('batch_size')
    epochs = kwargs.get('epochs')
    hyperparameters = kwargs.get('hyperparameters', {})
    augmentations = kwargs.get('augmentations', {})
    verbose = kwargs.get('verbose')
    train_split = kwargs.get('train_split')
    nfolds = kwargs.get('nfolds')
    val_splits = [(1 - train_split) / nfolds] * nfolds
    resume = kwargs.get('resume')
    only_weights = kwargs.get('only_weights')

    seed = hyperparameters.get('seed')
    random_jitter = augmentations.get('jitter', {})
    random_horizontal_flip = augmentations.get('horizontal_flip', 0.5)
    random_rotation = augmentations.get('rotation', 20)

    writer = SummaryWriter(log_dir=tensorboard_log_dir, flush_secs=20)

    if seed:
        seed_everything(seed)

    # TODO calculate mean and std
    mean = hyperparameters.get('mean', 0)
    std = hyperparameters.get('std', 1)

    splits = [train_split] + val_splits
    assert sum(splits) <= 1, "given splits must sum to at most 1"

    original_img_size = 96

    criterion = get_criterion(criterion_name)
    backbone = get_backbone(backbone_name, pretrained=pretrained)
    model = FacialKeypointsDetector(backbone, criterion=criterion,
                                    device=device, num_classes=num_classes)

    optimizer = get_optimizer(optimizer_name, model.parameters(),
                              kwargs=hyperparameters.get('optimizer', {}))

    scaler = GradScaler()

    val_transforms = None
    val_target_transform = TargetTransform(original_img_size)
    train_transform = train_target_transform = None
    train_transforms = transforms.TrainTransforms(
        model.get_input_size(), original_img_size, mean=mean, std=std,
        brightness=random_jitter.get('brightness'),
        contrast=random_jitter.get('contrast'),
        saturation=random_jitter.get('saturation'),
        hue=random_jitter.get('hue'),
        rotation_degree=random_rotation,
        hflip=random_horizontal_flip)

    val_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(model.get_input_size()),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)])

    train_dataset, *val_datasets = get_training_datasets(
        root_path=training_path,
        train_transforms=(train_transforms, train_transform, train_target_transform),
        val_transforms=(val_transforms, val_transform, val_target_transform),
        split_ratios=splits)

    val_dls = []
    train_dl = torch.utils.data.DataLoader(train_dataset, num_workers=4,
                                           batch_size=batch_size, pin_memory=True,
                                           collate_fn=custom_collate_fn, shuffle=True)

    for val_ds in val_datasets:
        val_dls.append(
            torch.utils.data.DataLoader(val_ds, batch_size=batch_size,
                                        num_workers=2))

    current_epoch = 0
    best_loss = math.inf
    if resume:
        print(Fore.CYAN, f"loading checkpoint from {checkpoint_path}", Style.RESET_ALL)
        best_loss, current_epoch = load_checkpoint(model, optimizer,
                                                   scheduler=scheduler,
                                                   save_path=checkpoint_path,
                                                   suffix='last',
                                                   only_weights=only_weights)

    try:
        for epoch in range(current_epoch, epochs):
            training_loop(train_dl, model, epoch, epochs, optimizer, writer, scaler,
                          scheduler=scheduler, verbose=verbose)

            val_losses = []
            for i, val_dl in enumerate(val_dls):
                val_loss = validation_loop(val_dl, model)
                val_losses.append(val_loss)
                print(Fore.LIGHTBLUE_EX,
                      f"validation [{i+1}] loss: {val_loss:.07f}", Style.RESET_ALL)
                writer.add_scalar(f'Loss/val_{i+1}', val_loss, epoch)

            mean_val_loss = sum(val_losses) / len(val_losses)
            print(Fore.LIGHTBLUE_EX,
                  f"validation [mean] loss: {mean_val_loss:.07f}", Style.RESET_ALL)
            writer.add_scalar('Loss/val_mean', mean_val_loss, epoch)
            writer.flush()

            if mean_val_loss < best_loss:
                best_loss = mean_val_loss
                print(Fore.CYAN, "saving best checkpoint...", Style.RESET_ALL)
                save_checkpoint(model, optimizer, epoch, best_loss,
                                scheduler=scheduler, suffix='best',
                                save_path=checkpoint_path)

            print(Fore.CYAN, "saving last checkpoint...", Style.RESET_ALL)
            save_checkpoint(model, optimizer, epoch, best_loss,
                            scheduler=scheduler, suffix='last',
                            save_path=checkpoint_path)
    except KeyboardInterrupt:
        print(Fore.RED,
              "training interrupted with ctrl+c, saving current state of the model",
              Style.RESET_ALL)
        save_checkpoint(model, optimizer, epoch, best_loss,
                        scheduler=scheduler, suffix='last',
                        save_path=checkpoint_path)

    writer.close()
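
# Hedged usage sketch: the keyword arguments below mirror the kwargs read at
# the top of main(); every value is illustrative (an assumption), not a default
# of the original project.
if __name__ == "__main__":
    main(
        training_data_path="data/training",
        checkpoint_path="checkpoints",
        tensorboard_log_dir="runs",
        backbone="resnet18",   # assumed backbone name
        criterion="RMSE",
        optimizer="ADAM",
        scheduler=None,
        pretrained=True,
        num_classes=30,
        device="cuda",
        batch_size=32,
        epochs=50,
        hyperparameters={"seed": 42, "mean": 0, "std": 1, "optimizer": {}},
        augmentations={"horizontal_flip": 0.5, "rotation": 20},
        verbose=10,
        train_split=0.8,
        nfolds=2,
        resume=False,
        only_weights=False,
    )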