def train_mlp(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load the pretrained CAE and freeze it; only its encoder is used below
    assert args.cae_weight, "No trained CAE weight"
    cae = CAE().to(device)
    cae.load_state_dict(torch.load(args.cae_weight))
    cae.eval()

    print('Building datasets...')
    train_dataset = PathDataSet(S2D_data_path, cae.encoder)
    val_dataset = PathDataSet(S2D_data_path, cae.encoder, is_val=True)
    print('Datasets ready.')
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    now = datetime.now()
    output_folder = args.output_folder + '/' + now.strftime('%Y-%m-%d_%H-%M-%S')
    check_and_create_dir(output_folder)

    model = MLP(args.input_size, args.output_size).to(device)
    if args.load_weights:
        print("Load weights from {}".format(args.load_weights))
        model.load_state_dict(torch.load(args.load_weights))

    criterion = nn.MSELoss()
    # optimizer = torch.optim.Adagrad(model.parameters())
    optimizer = AdaBelief(model.parameters(), lr=1e-4, eps=1e-10, betas=(0.9, 0.999),
                          weight_decouple=True, rectify=False)

    for epoch in range(args.max_epoch):
        model.train()
        for i, data in enumerate(tqdm(train_loader)):
            # get data
            input_data = data[0].to(device)    # B, 32
            next_config = data[1].to(device)   # B, 2
            # predict
            predict_config = model(input_data)
            # get loss
            loss = criterion(predict_config, next_config)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            neptune.log_metric("batch_loss", loss.item())

        print('\ncalculating validation loss..')
        model.eval()
        with torch.no_grad():
            losses = []
            for i, data in enumerate(tqdm(val_loader)):
                # get data
                input_data = data[0].to(device)    # B, 32
                next_config = data[1].to(device)   # B, 2
                # predict
                predict_config = model(input_data)
                # get loss
                loss = criterion(predict_config, next_config)
                losses.append(loss.item())
            val_loss = np.mean(losses)
            neptune.log_metric("val_loss", val_loss)
            print("validation result, epoch {}: {}".format(epoch, val_loss))

        if epoch % 5 == 0:
            torch.save(model.state_dict(), '{}/epoch_{}.tar'.format(output_folder, epoch))

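
# --- Hedged sketch (not from the original project) ---------------------------
# A possible CLI entry point for train_mlp(). The flag names mirror the `args`
# attributes used above (cae_weight, load_weights, output_folder, batch_size,
# input_size, output_size, max_epoch); the defaults are illustrative assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train the MLP on CAE-encoded paths')
    parser.add_argument('--cae_weight', type=str, required=True,
                        help='path to the pretrained CAE checkpoint')
    parser.add_argument('--load_weights', type=str, default='',
                        help='optional MLP checkpoint to resume from')
    parser.add_argument('--output_folder', type=str, default='./output')
    parser.add_argument('--batch_size', type=int, default=256)   # assumed default
    parser.add_argument('--input_size', type=int, default=32)    # matches the (B, 32) inputs above
    parser.add_argument('--output_size', type=int, default=2)    # matches the (B, 2) targets above
    parser.add_argument('--max_epoch', type=int, default=100)    # assumed default
    train_mlp(parser.parse_args())
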
def train_loop(folds, fold):
    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")
    elif CFG.device == 'TPU':
        if CFG.nprocs == 1:
            LOGGER.info(f"========== fold: {fold} training ==========")
        elif CFG.nprocs == 8:
            xm.master_print(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    # keep only studies that have annotations (needed by the annotation-aware training set)
    train_folds = train_folds[train_folds['StudyInstanceUID'].isin(
        train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds, train_annotations, use_annot=True,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, train_annotations, use_annot=False,
                                 transform=get_transforms(data='valid'))

    if CFG.device == 'GPU':
        train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset, batch_size=CFG.batch_size * 2, shuffle=False,
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    elif CFG.device == 'TPU':
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=CFG.batch_size, sampler=train_sampler,
            drop_last=True, num_workers=CFG.num_workers)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=CFG.batch_size * 2, sampler=valid_sampler,
            drop_last=False, num_workers=CFG.num_workers)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor,
                                          patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max,
                                          eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1,
                                                    eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.device == 'TPU':
        device = xm.xla_device()
    elif CFG.device == 'GPU':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # frozen teacher used by the distillation-style training loss
    teacher_model = CustomSeResNet152D(CFG.model_name, pretrained=False)
    teacher_model.to(device)
    state = torch.load(CFG.teacher)
    teacher_model.load_state_dict(state['model'])
    for param in teacher_model.parameters():
        param.requires_grad = False
    teacher_model.eval()
    # teacher_model.to(device)

    model = CustomSeResNet152D_WLF(CFG.model_name, pretrained=True)
    model.to(device)
    # state = torch.load(CFG.student)
    # model.load_state_dict(state['model'])

    optimizer = AdaBelief(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    train_criterion = CustomLoss(weights=CFG.weights)
    valid_criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()

        # train
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_loss = train_fn(train_loader, teacher_model, model, train_criterion,
                                    optimizer, epoch, scheduler, device)
            elif CFG.nprocs == 8:
                para_train_loader = pl.ParallelLoader(train_loader, [device])
                avg_loss = train_fn(para_train_loader.per_device_loader(device), teacher_model,
                                    model, train_criterion, optimizer, epoch, scheduler, device)
        elif CFG.device == 'GPU':
            avg_loss = train_fn(train_loader, teacher_model, model, train_criterion,
                                optimizer, epoch, scheduler, device)

        # eval
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_val_loss, preds, _ = valid_fn(valid_loader, model, valid_criterion, device)
            elif CFG.nprocs == 8:
                para_valid_loader = pl.ParallelLoader(valid_loader, [device])
                avg_val_loss, preds, valid_labels = valid_fn(
                    para_valid_loader.per_device_loader(device), model, valid_criterion, device)
                preds = idist.all_gather(torch.tensor(preds)).to('cpu').numpy()
                valid_labels = idist.all_gather(torch.tensor(valid_labels)).to('cpu').numpy()
        elif CFG.device == 'GPU':
            avg_val_loss, preds, _ = valid_fn(valid_loader, model, valid_criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        if CFG.device == 'GPU':
            LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}')
        elif CFG.device == 'TPU':
            if CFG.nprocs == 1:
                LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
                LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}')
            elif CFG.nprocs == 8:
                xm.master_print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
                xm.master_print(f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}')

        if score > best_score:
            best_score = score
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(), 'preds': preds},
                           OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                xm.save({'model': model, 'preds': preds},
                        OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                torch.save({'model': model.state_dict(), 'preds': preds},
                           OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                xm.save({'model': model, 'preds': preds},
                        OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')

        # # keep every epoch's weights for inference
        # if CFG.device == 'TPU':
        #     xm.save({'model': model.state_dict()},
        #             OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')
        # elif CFG.device == 'GPU':
        #     torch.save({'model': model.state_dict()},
        #                OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

    if CFG.nprocs != 8:
        check_point = torch.load(OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
        for c in [f'pred_{c}' for c in CFG.target_cols]:
            valid_folds[c] = np.nan
        valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']

    return valid_folds

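
# Hedged sketch (assumption, not from the original script): train_loop() is
# typically driven across folds like this, collecting each fold's validation
# predictions into an out-of-fold dataframe. CFG.n_fold, CFG.trn_fold, and the
# pandas-as-pd import are assumed here; LOGGER/OUTPUT_DIR come from the code above.
def run_all_folds(folds):
    oof_df = pd.DataFrame()
    for fold in range(CFG.n_fold):
        if fold in CFG.trn_fold:
            _oof_df = train_loop(folds, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} finished ==========")
    oof_df.to_csv(OUTPUT_DIR + 'oof_df.csv', index=False)
    return oof_df
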
def __init__(self,
             *,
             latent_dim,
             image_size,
             optimizer="adam",
             fmap_max=512,
             fmap_inverse_coef=12,
             transparent=False,
             greyscale=False,
             disc_output_size=5,
             attn_res_layers=[],
             sle_spatial=False,
             ttur_mult=1.,
             lr=2e-4,
             rank=0,
             ddp=False):
    super().__init__()
    self.latent_dim = latent_dim
    self.image_size = image_size

    G_kwargs = dict(image_size=image_size,
                    latent_dim=latent_dim,
                    fmap_max=fmap_max,
                    fmap_inverse_coef=fmap_inverse_coef,
                    transparent=transparent,
                    greyscale=greyscale,
                    attn_res_layers=attn_res_layers,
                    use_sle_spatial=sle_spatial)
    self.G = Generator(**G_kwargs)

    self.D = Discriminator(image_size=image_size,
                           fmap_max=fmap_max,
                           fmap_inverse_coef=fmap_inverse_coef,
                           transparent=transparent,
                           greyscale=greyscale,
                           attn_res_layers=attn_res_layers,
                           disc_output_size=disc_output_size)

    self.ema_updater = EMA(0.995)
    self.GE = Generator(**G_kwargs)
    set_requires_grad(self.GE, False)

    if optimizer == "adam":
        self.G_opt = Adam(self.G.parameters(), lr=lr, betas=(0.5, 0.9))
        self.D_opt = Adam(self.D.parameters(), lr=lr * ttur_mult, betas=(0.5, 0.9))
    elif optimizer == "adabelief":
        self.G_opt = AdaBelief(self.G.parameters(), lr=lr, betas=(0.5, 0.9))
        self.D_opt = AdaBelief(self.D.parameters(), lr=lr * ttur_mult, betas=(0.5, 0.9))
    else:
        assert False, "No valid optimizer is given"

    self.apply(self._init_weights)
    self.reset_parameter_averaging()

    self.cuda(rank)
    self.D_aug = AugWrapper(self.D, image_size)

def get_optimizer(model, optimizer_name, optimizer_params, scheduler_name,
                  scheduler_params, n_epochs):
    opt_lower = optimizer_name.lower()
    opt_lookahead = optimizer_params["lookahead"]

    if opt_lower == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=optimizer_params["lr"],
                              momentum=optimizer_params["momentum"],
                              weight_decay=optimizer_params["weight_decay"],
                              nesterov=True)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=optimizer_params["lr"],
                               betas=(0.9, 0.999),
                               eps=1e-08,
                               weight_decay=0)
    elif opt_lower == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=optimizer_params["lr"],
                                      weight_decay=optimizer_params["weight_decay"],
                                      eps=optimizer_params["opt_eps"])
    elif opt_lower == 'nadam':
        # note: the PyTorch class is NAdam (available from torch 1.10)
        optimizer = torch.optim.NAdam(model.parameters(),
                                      lr=optimizer_params["lr"],
                                      weight_decay=optimizer_params["weight_decay"],
                                      eps=optimizer_params["opt_eps"])
    elif opt_lower == 'radam':
        optimizer = RAdam(model.parameters(),
                          lr=optimizer_params["lr"],
                          weight_decay=optimizer_params["weight_decay"],
                          eps=optimizer_params["opt_eps"])
    elif opt_lower == "adabelief":
        optimizer = AdaBelief(model.parameters(),
                              lr=optimizer_params["lr"],
                              eps=1e-8,
                              weight_decay=optimizer_params["weight_decay"])
    elif opt_lower == "adamp":
        optimizer = AdamP(model.parameters(),
                          lr=optimizer_params["lr"],
                          weight_decay=optimizer_params["weight_decay"])
    else:
        raise ValueError(f"Invalid optimizer: {optimizer_name}")

    if opt_lookahead:
        optimizer = Lookahead(optimizer, alpha=0.5, k=5)

    if scheduler_name == "CosineAnnealingWarmRestarts":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer,
            eta_min=scheduler_params["eta_min"],
            T_0=scheduler_params["T_0"],
            T_mult=scheduler_params["T_multi"])
    elif scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer,
                                T_max=scheduler_params["T_max"],
                                T_mult=scheduler_params["T_mul"],
                                eta_min=scheduler_params["eta_min"])
    elif scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=scheduler_params["schedule"],
            gamma=scheduler_params["gamma"])
    else:
        raise ValueError(f"Invalid scheduler: {scheduler_name}")

    # optionally wrap the base scheduler with a one-epoch warmup
    if scheduler_params["warmup_factor"] > 0:
        scheduler = GradualWarmupSchedulerV2(
            optimizer,
            multiplier=scheduler_params["warmup_factor"],
            total_epoch=1,
            after_scheduler=scheduler)

    return optimizer, scheduler

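
# Hedged usage sketch for get_optimizer(): the dictionary keys are exactly the
# ones the function reads above; the concrete values and the stand-in model are
# illustrative assumptions only.
_model = torch.nn.Linear(16, 1)  # stand-in module purely for illustration
_optimizer_params = {
    "lr": 1e-3,
    "weight_decay": 1e-4,
    "momentum": 0.9,
    "opt_eps": 1e-8,
    "lookahead": False,
}
_scheduler_params = {
    "eta_min": 1e-6,
    "T_0": 10,
    "T_multi": 1,
    "warmup_factor": 0,
}
_optimizer, _scheduler = get_optimizer(_model, "adabelief", _optimizer_params,
                                       "CosineAnnealingWarmRestarts",
                                       _scheduler_params, n_epochs=30)
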
def main():
    """Model training."""
    train_speakers, valid_speakers = get_valid_speakers()

    # define transforms for train & validation samples
    train_transform = Compose([Resize(760, 80), ToTensor()])

    # define datasets & loaders
    train_dataset = TrainDataset('train', train_speakers, transform=train_transform)
    valid_dataset = TrainDataset('train', valid_speakers, transform=train_transform)

    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=256, shuffle=False)

    device = get_device()
    print(f'Selected device: {device}')

    model = torch.hub.load('huawei-noah/ghostnet', 'ghostnet_1x', pretrained=True)
    model.classifier = nn.Linear(in_features=1280, out_features=1, bias=True)

    net = model
    net.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = AdaBelief(net.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2,
                                                     patience=3, eps=1e-4, verbose=True)

    # prepare valid target
    yvalid = get_valid_targets(valid_dataset)

    # training loop
    for epoch in range(10):
        loss_log = {'train': [], 'valid': []}
        train_loss = []

        net.train()
        for x, y in tqdm(train_loader):
            x, y = mixup(x, y, alpha=0.2)
            x, y = x.to(device), y.to(device, dtype=torch.float32)

            optimizer.zero_grad()
            outputs = net(x)
            loss = criterion(outputs, y.unsqueeze(1))
            loss.backward()
            optimizer.step()

            # save loss
            train_loss.append(loss.item())

        # evaluate
        net.eval()
        valid_pred = torch.Tensor([]).to(device)
        for x, y in valid_loader:
            with torch.no_grad():
                x, y = x.to(device), y.to(device, dtype=torch.float32)
                ypred = net(x)
                valid_pred = torch.cat([valid_pred, ypred], 0)

        valid_pred = sigmoid(valid_pred.cpu().numpy())
        val_loss = log_loss(yvalid, valid_pred, eps=1e-7)
        val_acc = (yvalid == (valid_pred > 0.5).astype(int).flatten()).mean()

        tqdm.write(f'Epoch {epoch} train_loss={np.mean(train_loss):.4f}; val_loss={val_loss:.4f}; val_acc={val_acc:.4f}')

        loss_log['train'].append(np.mean(train_loss))
        loss_log['valid'].append(val_loss)

        scheduler.step(loss_log['valid'][-1])

    torch.save(net.state_dict(), 'ghostnet_model.pt')
    print('Training is complete.')

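
# The mixup() helper used in main() is defined elsewhere in the project; below is
# a minimal sketch of the standard input/label mixup it plausibly corresponds to.
# This is an assumption for illustration, not the original implementation.
def mixup_sketch(x, y, alpha=0.2):
    """Convex-combine a batch with a shuffled copy of itself (beta-sampled weight)."""
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0))
    x_mixed = lam * x + (1.0 - lam) * x[idx]
    y_mixed = lam * y.float() + (1.0 - lam) * y[idx].float()
    return x_mixed, y_mixed
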
# make environment
env = gym.make(ENV_NAME)

# Setup policy, optimizer and criterion
hid_size = 256
n_layers = 4
ac_kwargs = dict(hidden_sizes=[hid_size] * n_layers)

clone_pi = GaussianActor(env.observation_space.shape[0],
                         env.action_space.shape[0],
                         activation=nn.LeakyReLU,
                         **ac_kwargs)

distilled_clone_pi = DistilledGaussianActor(env.observation_space.shape[0],
                                            env.action_space.shape[0],
                                            activation=nn.LeakyReLU,
                                            n_experts=2,
                                            **ac_kwargs)

# Optimizer and criterion for ordinary clone
pi_optimizer = AdaBelief(clone_pi.parameters(), betas=(0.9, 0.999), eps=1e-16)
criterion = nn.MSELoss()

# Optimizer and criterion for distilled clone
distilled_pi_optimizer = AdaBelief(distilled_clone_pi.parameters(),
                                   betas=(0.9, 0.999),
                                   eps=1e-16)
distilled_criterion = nn.MSELoss()

####################################################################################
# Create dual clone
config_name_list = ['marigold', 'rose']
marigold_clone_distill = DistillBehavioralClone(config_name_list=config_name_list,
                                                config_name='marigold',