def get_optimizer(
    name: str,
    model_params: Iterable,
    lr: float = 1e-3,
    wd: float = 0,
    lookahead: bool = False,
):
    if name == "adam":
        base_optimizer = optim.Adam(model_params, lr=lr, weight_decay=wd)
    elif name == "sgd":
        base_optimizer = optim.SGD(model_params, lr=lr, weight_decay=wd, momentum=0.9, nesterov=True)
    elif name == "radam":
        base_optimizer = RAdam(model_params, lr=lr, weight_decay=wd)
    elif name == "ralamb":
        base_optimizer = Ralamb(model_params, lr=lr, weight_decay=wd)
    elif name == "adabelief":
        base_optimizer = AdaBelief(model_params, lr=lr, weight_decay=wd)
    else:
        raise ValueError("Unknown optimizer: {}".format(name))

    # Optionally wrap the base optimizer in Lookahead
    if lookahead:
        optimizer = Lookahead(base_optimizer)
    else:
        optimizer = base_optimizer
    return optimizer
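# A minimal usage sketch for get_optimizer above. Everything here is illustrative:
# the tiny nn.Linear model and the hyperparameter values are assumptions, and the
# RAdam/Ralamb/AdaBelief/Lookahead classes are expected to come from the same
# imports the factory itself relies on.
import torch
import torch.nn as nn

example_model = nn.Linear(128, 10)  # placeholder model for the example only
optimizer = get_optimizer("radam", example_model.parameters(), lr=3e-4, wd=1e-2, lookahead=True)

for _ in range(3):  # a few dummy optimization steps
    optimizer.zero_grad()
    loss = example_model(torch.randn(4, 128)).sum()
    loss.backward()
    optimizer.step()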
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    layerwise_params = {
        "encoder*": dict(lr=args.lr * args.ev, weight_decay=args.weight_decay)
    }
    parameters = process_model_params(model, layerwise_params=layerwise_params)

    opt_args = dict(lr=args.lr, weight_decay=weight_decay)
    if hasattr(args, 'opt_eps') and args.opt_eps is not None:
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas

    # e.g. "lookahead_radam" -> prefix "lookahead", base optimizer "radam"
    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        opt_args.pop('eps', None)
        optimizer = SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
    elif opt_lower == 'adam':
        optimizer = Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters, **opt_args)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, **opt_args)
    else:
        raise ValueError("Invalid optimizer: {}".format(opt_lower))

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            _logger.info('Using lookahead')
            optimizer = Lookahead(optimizer)
    return optimizer
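# Sketch of how create_optimizer above might be called. Assumption: args can be any
# namespace exposing the attributes the function reads (opt, lr, ev, weight_decay,
# momentum, opt_eps, opt_betas); the concrete values below are illustrative only.
from types import SimpleNamespace

example_args = SimpleNamespace(
    opt="lookahead_radam",  # "lookahead_" prefix -> wrap the base optimizer in Lookahead
    lr=1e-3,
    ev=0.5,                 # encoder lr multiplier used in layerwise_params
    weight_decay=1e-2,
    momentum=0.9,
    opt_eps=None,
    opt_betas=None,
)
# optimizer = create_optimizer(example_args, model)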
    ])
    ),
    batch_size=1,
    shuffle=False,
    num_workers=args.nw)

print(len(train_data))
print(len(val_data))

SEED = 2020
utils.set_global_seed(SEED)
utils.prepare_cudnn(deterministic=True)

loaders = {'train': train_data, 'valid': val_data}
criterion = nn.CrossEntropyLoss()
model = ENet('efficientnet-b0')
print(model)

optimizer = Lookahead(RAdam(
    model.parameters(), lr=args.lr, weight_decay=args.wd))
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.25, patience=3)

num_epochs = args.e
logdir = "./logs/effnet-b0"
fp16_params = None  # dict(opt_level="O1")

runner = SupervisedRunner(device='cuda')
runner.train(
    model=model,
    criterion=criterion,
    scheduler=scheduler,
    optimizer=optimizer,
    loaders=loaders,
    callbacks=[
    {
        'params': model.decoder.parameters(),
        'lr': learning_rate
    },
    {
        'params': model.encoder.parameters(),
        'lr': 1e-4
    },
    {
        'params': model.segmentation_head.parameters(),
        'lr': learning_rate
    },
], weight_decay=0.0003)

base_optimizer = RAdam(model.parameters(), weight_decay=0.0003)
optimizer = Lookahead(base_optimizer)

criterion = {
    "dice": DiceLoss(),
    "iou": IoULoss(),
    "bce": BCEWithLogitsLoss()  # FocalLossBinary()
}

runner = SupervisedRunner(device='cuda',
                          input_key="image",
                          input_target_key="mask")
scheduler = OneCycleLR(optimizer,
                       max_lr=0.0016,
                       steps_per_epoch=1,
                       epochs=num_epochs)
# scheduler = OneCycleLRWithWarmup(
#     optimizer,
#     num_steps=num_epochs,
def smart_way():
    args = parse_arguments()
    SEED = args.seed
    ROOT = Path(args.dataset)
    img_paths, targets = retrieve_dataset(ROOT)

    train_transforms = compose(
        [resize_transforms(), hard_transforms(), post_transforms()])
    valid_transforms = compose([pre_transforms(), post_transforms()])

    loaders = get_loaders(
        img_paths=img_paths,
        targets=targets,
        random_state=SEED,
        batch_size=8,
        train_transforms_fn=train_transforms,
        valid_transforms_fn=valid_transforms,
    )

    logdir = './table_recognition/nn/regression/logs6/'

    model = torch.load(
        f'./table_recognition/nn/segmentation/logs/resnet18_PSPNet/save/best_model.pth'
    )
    model: RegressionFromSegmentation = RegressionFromSegmentation(model)
    model.to(utils.get_device())

    learning_rate = 0.001
    encoder_learning_rate = 0.0005
    layerwise_params = {
        "encoder*": dict(lr=encoder_learning_rate, weight_decay=0.00003)
    }
    model_params = utils.process_model_params(
        model, layerwise_params=layerwise_params)
    base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003)
    optimizer = Lookahead(base_optimizer)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=2)

    device = utils.get_device()
    runner = CustomRunner2(device=device)
    runner.train(model=model,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=logdir,
                 num_epochs=1000,
                 verbose=True,
                 load_best_on_end=True,
                 main_metric='loss')

    best_model_save_dir = os.path.join(logdir, 'save')
    os.makedirs(best_model_save_dir, exist_ok=True)
    torch.save(model, os.path.join(
        best_model_save_dir, 'best_model.pth'))  # save best model (by valid loss)

    batch = next(iter(loaders["valid"]))
    try:
        runner.trace(
            model=model, batch=batch, logdir=logdir,
            fp16=False)  # optimized version (not all models can be traced)
    except Exception:
        pass
def main():
    train_dataset = dataset.SentimentDataset(
        texts=df_train['sentences'].values.tolist(),
        labels=df_train['labels'].values,
        max_seq_length=config.MAX_SEQ_LENGTH,
        model_name=config.MODEL_NAME)
    valid_dataset = dataset.SentimentDataset(
        texts=df_valid['sentences'].values.tolist(),
        labels=df_valid['labels'].values,
        max_seq_length=config.MAX_SEQ_LENGTH,
        model_name=config.MODEL_NAME)

    train_val_loaders = {
        "train": DataLoader(dataset=train_dataset,
                            batch_size=config.BATCH_SIZE,
                            shuffle=True,
                            num_workers=2,
                            pin_memory=True),
        "valid": DataLoader(dataset=valid_dataset,
                            batch_size=config.BATCH_SIZE,
                            shuffle=False,
                            num_workers=2,
                            pin_memory=True)
    }

    dBert = model.DistilBert()
    param_optim = list(dBert.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    criterion = nn.CrossEntropyLoss()
    base_optimizer = RAdam([{
        'params': [p for n, p in param_optim if not any(nd in n for nd in no_decay)],
        'weight_decay': config.WEIGHT_DECAY
    }, {
        'params': [p for n, p in param_optim if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }])
    optimizer = Lookahead(base_optimizer)
    scheduler = OneCycleLRWithWarmup(
        optimizer,
        num_steps=config.NUM_EPOCHS,
        lr_range=(config.LEARNING_RATE, 1e-8),
        init_lr=config.LEARNING_RATE,
        warmup_steps=0,
    )

    runner = SupervisedRunner(input_key=("input_ids", "attention_mask"))

    # model training
    runner.train(model=dBert,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=train_val_loaders,
                 callbacks=[
                     AccuracyCallback(num_classes=2),
                     OptimizerCallback(accumulation_steps=config.ACCUM_STEPS),
                 ],
                 fp16=config.FP_16,
                 logdir=config.LOG_DIR,
                 num_epochs=config.NUM_EPOCHS,
                 verbose=True)
def train(config, config_save_name):
    # reproducibility
    seed = config.get("_SEED", 42)
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    model_config = config["_MODEL_CONFIG"]
    train_dataloader_config = config["_TRAIN_DATALOADER_CONFIG"]
    val_dataloader_config = config["_VAL_DATALOADER_CONFIG"]
    loss_metric_config = config["_LOSSES_METRICS_CONFIG"]
    experiment_dir = config["_EXPERIMENT_DIR"]
    checkpoints_dir = os.path.join(experiment_dir, CHECKPOINTS)
    results_dir = os.path.join(experiment_dir, RESULTS)
    tb_logs_dir_train = os.path.join(experiment_dir, TB_LOGS, "train")
    tb_logs_dir_val = os.path.join(experiment_dir, TB_LOGS, "val")
    config_out = os.path.join(experiment_dir, config_save_name)
    saved_checkpoint = config["_MODEL_CHECKPOINT"]
    checkpoint_format = config["_NEW_CKP_FORMAT"]
    loss_key = config["_OPTIMIZATION_LOSS"]
    optim_config = config["_OPTIMIZER"]
    lookahead_config = config["_LOOKAHEAD_OPTIM"]
    lr_scheduler_config = config["_LR_SCHEDULER"]
    experiment_data = config["_EXPERIMENT_DATA"]
    val_plotting_dict = config.get("_VAL_PLOTTING")

    model = get_object_instance(model_config)()
    global_step = 0
    if saved_checkpoint is not None:
        global_step = load_model_data(saved_checkpoint,
                                      model,
                                      new_format=checkpoint_format)

    train_loader = get_object_instance(train_dataloader_config)()
    val_loader = get_object_instance(val_dataloader_config)()
    print("Train dataset length: {}".format(len(train_loader.dataset)))
    print("Validation dataset length: {}".format(len(val_loader.dataset)))
    print("Validation dataset patients:\n{}".format(
        val_loader.dataset.patients))

    loss_metric = get_object_instance(loss_metric_config)()
    optimizer_getter = get_object_instance(optim_config)
    lr_scheduler_getter = get_object_instance(lr_scheduler_config)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    os.makedirs(checkpoints_dir)
    os.makedirs(results_dir)
    os.makedirs(tb_logs_dir_train)
    os.makedirs(tb_logs_dir_val)
    with open(config_out, "w") as f:
        yaml.dump(config, f, default_flow_style=False)

    # create configs for val and test
    val_config, val_out_dir = create_config(config, "val")
    test_config, test_out_dir = create_config(config, "test")
    os.makedirs(val_out_dir)
    os.makedirs(test_out_dir)

    val_path = os.path.join(val_out_dir, "val.yaml")
    print("Creating evaluation config for val: {}".format(val_path))
    with open(val_path, "w") as f:
        yaml.dump(val_config, f, default_flow_style=False)

    test_path = os.path.join(test_out_dir, "test.yaml")
    print("Creating evaluation config for test: {}".format(test_path))
    with open(test_path, "w") as f:
        yaml.dump(test_config, f, default_flow_style=False)

    train_writer = SummaryWriter(tb_logs_dir_train)
    val_writer = SummaryWriter(tb_logs_dir_val)

    model_params = model.parameters()
    if config.get("_MODEL_PARAM_PREP") is not None:
        model_prep = get_object_instance(config.get("_MODEL_PARAM_PREP"))
        model_params = model_prep(model)

    optimizer = optimizer_getter(model_params)
    if lookahead_config["use_lookahead"]:
        optimizer = Lookahead(optimizer, **lookahead_config["params"])
    lr_scheduler = lr_scheduler_getter(optimizer)

    model = model.to(device)
    model.train()

    num_epochs = experiment_data["num_epochs"]
    batch_log_interval = experiment_data["batch_log_interval"]
    # "low" or "high"
    best_metric_type = experiment_data["best_metric_type"]
    saving_metric = experiment_data["saving_metric"]
    previous = float("inf") if best_metric_type == "low" else float("-inf")

    output_example_idx = (hasattr(train_loader.dataset, "output_idx")
                          and train_loader.dataset.output_idx)

    for epoch in range(num_epochs):
        for output in train_loader:
            if output_example_idx:
                x_batch, y_batch, index = output
                extra_dict = train_loader.dataset.get_extra_dict(index)
                extra_dict = tensor_dict_to_device(extra_dict, device)
            else:
                x_batch, y_batch = output
                extra_dict = None

            optimizer.zero_grad()
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            y_batch_hat = model(x_batch)
            losses_and_metrics = loss_metric(y_batch_hat, y_batch, extra_dict)
            loss = losses_and_metrics[loss_key]
            loss.backward()
            optimizer.step()
            global_step += 1

            if global_step % batch_log_interval == 0:
                print("TRAIN:", get_losses_str(losses_and_metrics))
                tb_log_metrics(train_writer, losses_and_metrics, global_step)
                # TODO: add support for softmax processing
                prediction = torch.sigmoid(y_batch_hat)
                plot_fig_from_batch(
                    train_writer,
                    x_batch,
                    prediction,
                    y_batch,
                    global_step,
                )

            # lr change after each batch
            if lr_scheduler_getter.step_type == "after_batch":
                lr_scheduler.step()

        # done with one epoch
        # let's validate (use code from the validation script)
        model.eval()
        all_losses_and_metrics = validate(
            val_loader,
            model,
            loss_metric,
            device,
            plotting_func=plot_fig_from_batch,
            plotting_dict=val_plotting_dict,
            writer=val_writer,
            global_step=global_step,
            val_metric_to_check=saving_metric,
            output_losses_list=False,
        )
        print("Validation results for epoch {}".format(epoch))
        print("VAL:", get_losses_str(all_losses_and_metrics, tensors=False))
        model.train()

        current = all_losses_and_metrics[saving_metric]
        if is_better(current, previous, best_metric_type):
            print("Validation metric improved "
                  "at the end of epoch {}".format(epoch))
            previous = current
            save_val_metrics(all_losses_and_metrics, results_dir, epoch,
                             global_step)
            out_path = os.path.join(checkpoints_dir, "best_val_checkpoint.pth")
            save_model_data(out_path, model, global_step)

        tb_log_metrics(val_writer, all_losses_and_metrics, global_step)

        # learning rate schedule step at the end of epoch
        if lr_scheduler_getter.step_type != "after_batch":
            if lr_scheduler_getter.step_type == "use_val":
                lr_scheduler.step(all_losses_and_metrics[loss_key])
            elif lr_scheduler_getter.step_type == "use_epoch":
                lr_scheduler.step(epoch)
            else:
                lr_scheduler.step()

        # plot distinct learning rates in order they appear in the optimizer
        lr_dict = OrderedDict()
        for param_group in optimizer.param_groups:
            lr = param_group.get("lr")
            lr_dict[lr] = None
        for idx, lr in enumerate(lr_dict):
            tb_log_metrics(val_writer, {"lr_{}".format(idx): lr}, global_step)
            tb_log_metrics(train_writer, {"lr_{}".format(idx): lr}, global_step)

    train_writer.close()
    val_writer.close()
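# Illustrative config fragment (an assumption, not taken from the original repo): train()
# above only needs "_LOOKAHEAD_OPTIM" to provide a "use_lookahead" flag and a "params"
# dict that is splatted into Lookahead(optimizer, **params). The k/alpha names follow
# common Lookahead implementations (k: slow-update interval, alpha: slow-weights step size).
example_lookahead_config = {
    "use_lookahead": True,
    "params": {"k": 5, "alpha": 0.5},
}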
def main():
    train_image_list = sorted(
        glob.glob(
            pathname=
            '../input/uavid-semantic-segmentation-dataset/train/train/*/Images/*.png',
            recursive=True))
    train_mask_list = sorted(
        glob.glob(pathname='./trainlabels/*/TrainId/*.png', recursive=True))

    valid_image_list = sorted(
        glob.glob(
            pathname=
            '../input/uavid-semantic-segmentation-dataset/valid/valid/*/Images/*.png',
            recursive=True))
    valid_mask_list = sorted(
        glob.glob(pathname='./validlabels/*/TrainId/*.png', recursive=True))

    preprocessing_fn = smp.encoders.get_preprocessing_fn(
        config.ENCODER, config.ENCODER_WEIGHTS)

    train_dataset = Dataset(
        train_image_list,
        train_mask_list,
        augmentation=augmentations.get_training_augmentation(),
        preprocessing=augmentations.get_preprocessing(preprocessing_fn),
        classes=config.CLASSES,
    )
    valid_dataset = Dataset(
        valid_image_list,
        valid_mask_list,
        augmentation=augmentations.get_validation_augmentation(),
        preprocessing=augmentations.get_preprocessing(preprocessing_fn),
        classes=config.CLASSES,
    )

    train_loader = DataLoader(train_dataset,
                              batch_size=config.BATCH_SIZE,
                              shuffle=True,
                              num_workers=2,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.BATCH_SIZE,
                              shuffle=False,
                              num_workers=2,
                              pin_memory=True,
                              drop_last=False)
    loaders = {"train": train_loader, "valid": valid_loader}

    base_optimizer = RAdam([
        {
            'params': model.MODEL.decoder.parameters(),
            'lr': config.LEARNING_RATE
        },
        {
            'params': model.MODEL.encoder.parameters(),
            'lr': 1e-4
        },
        {
            'params': model.MODEL.segmentation_head.parameters(),
            'lr': config.LEARNING_RATE
        },
    ])
    optimizer = Lookahead(base_optimizer)
    criterion = BCEDiceLoss(activation=None)
    runner = SupervisedRunner()
    scheduler = OneCycleLRWithWarmup(optimizer,
                                     num_steps=config.NUM_EPOCHS,
                                     lr_range=(0.0016, 0.0000001),
                                     init_lr=config.LEARNING_RATE,
                                     warmup_steps=2)

    callbacks = [
        IouCallback(activation='none'),
        ClasswiseIouCallback(classes=config.CLASSES, activation='none'),
        EarlyStoppingCallback(patience=config.ES_PATIENCE,
                              metric='iou',
                              minimize=False),
    ]

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=config.LOGDIR,
        num_epochs=config.NUM_EPOCHS,
        # save our best checkpoint by IoU metric
        main_metric="iou",
        # IoU needs to be maximized.
        minimize_metric=False,
        # for FP16. It uses the variable from the very first cell
        fp16=config.FP16_PARAMS,
        # prints train logs
        verbose=True,
    )
def train_segmentation_model(
    model: torch.nn.Module,
    logdir: str,
    num_epochs: int,
    loaders: Dict[str, DataLoader]
):
    criterion = {
        "dice": DiceLoss(),
        "iou": IoULoss(),
        "bce": nn.BCEWithLogitsLoss()
    }

    learning_rate = 0.001
    encoder_learning_rate = 0.0005
    layerwise_params = {"encoder*": dict(lr=encoder_learning_rate, weight_decay=0.00003)}
    model_params = utils.process_model_params(model, layerwise_params=layerwise_params)
    base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003)
    optimizer = Lookahead(base_optimizer)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.25, patience=2)

    device = utils.get_device()
    runner = SupervisedRunner(device=device, input_key='image', input_target_key='mask')

    callbacks = [
        CriterionCallback(
            input_key="mask",
            prefix="loss_dice",
            criterion_key="dice"
        ),
        CriterionCallback(
            input_key="mask",
            prefix="loss_iou",
            criterion_key="iou"
        ),
        CriterionCallback(
            input_key="mask",
            prefix="loss_bce",
            criterion_key="bce"
        ),
        MetricAggregationCallback(
            prefix="loss",
            mode="weighted_sum",
            metrics={"loss_dice": 1.0, "loss_iou": 1.0, "loss_bce": 0.8},
        ),

        # metrics
        DiceCallback(input_key='mask'),
        IouCallback(input_key='mask'),
    ]

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=logdir,
        num_epochs=num_epochs,
        main_metric="iou",
        minimize_metric=False,
        verbose=True,
        load_best_on_end=True,
    )

    best_model_save_dir = os.path.join(logdir, 'save')
    os.makedirs(best_model_save_dir, exist_ok=True)
    torch.save(model, os.path.join(best_model_save_dir, 'best_model.pth'))  # save best model (by valid loss)

    batch = next(iter(loaders["valid"]))
    try:
        runner.trace(model=model, batch=batch, logdir=logdir,
                     fp16=False)  # optimized version (not all models can be traced)
    except Exception:
        pass