Example #1
def run(config_file):
    config = load_config(config_file)

    os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            task='cls'
        )
        for phase in ['train', 'valid']
    }

    # create model
    model = CustomNet(config.model.encoder, config.data.num_classes)

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.base_params(), 'lr': config.optimizer.params.encoder_lr},
        {'params': model.fresh_params(), 'lr': config.optimizer.params.decoder_lr}
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [MultiClassAccuracyCallback(threshold=0.5), F1ScoreCallback()]
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(CheckpointCallback(resume=config.work_dir + '/checkpoints/best_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )
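The param-group list above gives the pretrained encoder a lower learning rate than the freshly initialized head. A minimal sketch of the same idea with plain torch.optim.Adam, assuming CustomNet exposes base_params()/fresh_params() as in this example; the two rates are hypothetical stand-ins for encoder_lr/decoder_lr:

import torch

# lower LR for pretrained encoder weights, higher LR for the new head
param_groups = [
    {'params': model.base_params(), 'lr': 1e-4},   # encoder_lr
    {'params': model.fresh_params(), 'lr': 1e-3},  # decoder_lr
]
optimizer = torch.optim.Adam(param_groups)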
Example #2
def get_callbacks(class_names):
    num_classes = len(class_names)
    return [
        AccuracyCallback(num_classes=num_classes),
        AUCCallback(
            num_classes=num_classes,
            input_key="targets_one_hot",
            class_names=class_names
        ),
        F1ScoreCallback(
            input_key="targets_one_hot",
            activation="Softmax"
        )
    ]
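A sketch of wiring get_callbacks into a Catalyst run, assuming model, criterion, optimizer, and loaders are prepared as in the neighboring examples; the class names are hypothetical:

class_names = ['cat', 'dog', 'bird']  # hypothetical label set
runner = SupervisedRunner()
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    callbacks=get_callbacks(class_names),
    num_epochs=10,
    verbose=True,
)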
Example #3
def train(num_epochs, model, loaders, logdir):
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=2)

    callbacks = [F1ScoreCallback()]

    # model runner
    runner = SupervisedRunner()

    # model training
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=logdir,
                 num_epochs=num_epochs,
                 callbacks=callbacks,
                 verbose=True)
Example #4
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    # We can specify the callbacks list for the experiment;
    # for this task, we will check accuracy, AUC and F1 metrics
    callbacks=[
        AccuracyCallback(num_classes=config.num_classes),
        AUCCallback(
            num_classes=config.num_classes,
            input_key="targets_one_hot",
            class_names=config.class_names
        ),
        F1ScoreCallback(
            input_key="targets_one_hot",
            activation="Softmax"
        ),
        CheckpointCallback(
            save_n_best=1,
            # resume_dir="./models/classification",
            metrics_filename="metrics.json"
        ),
        EarlyStoppingCallback(
            patience=config.patience,
            metric="auc/_mean",
            minimize=False
        )
    ],
    # path to save logs
    logdir=config.logdir,
)
Example #5
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.25, patience=3)
num_epochs = args.e
logdir = "./logs/effnet-b0"
fp16_params = None  # dict(opt_level="O1")
runner = SupervisedRunner(device='cuda')


runner.train(
    model=model,
    criterion=criterion,
    scheduler=scheduler,
    optimizer=optimizer,
    loaders=loaders,
    callbacks=[
        # wAUC(),
        F1ScoreCallback(),
        AUCCallback(num_classes=4),
        AccuracyCallback(prefix='ACC'),
        OptimizerCallback(accumulation_steps=args.acc)],
    logdir=logdir,
    num_epochs=num_epochs,
    fp16=fp16_params,
    verbose=True
)
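# OptimizerCallback(accumulation_steps=args.acc) makes Catalyst step the
# optimizer only every `args.acc` batches. A plain-PyTorch sketch of the
# same gradient-accumulation idea (hypothetical loop; Catalyst handles
# this internally):
#
#     k = 4  # stands in for args.acc
#     optimizer.zero_grad()
#     for i, (x, y) in enumerate(loader):
#         loss = criterion(model(x), y) / k  # average gradients over k batches
#         loss.backward()
#         if (i + 1) % k == 0:
#             optimizer.step()
#             optimizer.zero_grad()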
if args.test > 0:
    test_preds_proba: Union[List, Iterable, np.ndarray] = []
    model.eval()
    progress_bar_test = tqdm(test_dataset)
    with torch.no_grad():
        for i, im in enumerate(progress_bar_test):
            inputs = im.to('cuda')
            # assumed completion: collect per-batch class probabilities
            probs = torch.softmax(model(inputs), dim=1)
            test_preds_proba.append(probs.cpu().numpy())
    "valid": valid_dl
}
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD([
    {'params': model.layer1.parameters(), 'lr': LR / 10},
    {'params': model.layer2.parameters(), 'lr': LR / 5},
    {'params': model.layer3.parameters(), 'lr': LR / 2},
    {'params': model.layer4.parameters(), 'lr': LR / 1},
], lr=LR)
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=[LR / 10, LR / 5, LR / 2, LR / 1], total_steps=100)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-7)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, cooldown=2, min_lr=1e-7)

callbacks = [
    AccuracyCallback(num_classes=5, threshold=0.5, activation='Softmax'),
    F1ScoreCallback(input_key="targets_one_hot", activation='Softmax', threshold=0.5),
]
runner = SupervisedRunner()

## Step 1.

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    callbacks=callbacks,
    loaders=loaders,
    logdir=logdir,
    num_epochs=num_epochs,
    verbose=True,
    scheduler=scheduler,
)
Example #7
def get_callbacks(num_classes):
    callbacks = [
        AccuracyCallback(num_classes=num_classes),
        F1ScoreCallback(input_key="targets_one_hot", activation="Softmax")
    ]
    return callbacks
Example #8
def main():

    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--seed', type=int, default=1234, help='Random seed')
    arg('--model-name',
        type=str,
        default='seresnext101',
        help='String model name used for saving')
    arg('--run-root',
        type=Path,
        default=Path('../results'),
        help='Directory for saving model')
    arg('--data-root', type=Path, default=Path('../data'))
    arg('--image-size', type=int, default=224, help='Image size for training')
    arg('--batch-size',
        type=int,
        default=16,
        help='Batch size during training')
    arg('--fold', type=int, default=0, help='Validation fold')
    arg('--n-epochs', type=int, default=10, help='Epoch to run')
    arg('--learning-rate',
        type=float,
        default=1e-3,
        help='Initial learning rate')
    arg('--step', type=int, default=1, help='Current training step')
    arg('--patience', type=int, default=4)
    arg('--criterion', type=str, default='bce', help='Criterion')
    arg('--optimizer', default='Adam', help='Name of the optimizer')
    arg('--continue_train', action='store_true')
    arg('--checkpoint',
        type=str,
        default='../results',
        help='Checkpoint file path')
    arg('--workers', type=int, default=2)
    # action='store_true' avoids argparse's type=bool footgun (bool('False') is True);
    # args.debug is forced on below anyway
    arg('--debug', action='store_true')
    args = parser.parse_args()

    set_seed(args.seed)
    """
    
    SET PARAMS
    
    """
    args.debug = True
    ON_KAGGLE = configs.ON_KAGGLE
    N_CLASSES = configs.NUM_CLASSES
    args.image_size = configs.SIZE
    args.data_root = configs.DATA_ROOT
    use_cuda = cuda.is_available()
    fold = args.fold
    num_workers = args.workers
    num_epochs = args.n_epochs
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    """

    LOAD DATA
    
    """
    print(os.listdir(args.data_root))
    folds = pd.read_csv(args.data_root / 'folds.csv')
    train_root = args.data_root / 'train'

    if args.debug:
        folds = folds.head(50)
    train_fold = folds[folds['fold'] != fold]
    valid_fold = folds[folds['fold'] == fold]
    check_fold(train_fold, valid_fold)

    def get_dataloader(df: pd.DataFrame, image_transform) -> DataLoader:
        """
        Calls dataloader to load Imet Dataset
        """
        return DataLoader(
            ImetDataset(train_root, df, image_transform),
            shuffle=True,
            batch_size=batch_size,
            num_workers=num_workers,
        )

    train_loader = get_dataloader(train_fold, image_transform=albu_transform)
    valid_loader = get_dataloader(valid_fold, image_transform=valid_transform)
    print('{} items in train, {} in valid'.format(len(train_loader.dataset),
                                                  len(valid_loader.dataset)))
    loaders = OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader
    """
    
    MODEL
    
    """
    model = seresnext101(num_classes=N_CLASSES)
    if use_cuda:
        model = model.cuda()

    criterion = nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               factor=0.5,
                                               patience=args.patience)
    """
    
    MODEL RUNNER
    
    """
    # call an instance of the model runner
    runner = SupervisedRunner()
    # logs folder
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model_name}'
    logdir = os.path.join(args.run_root, prefix)
    os.makedirs(logdir, exist_ok=False)

    print('\tTrain session    :', prefix)
    print('\tOn KAGGLE      :', ON_KAGGLE)
    print('\tDebug          :', args.debug)
    print('\tClasses number :', N_CLASSES)
    print('\tModel          :', args.model_name)
    print('\tParameters     :', sum(p.numel() for p in model.parameters()))
    print('\tImage size     :', args.image_size)
    print('\tEpochs         :', num_epochs)
    print('\tWorkers        :', num_workers)
    print('\tLog dir        :', logdir)
    print('\tLearning rate  :', learning_rate)
    print('\tBatch size     :', batch_size)
    print('\tPatience       :', args.patience)

    if args.continue_train:
        state = load_model(model, args.checkpoint)
        epoch = state['epoch']
        step = state['step']
        print('Loaded model weights from {}, epoch {}, step {}'.format(
            args.checkpoint, epoch, step))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=[
            F1ScoreCallback(threshold=0.5),
            #F2ScoreCallback(num_classes=N_CLASSES),
            EarlyStoppingCallback(patience=args.patience, min_delta=0.01)
        ],
        logdir=logdir,
        num_epochs=num_epochs,
        verbose=True)

    # by default it only plots loss, works in IPython Notebooks
    #utils.plot_metrics(logdir=logdir, metrics=["loss", "_base/lr"])
    """
    
    INFERENCE TEST
    
    """
    loaders = OrderedDict([("infer", loaders["train"])])
    runner.infer(
        model=model,
        loaders=loaders,
        callbacks=[
            CheckpointCallback(resume=f"{logdir}/checkpoints/best.pth"),
            InferCallback()
        ],
    )
    print(runner.callbacks[1].predictions["logits"])
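InferCallback stores raw model outputs, so the printed "logits" are pre-activation. A hedged sketch of turning them into multi-label predictions, matching the BCEWithLogitsLoss and the 0.5 threshold used above:

import numpy as np
from scipy.special import expit  # numerically stable sigmoid

logits = runner.callbacks[1].predictions["logits"]
probs = expit(logits)                    # per-class probabilities
preds = (probs > 0.5).astype(np.uint8)   # same threshold as F1ScoreCallback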
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--fast', action='store_true')
    parser.add_argument('-dd',
                        '--data-dir',
                        type=str,
                        default='data',
                        help='Data directory for INRIA satellite dataset')
    parser.add_argument('-m',
                        '--model',
                        type=str,
                        default='cls_resnet18',
                        help='')
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=8,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='Epoch to run')
    parser.add_argument('-es',
                        '--early-stopping',
                        type=int,
                        default=None,
                        help='Maximum number of epochs without improvement')
    parser.add_argument('-fe', '--freeze-encoder', action='store_true')
    parser.add_argument('-lr',
                        '--learning-rate',
                        type=float,
                        default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('-l',
                        '--criterion',
                        type=str,
                        default='bce',
                        help='Criterion')
    parser.add_argument('-o',
                        '--optimizer',
                        default='Adam',
                        help='Name of the optimizer')
    parser.add_argument(
        '-c',
        '--checkpoint',
        type=str,
        default=None,
        help='Checkpoint filename to use as initial model weights')
    parser.add_argument('-w',
                        '--workers',
                        default=multiprocessing.cpu_count(),
                        type=int,
                        help='Num workers')
    parser.add_argument('-a',
                        '--augmentations',
                        default='hard',
                        type=str,
                        help='')
    parser.add_argument('-tta',
                        '--tta',
                        default=None,
                        type=str,
                        help='Type of TTA to use [fliplr, d4]')
    parser.add_argument('-tm',
                        '--train-mode',
                        default='random',
                        type=str,
                        help='')
    parser.add_argument('-rm',
                        '--run-mode',
                        default='fit_predict',
                        type=str,
                        help='')
    parser.add_argument('--transfer', default=None, type=str, help='')
    parser.add_argument('--fp16', action='store_true')

    args = parser.parse_args()
    set_manual_seed(args.seed)

    data_dir = args.data_dir
    num_workers = args.workers
    num_epochs = args.epochs
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    early_stopping = args.early_stopping
    model_name = args.model
    optimizer_name = args.optimizer
    image_size = (512, 512)
    fast = args.fast
    augmentations = args.augmentations
    train_mode = args.train_mode
    run_mode = args.run_mode
    log_dir = None
    fp16 = args.fp16
    freeze_encoder = args.freeze_encoder

    run_train = run_mode == 'fit_predict' or run_mode == 'fit'
    run_predict = run_mode == 'fit_predict' or run_mode == 'predict'

    model = maybe_cuda(get_model(model_name, num_classes=1))

    if args.transfer:
        transfer_checkpoint = fs.auto_file(args.transfer)
        print("Transfering weights from model checkpoint", transfer_checkpoint)
        checkpoint = load_checkpoint(transfer_checkpoint)
        pretrained_dict = checkpoint['model_state_dict']

        for name, value in pretrained_dict.items():
            try:
                model.load_state_dict(collections.OrderedDict([(name, value)]),
                                      strict=False)
            except Exception as e:
                print(e)

    checkpoint = None
    if args.checkpoint:
        checkpoint = load_checkpoint(fs.auto_file(args.checkpoint))
        unpack_checkpoint(checkpoint, model=model)

        checkpoint_epoch = checkpoint['epoch']
        print('Loaded model weights from:', args.checkpoint)
        print('Epoch                    :', checkpoint_epoch)
        print('Metrics (Train):', 'f1  :',
              checkpoint['epoch_metrics']['train']['f1_score'], 'loss:',
              checkpoint['epoch_metrics']['train']['loss'])
        print('Metrics (Valid):', 'f1  :',
              checkpoint['epoch_metrics']['valid']['f1_score'], 'loss:',
              checkpoint['epoch_metrics']['valid']['loss'])

        log_dir = os.path.dirname(
            os.path.dirname(fs.auto_file(args.checkpoint)))

    if run_train:

        if freeze_encoder:
            set_trainable(model.encoder, trainable=False, freeze_bn=True)

        criterion = get_loss(args.criterion)
        parameters = get_optimizable_parameters(model)
        optimizer = get_optimizer(optimizer_name, parameters, learning_rate)

        if checkpoint is not None:
            try:
                unpack_checkpoint(checkpoint, optimizer=optimizer)
                print('Restored optimizer state from checkpoint')
            except Exception as e:
                print('Failed to restore optimizer state from checkpoint', e)

        train_loader, valid_loader = get_dataloaders(
            data_dir=data_dir,
            batch_size=batch_size,
            num_workers=num_workers,
            image_size=image_size,
            augmentation=augmentations,
            fast=fast)

        loaders = collections.OrderedDict()
        loaders["train"] = train_loader
        loaders["valid"] = valid_loader

        current_time = datetime.now().strftime('%b%d_%H_%M')
        prefix = f'adversarial/{args.model}/{current_time}_{args.criterion}'

        if fp16:
            prefix += '_fp16'

        if fast:
            prefix += '_fast'

        log_dir = os.path.join('runs', prefix)
        os.makedirs(log_dir, exist_ok=False)

        scheduler = MultiStepLR(optimizer,
                                milestones=[10, 30, 50, 70, 90],
                                gamma=0.5)

        print('Train session    :', prefix)
        print('\tFP16 mode      :', fp16)
        print('\tFast mode      :', args.fast)
        print('\tTrain mode     :', train_mode)
        print('\tEpochs         :', num_epochs)
        print('\tEarly stopping :', early_stopping)
        print('\tWorkers        :', num_workers)
        print('\tData dir       :', data_dir)
        print('\tLog dir        :', log_dir)
        print('\tAugmentations  :', augmentations)
        print('\tTrain size     :', len(train_loader),
              len(train_loader.dataset))
        print('\tValid size     :', len(valid_loader),
              len(valid_loader.dataset))
        print('Model            :', model_name)
        print('\tParameters     :', count_parameters(model))
        print('\tImage size     :', image_size)
        print('\tFreeze encoder :', freeze_encoder)
        print('Optimizer        :', optimizer_name)
        print('\tLearning rate  :', learning_rate)
        print('\tBatch size     :', batch_size)
        print('\tCriterion      :', args.criterion)

        # model training
        visualization_fn = partial(draw_classification_predictions,
                                   class_names=['Train', 'Test'])

        callbacks = [
            F1ScoreCallback(),
            AUCCallback(),
            ShowPolarBatchesCallback(visualization_fn,
                                     metric='f1_score',
                                     minimize=False),
        ]

        if early_stopping:
            callbacks += [
                EarlyStoppingCallback(early_stopping,
                                      metric='auc',
                                      minimize=False)
            ]

        runner = SupervisedRunner(input_key='image')
        runner.train(fp16=fp16,
                     model=model,
                     criterion=criterion,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     callbacks=callbacks,
                     loaders=loaders,
                     logdir=log_dir,
                     num_epochs=num_epochs,
                     verbose=True,
                     main_metric='auc',
                     minimize_metric=False,
                     state_kwargs={"cmd_args": vars(args)})

    if run_predict and not fast:
        # Training is finished. Let's run predictions using best checkpoint weights
        best_checkpoint = load_checkpoint(
            fs.auto_file('best.pth', where=log_dir))
        unpack_checkpoint(best_checkpoint, model=model)

        model.eval()

        train_csv = pd.read_csv(os.path.join(data_dir, 'train.csv'))
        train_csv['id_code'] = train_csv['id_code'].apply(
            lambda x: os.path.join(data_dir, 'train_images', f'{x}.png'))
        test_ds = RetinopathyDataset(train_csv['id_code'],
                                     None,
                                     get_test_aug(image_size),
                                     target_as_array=True)
        test_dl = DataLoader(test_ds,
                             batch_size,
                             pin_memory=True,
                             num_workers=num_workers)

        test_ids = []
        test_preds = []

        # torch.no_grad() only disables autograd when used as a context
        # manager; a bare call is a no-op
        with torch.no_grad():
            for batch in tqdm(test_dl, desc='Inference'):
                inputs = batch['image'].cuda()  # `inputs`, not the builtin `input`
                outputs = model(inputs)
                predictions = to_numpy(outputs['logits'].sigmoid().squeeze(1))
                test_ids.extend(batch['image_id'])
                test_preds.extend(predictions)

        df = pd.DataFrame.from_dict({
            'id_code': test_ids,
            'is_test': test_preds
        })
        df.to_csv(os.path.join(log_dir, 'test_in_train.csv'), index=False)
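The saved CSV holds, for each training image, the model's probability that it looks like test data (the adversarial-validation signal suggested by the 'adversarial' run prefix above). A hypothetical follow-up, not in the source, for inspecting the most test-like training images:

df = pd.read_csv(os.path.join(log_dir, 'test_in_train.csv'))
print(df.sort_values('is_test', ascending=False).head(10))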
Example #10
    print('Training only head for {} epochs with initial lr {}'.format(head_n_epochs, head_lr))
    # freeze everything except the classification head
    for p in model.parameters():
        p.requires_grad = False
    for p in model._fc.parameters():
        p.requires_grad = True
    optimizer = torch.optim.Adam(model.parameters(), lr=head_lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.75, patience=2)
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 loaders=loaders,
                 logdir=logdir,
                 scheduler=scheduler,
                 callbacks=[F1ScoreCallback(), MixupCallback()],
                 num_epochs=head_n_epochs,
                 verbose=True)
    print('Train whole net for {} epochs with initial lr {}'.format(full_n_epochs, full_lr))
    # unfreeze the full network for fine-tuning
    for p in model.parameters():
        p.requires_grad = True
    optimizer = torch.optim.Adam(model.parameters(), lr=full_lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5)
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 loaders=loaders,
                 logdir=logdir,
                 scheduler=scheduler,
                 # closing arguments assumed, mirroring the head-only stage above
                 callbacks=[F1ScoreCallback(), MixupCallback()],
                 num_epochs=full_n_epochs,
                 verbose=True)