Example #1
File: train.py  Project: gittigxuy/imet
def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir

    logger.info('=' * 50)

    train_loader, val_loader, test_loader = load_data(args.fold)
    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(model, (3, config.model.input_size, config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

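    # Optional head-only warmup when training from scratch: freeze_layers() presumably
    # freezes the backbone so that only the head is trained for the first warmup.steps batches.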
    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)

        freeze_layers(model)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)
        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)

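    # A fresh optimizer for the main run; any optimizer created for the warmup above is discarded.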
    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)
        model_arch = last_checkpoint['arch'].replace('se_', 'se')

        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint.keys():
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint['epoch'] if 'epoch' in last_checkpoint.keys() else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr != 0:
            set_lr(optimizer, float(args.lr))
        elif 'lr' in config.optimizer.params:
            set_lr(optimizer, config.optimizer.params.lr)
        elif 'base_lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.base_lr)

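    # Scheduler setup: build the scheduler(s) declared in the config, or, with --cosine,
    # switch to cosine annealing with warm restarts starting from config.cosine.start_lr.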
    if not args.cosine:
        lr_scheduler = get_scheduler(config.scheduler, optimizer, last_epoch=
                                     (last_epoch if config.scheduler.name != 'cyclic_lr' else -1))
        assert config.scheduler2.name == ''
        lr_scheduler2 = get_scheduler(config.scheduler2, optimizer, last_epoch=last_epoch) \
                        if config.scheduler2.name else None
    else:
        epoch_size = min(len(train_loader), config.train.max_steps_per_epoch) \
                     * config.train.batch_size

        set_lr(optimizer, float(config.cosine.start_lr))
        lr_scheduler = CosineLRWithRestarts(optimizer,
                                            batch_size=config.train.batch_size,
                                            epoch_size=epoch_size,
                                            restart_period=config.cosine.period,
                                            period_inc=config.cosine.period_inc,
                                            max_period=config.cosine.max_period)
        lr_scheduler2 = None

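    # Inference-only mode: generate out-of-fold (validation) or test predictions
    # from the loaded weights, then exit.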
    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

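        # For step-wise schedulers: if the LR has just been reduced, reload the best
        # snapshot so training continues from the best weights at the new, lower LR.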
        if not is_scheduler_continuous(lr_scheduler) and lr_scheduler2 is None:
            # if we have just reduced LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert last_checkpoint['arch'] == config.model.arch
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)
                last_lr = lr

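        # Manual LR decay at the configured milestone epochs: scale base_lr by
        # lr_decay_coeff for every milestone passed and rebuild the scheduler.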
        if config.train.lr_decay_coeff != 0 and epoch in config.train.lr_decay_milestones:
            n_cycles = config.train.lr_decay_milestones.index(epoch) + 1
            total_coeff = config.train.lr_decay_coeff ** n_cycles
            logger.info(f'artificial LR scheduler: made {n_cycles} cycles, decreasing LR by {total_coeff}')

            set_lr(optimizer, config.scheduler.params.base_lr * total_coeff)
            lr_scheduler = get_scheduler(config.scheduler, optimizer,
                                         coeff=total_coeff, last_epoch=-1)
                                         # (last_epoch if config.scheduler.name != 'cyclic_lr' else -1))

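        # After a cosine warm restart, lower the best-score bar so that new snapshots
        # can be saved during the next annealing cycle.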
        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info('cosine annealing restarted, resetting the best metric')
                best_score = min(config.cosine.min_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch,
                    lr_scheduler, lr_scheduler2, config.train.max_steps_per_epoch)
        score, _, _ = validate(val_loader, model, epoch)

        if type(lr_scheduler) == ReduceLROnPlateau:
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        if type(lr_scheduler2) == ReduceLROnPlateau:
            lr_scheduler2.step(metrics=score)
        elif lr_scheduler2 and not is_scheduler_continuous(lr_scheduler2):
            lr_scheduler2.step()

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        if is_best:
            best_model_path = os.path.join(model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }

            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
Example #2
def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir

    logger.info('=' * 50)
    # logger.info(f'hyperparameters: {params}')

    train_loader, val_loader, test_loader, label_encoder = load_data(args.fold)
    model = create_model()

    optimizer = get_optimizer(config, model.parameters())
    lr_scheduler = get_scheduler(config, optimizer)
    lr_scheduler2 = get_scheduler(
        config, optimizer) if config.scheduler2.name else None
    criterion = get_loss(config)

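    # Resume from a checkpoint when --weights is given; otherwise start training from epoch 1.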
    if args.weights is None:
        last_epoch = 0
        logger.info(f'training will start from epoch {last_epoch+1}')
    else:
        last_checkpoint = torch.load(args.weights)
        assert last_checkpoint['arch'] == config.model.arch
        model.load_state_dict(last_checkpoint['state_dict'])
        optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint {args.weights} was loaded.')

        last_epoch = last_checkpoint['epoch']
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr_override != 0:
            set_lr(optimizer, float(args.lr_override))
        elif 'lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.lr)

    if args.gen_predict:
        print('inference mode')
        generate_submission(val_loader, test_loader, model, label_encoder,
                            last_epoch, args.weights)
        sys.exit(0)

    if args.gen_features:
        print('inference mode')
        generate_features(test_loader, model, args.weights)
        sys.exit(0)

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs + 1):
        logger.info('-' * 50)

        # if not is_scheduler_continuous(config.scheduler.name):
        #     # if we have just reduced LR, reload the best saved model
        #     lr = get_lr(optimizer)
        #     logger.info(f'learning rate {lr}')
        #
        #     if lr < last_lr - 1e-10 and best_model_path is not None:
        #         last_checkpoint = torch.load(os.path.join(model_dir, best_model_path))
        #         assert(last_checkpoint['arch']==config.model.arch)
        #         model.load_state_dict(last_checkpoint['state_dict'])
        #         optimizer.load_state_dict(last_checkpoint['optimizer'])
        #         logger.info(f'checkpoint {best_model_path} was loaded.')
        #         set_lr(optimizer, lr)
        #         last_lr = lr
        #
        #     if lr < config.train.min_lr * 1.01:
        #         logger.info('reached minimum LR, stopping')
        #         break

        get_lr(optimizer)

        train(train_loader, model, criterion, optimizer, epoch, lr_scheduler,
              lr_scheduler2)
        score = validate(val_loader, model, epoch)

        if not is_scheduler_continuous(config.scheduler.name):
            lr_scheduler.step(score)
        if lr_scheduler2 and not is_scheduler_continuous(
                config.scheduler.name):
            lr_scheduler2.step(score)

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        data_to_save = {
            'epoch': epoch,
            'arch': config.model.arch,
            'state_dict': model.state_dict(),
            'best_score': best_score,
            'score': score,
            'optimizer': optimizer.state_dict(),
            'options': config
        }

        filename = config.version
        if is_best:
            best_model_path = f'{filename}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth'
            save_checkpoint(data_to_save, best_model_path, model_dir)

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
Example #3
File: train.py  Project: gittigxuy/imet
def lr_finder(train_loader: Any, model: Any, criterion: Any, optimizer: Any) -> None:
    ''' Runs an LR range test: sweeps the learning rate exponentially and records the smoothed loss to estimate good LR bounds. '''
    logger.info('lr_finder called')

    batch_time = AverageMeter()
    num_steps = min(len(train_loader), config.train.lr_finder.num_steps)
    logger.info(f'total batches: {num_steps}')
    end = time.time()
    lr_str = ''
    model.train()

    init_value = config.train.lr_finder.init_value
    final_value = config.train.lr_finder.final_value
    beta = config.train.lr_finder.beta

    mult = (final_value / init_value) ** (1 / (num_steps - 1))
    lr = init_value

    avg_loss = best_loss = 0.0
    losses = np.zeros(num_steps)
    logs = np.zeros(num_steps)

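    # Sweep the LR exponentially from init_value to final_value, one batch per step,
    # tracking an exponentially smoothed loss and stopping early if the loss explodes.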
    for i, (input_, target) in enumerate(train_loader):
        if i >= num_steps:
            break

        set_lr(optimizer, lr)

        output = model(input_.cuda())
        loss = criterion(output, target.cuda())
        loss_val = loss.data.item()

        predict = (output.detach() > 0.1).type(torch.FloatTensor)
        f2 = F_score(predict, target, beta=2)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        lr_str = f'\tlr {lr:.08f}'

        # compute the smoothed loss
        avg_loss = beta * avg_loss + (1 - beta) * loss_val
        smoothed_loss = avg_loss / (1 - beta ** (i + 1))

        # stop if the loss is exploding
        if i > 0 and smoothed_loss > 4 * best_loss:
            break

        # record the best loss
        if smoothed_loss < best_loss or i == 0:
            best_loss = smoothed_loss

        # store the values
        losses[i] = smoothed_loss
        logs[i] = math.log10(lr)

        # update the lr for the next step
        lr *= mult

        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.train.log_freq == 0:
            logger.info(f'lr_finder [{i}/{num_steps}]\t'
                        f'time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        f'loss {loss:.4f} ({smoothed_loss:.4f})\t'
                        f'F2 {f2:.4f} {lr_str}')

    np.savez(os.path.join(config.experiment_dir, f'lr_finder_{config.version}'),
             logs=logs, losses=losses)

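    # Heuristic LR bounds: the steepest drop of the smoothed loss marks the lower LR,
    # and the loss minimum, scaled down by MAGIC_COEFF, marks the upper LR.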
    d1 = np.zeros_like(losses)
    d1[1:] = losses[1:] - losses[:-1]
    first, last = np.argmin(d1), np.argmin(losses)

    MAGIC_COEFF = 4

    highest_lr = 10 ** logs[last]
    best_high_lr = highest_lr / MAGIC_COEFF
    best_low_lr = 10 ** logs[first]
    logger.info(f'best_low_lr={best_low_lr} best_high_lr={best_high_lr} '
                f'highest_lr={highest_lr}')

    def find_nearest(array: np.ndarray, value: float) -> int:
        return int(np.abs(array - value).argmin())

    last = find_nearest(logs, math.log10(best_high_lr))
    logger.info(f'first={first} last={last}')

    import matplotlib.pyplot as plt
    plt.plot(logs, losses, '-D', markevery=[first, last])
    plt.savefig(os.path.join(config.experiment_dir, 'lr_finder_plot.png'))
Example #4
def run(hyperparams: Optional[Dict[str, str]] = None) -> float:
    np.random.seed(0)
    logger.info('=' * 50)

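    # When hyperparameter-search overrides are passed in, give the run its own
    # hash-named subdirectory and apply the overrides to the augmentation config.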
    if hyperparams:
        hash = hashlib.sha224(str(hyperparams).encode()).hexdigest()[:8]
        model_dir = os.path.join(config.general.experiment_dir, f'{hash}')

        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        str_params = str(hyperparams)
        logger.info(f'hyperparameters: {hyperparams}')
        config.augmentations.update(hyperparams)
    else:
        model_dir = config.general.experiment_dir

    train_loader, val_loader, test_loader = load_data(args.fold)
    epoch_size = min(len(train_loader), config.train.max_steps_per_epoch)

    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(
            model, (config.model.num_channels * 2, config.model.input_size,
                    config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.epochs} epochs')
        logger.info(f'max lr will be {config.optimizer.params.lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer, epoch_size)
        freeze_layers(model)

        for epoch in range(config.train.warmup.epochs):
            train_epoch(train_loader, model, criterion, optimizer, epoch,
                        warmup_scheduler)

        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.epochs} epochs')
        logger.info(f'max lr will be {config.optimizer.params.lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer, epoch_size)

        for epoch in range(config.train.warmup.epochs):
            train_epoch(train_loader, model, criterion, optimizer, epoch,
                        warmup_scheduler)

    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)
        model_arch = last_checkpoint['arch'].replace('se_', 'se')

        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint.keys():
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint[
            'epoch'] if 'epoch' in last_checkpoint.keys() else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr != 0:
            set_lr(optimizer, float(args.lr))
        elif 'lr' in config.optimizer.params:
            set_lr(optimizer, config.optimizer.params.lr)
        elif 'base_lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.base_lr)

    lr_scheduler = get_scheduler(config, optimizer, epoch_size=epoch_size)

    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

        if not is_scheduler_continuous(lr_scheduler):
            # if we have just reduced LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert (last_checkpoint['arch'] == config.model.arch)
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)
                last_lr = lr

        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info(
                    'cosine annealing restarted, resetting the best metric')
                best_score = min(config.train.restart_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch,
                    lr_scheduler)
        score, _ = validate(val_loader, model, epoch)

        if type(lr_scheduler) == ReduceLROnPlateau:
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        if is_best:
            best_model_path = os.path.join(
                model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }

            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score