Example no. 1
def main():
    torch.cuda.set_device(1)
    # get dataframe
    df, out_dim = get_df(args.kernel_type, args.data_dir, args.train_step)
    print(f"out_dim = {out_dim}")

    # get adaptive margin
    tmp = np.sqrt(1 / np.sqrt(df['landmark_id'].value_counts().sort_index().values))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05

    # get augmentations
    transforms_train, transforms_val = get_transforms(args.image_size)
    print("\ndata augmentation is done!\n")

    # extract images from subfolders 0/0 to 0/3 as a demo subset
    df_demo_0 = df[df['filepath'].str.startswith('/mnt/data/sjx/CS498_DL_Project/data/train/0/0')]
    df_demo_1 = df[df['filepath'].str.startswith('/mnt/data/sjx/CS498_DL_Project/data/train/0/1')]
    df_demo_2 = df[df['filepath'].str.startswith('/mnt/data/sjx/CS498_DL_Project/data/train/0/2')]
    df_demo_3 = df[df['filepath'].str.startswith('/mnt/data/sjx/CS498_DL_Project/data/train/0/3')]

    df_demo = pd.concat([df_demo_0, df_demo_1, df_demo_2, df_demo_3])  # DataFrame.append was removed in pandas 2.0
    # get train and valid dataset
    df = df_demo

    df_train = df[df['fold'] != args.fold]
    df_valid = df[df['fold'] == args.fold].reset_index(drop=True).query("index % 15==0")

    dataset_train = LandmarkDataset(df_train, 'train', 'train', transform=transforms_train)
    dataset_valid = LandmarkDataset(df_valid, 'train', 'val', transform=transforms_val)
    valid_loader = torch.utils.data.DataLoader(dataset_valid, batch_size=args.batch_size, num_workers=args.num_workers, drop_last=True)

    print("dataset has been prepared!\n")
    # model
    print(torch.cuda.current_device())
    model = ModelClass(args.enet_type, out_dim=out_dim).to("cuda:1")  # DataParallel expects the module on device_ids[0]
    model = nn.DataParallel(model, device_ids=[1, 3])

    # loss func
    def criterion(logits_m, target):
        arc = ArcFaceLossAdaptiveMargin(margins=margins, s=80)
        loss_m = arc(logits_m, target, out_dim)
        return loss_m

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)

    # load pretrained
    if len(args.load_from) > 0:
        # Todo:
        checkpoint = torch.load(args.load_from, map_location=lambda storage, loc: storage.cuda(3))
        state_dict = checkpoint['model_state_dict']
        state_dict = {k[7:] if k.startswith('module.') else k: state_dict[k] for k in state_dict.keys()}    
        if args.train_step == 1:
            del state_dict['metric_classify.weight']
            model.load_state_dict(state_dict, strict=False)
        else:
            model.load_state_dict(state_dict, strict=True)        
#             if 'optimizer_state_dict' in checkpoint:
#                 optimizer.load_state_dict(checkpoint['optimizer_state_dict'])   
        del checkpoint, state_dict
        torch.cuda.empty_cache()
        import gc
        gc.collect()


    # lr scheduler
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, args.n_epochs-1)
    scheduler_warmup = GradualWarmupSchedulerV2(optimizer, multiplier=10, total_epoch=1, after_scheduler=scheduler_cosine)

    # train & valid loop
    gap_m_max = 0.
    model_file = os.path.join(args.model_dir, f'{args.kernel_type}_fold{args.fold}.pth')
    for epoch in range(args.start_from_epoch, args.n_epochs+1):

        print(time.ctime(), 'Epoch:', epoch)
        train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=args.batch_size,
                                                   num_workers=args.num_workers,
                                                   shuffle=True, drop_last=True)

        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        val_loss, acc_m, gap_m = val_epoch(model, valid_loader, criterion)
        scheduler_warmup.step(epoch-1)
        if args.local_rank == 0:
            content = time.ctime() + ' ' + f'Fold {args.fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, valid loss: {(val_loss):.5f}, acc_m: {(acc_m):.6f}, gap_m: {(gap_m):.6f}.'
            print(content)
            with open(os.path.join(args.log_dir, f'{args.kernel_type}.txt'), 'a') as appender:
                appender.write(content + '\n')

            if gap_m > gap_m_max:
                print('gap_m_max ({:.6f} --> {:.6f}). Saving model ...'.format(gap_m_max, gap_m))
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }, model_file)
                gap_m_max = gap_m

        if epoch == args.stop_at_epoch:
            print(time.ctime(), 'Training Finished!')
            break

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, os.path.join(args.model_dir, f'{args.kernel_type}_fold{args.fold}_final.pth'))
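
The margins computed in this example map per-class image counts to per-class ArcFace margins: rare landmarks get a larger margin than frequent ones, rescaled into [0.05, 0.50]. A minimal standalone sketch of that mapping, with made-up counts:

import numpy as np

# Hypothetical per-class counts (one entry per landmark_id), purely for illustration.
counts = np.array([5, 50, 500, 5000])

# Same mapping as above: rarer classes end up with larger margins.
tmp = np.sqrt(1 / np.sqrt(counts))
margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05
print(margins)  # ~[0.50, 0.26, 0.13, 0.05]: the rarest class gets the largest margin
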
Example no. 2
def run(fold, df, meta_features, n_meta_features, transforms_train,
        transforms_val, mel_idx):
    if args.DEBUG:
        args.n_epochs = 5
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    dataset_train = MelanomaDataset(df_train,
                                    'train',
                                    meta_features,
                                    transform=transforms_train)
    dataset_valid = MelanomaDataset(df_valid,
                                    'valid',
                                    meta_features,
                                    transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)  # random sampling without replacement
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)
    model = ModelClass(
        args.enet_type,
        n_meta_features=n_meta_features,
        n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
        out_dim=args.out_dim,
        pretrained=True)
    if DP:
        model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)

    auc_max = 0.
    auc_20_max = 0.
    model_file = os.path.join(args.model_dir,
                              f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_best_20_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        model = nn.DataParallel(model)
    #     scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs - 1)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))
    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Epoch {epoch}', f'Fold {fold}')
        #         scheduler_warmup.step(epoch - 1)

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_20 = val_epoch(
            model, valid_loader, mel_idx, is_ext=df_valid['is_ext'].values)

        content = time.ctime(
        ) + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2: scheduler_warmup.step()  # bug workaround

        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_20_max, auc_20))
            torch.save(model.state_dict(), model_file2)
            auc_20_max = auc_20

    torch.save(model.state_dict(), model_file3)
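
GradualWarmupSchedulerV2 is a custom wrapper from the competition code: warm up for one epoch, then hand over to the cosine schedule (the extra step at epoch 2 in the loop above is labelled a bug workaround for that wrapper). A rough stand-in built only from stock PyTorch schedulers is sketched below; it reproduces the warmup-then-cosine shape but not the exact multiplier=10 behaviour of the wrapper used above, and the model, learning rate, and epoch count are placeholders.

import torch
from torch import nn, optim

model = nn.Linear(10, 2)
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# One linear warmup epoch, then cosine annealing over the remaining epochs.
warmup = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=1)
cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=14)
scheduler = optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[1])

for epoch in range(1, 16):
    # train_epoch(...) / val_epoch(...) would run here
    scheduler.step()
    print(epoch, optimizer.param_groups[0]['lr'])
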
Example no. 3
def run(fold, df, meta_features, n_meta_features, transforms_train,
        transforms_val, mel_idx):
    # following the k-fold method:
    if args.DEBUG:
        args.n_epochs = 5
        # the data fold whose id equals `fold` is used for validation
        # the remaining folds are used for training
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    # instantiate the dataset objects (training + validation)
    dataset_train = MelanomaDataset(df_train,
                                    'train',
                                    meta_features,
                                    transform=transforms_train)
    dataset_valid = MelanomaDataset(df_valid,
                                    'valid',
                                    meta_features,
                                    transform=transforms_val)
    # instantiate the data loaders (training + validation)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    # instantiate the model
    model = ModelClass(
        args.enet_type,  # e.g. ResNet
        n_meta_features=n_meta_features,  # e.g. ['sex', 'age_approx', 'n_images', 'image_size']
        n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
        out_dim=args.out_dim,
        pretrained=True)
    if DP:
        model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)

    # initialize the best-metric trackers
    auc_max = 0.
    auc_20_max = 0.
    # define the files where the model parameters will be stored
    model_file = os.path.join(args.model_dir,
                              f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_best_20_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        model = nn.DataParallel(model)


#     scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs - 1)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Fold {fold}, Epoch {epoch}')
        #         scheduler_warmup.step(epoch - 1)

        # train loss
        train_loss = train_epoch(model, train_loader, optimizer)
        # validation loss
        val_loss, acc, auc, auc_20 = val_epoch(
            model, valid_loader, mel_idx, is_ext=df_valid['is_ext'].values)

        content = time.ctime(
        ) + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2: scheduler_warmup.step()  # bug workaround

        # save the model parameters to the corresponding files
        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_20_max, auc_20))
            torch.save(model.state_dict(), model_file2)
            auc_20_max = auc_20
    # save the final-epoch model parameters to model_file3
    torch.save(model.state_dict(), model_file3)
def main():

    # get dataframe
    df, out_dim = get_df(args.kernel_type, args.data_dir, args.train_step)
    print(f"out_dim = {out_dim}")

    # get adaptive margin
    tmp = np.sqrt(
        1 / np.sqrt(df['landmark_id'].value_counts().sort_index().values))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05

    # get augmentations
    transforms_train, transforms_val = get_transforms(args.image_size)

    # get train and valid dataset
    df_train = df[df['fold'] != args.fold]
    df_valid = df[df['fold'] == args.fold].reset_index(
        drop=True).query("index % 15==0")

    dataset_train = LandmarkDataset(df_train,
                                    'train',
                                    'train',
                                    transform=transforms_train)
    dataset_valid = LandmarkDataset(df_valid,
                                    'train',
                                    'val',
                                    transform=transforms_val)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    # model
    model = ModelClass(args.enet_type, out_dim=out_dim)
    model = model.cuda()
    model = apex.parallel.convert_syncbn_model(model)

    # loss func
    def criterion(logits_m, target):
        arc = ArcFaceLossAdaptiveMargin(margins=margins, s=80)
        loss_m = arc(logits_m, target, out_dim)
        return loss_m

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # load pretrained
    if len(args.load_from) > 0:
        checkpoint = torch.load(args.load_from,
                                map_location='cuda:{}'.format(args.local_rank))
        state_dict = checkpoint['model_state_dict']
        state_dict = {
            k[7:] if k.startswith('module.') else k: state_dict[k]
            for k in state_dict.keys()
        }
        if args.train_step == 1:
            del state_dict['metric_classify.weight']
            model.load_state_dict(state_dict, strict=False)
        else:
            model.load_state_dict(state_dict, strict=True)


#             if 'optimizer_state_dict' in checkpoint:
#                 optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        del checkpoint, state_dict
        torch.cuda.empty_cache()
        import gc
        gc.collect()

    model = DistributedDataParallel(model, delay_allreduce=True)

    # lr scheduler
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)

    # train & valid loop
    gap_m_max = 0.
    model_file = os.path.join(args.model_dir,
                              f'{args.kernel_type}_fold{args.fold}.pth')
    for epoch in range(args.start_from_epoch, args.n_epochs + 1):

        print(time.ctime(), 'Epoch:', epoch)
        scheduler_warmup.step(epoch - 1)

        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_train)
        train_sampler.set_epoch(epoch)

        train_loader = torch.utils.data.DataLoader(
            dataset_train,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            shuffle=train_sampler is None,
            sampler=train_sampler,
            drop_last=True)

        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        val_loss, acc_m, gap_m = val_epoch(model, valid_loader, criterion)

        if args.local_rank == 0:
            content = time.ctime(
            ) + ' ' + f'Fold {args.fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, valid loss: {(val_loss):.5f}, acc_m: {(acc_m):.6f}, gap_m: {(gap_m):.6f}.'
            print(content)
            with open(os.path.join(args.log_dir, f'{args.kernel_type}.txt'),
                      'a') as appender:
                appender.write(content + '\n')

            if gap_m > gap_m_max:
                print('gap_m_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                    gap_m_max, gap_m))
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                    }, model_file)
                gap_m_max = gap_m

        if epoch == args.stop_at_epoch:
            print(time.ctime(), 'Training Finished!')
            break

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        os.path.join(args.model_dir,
                     f'{args.kernel_type}_fold{args.fold}_final.pth'))
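
Examples 1 and 3 both strip a 'module.' prefix from checkpoint keys before loading, because nn.DataParallel / DistributedDataParallel prepend it to every parameter name. A small self-contained sketch of that pattern, using a toy linear model rather than the competition model:

from torch import nn

def strip_module_prefix(state_dict):
    # Remove the 'module.' prefix added by DataParallel / DistributedDataParallel
    # so the weights load into an unwrapped model.
    return {k[7:] if k.startswith('module.') else k: v for k, v in state_dict.items()}

# Toy round trip: save from a DataParallel-wrapped model, load into a bare one.
wrapped = nn.DataParallel(nn.Linear(4, 2))
bare = nn.Linear(4, 2)
bare.load_state_dict(strip_module_prefix(wrapped.state_dict()))
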
Example no. 5
def run(fold, df, meta_features, n_meta_features, transforms_train, transforms_val, mel_idx):
    if args.DEBUG:
        args.n_epochs = 5
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    dataset_train = MelanomaDataset(df_train, 'train', meta_features, transform=transforms_train)
    dataset_valid = MelanomaDataset(df_valid, 'valid', meta_features, transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=args.batch_size,
                                               sampler=RandomSampler(dataset_train), num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid, batch_size=args.batch_size, num_workers=args.num_workers)

    model = ModelClass()
    model = model.to(device)

    auc_max = 0.
    auc_20_max = 0.
    model_file = os.path.join(args.model_dir, f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(args.model_dir, f'{args.kernel_type}_best_20_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir, f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.AdamW(model.parameters(), lr=args.init_lr, weight_decay=args.weight_decay)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(optimizer, multiplier=10, total_epoch=1,
                                                after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))

    print('Continuing with model from ' + model_file)
    try:
        checkpoint = torch.load(model_file)
        model.load_state_dict(checkpoint, strict=False)
    except Exception as e:
        print(f'Could not load checkpoint from {model_file}: {e}')

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Fold {fold}, Epoch {epoch}')
        #         scheduler_warmup.step(epoch - 1)

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_20 = val_epoch(model, valid_loader, mel_idx, is_ext=df_valid['is_ext'].values)

        content = time.ctime() + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'), 'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2: scheduler_warmup.step()  # bug workaround

        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(auc_20_max, auc_20))
            torch.save(model.state_dict(), model_file2)
            auc_20_max = auc_20
        torch.save({
            'net': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, model_file3)
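
The torch.save call at the end of each epoch above writes a dict with 'net' and 'optimizer' keys, so a later run can restore both the model weights and the optimizer state. A minimal resume sketch with a toy model and a hypothetical local path (not the competition files):

import torch
from torch import nn, optim

model = nn.Linear(4, 2)
optimizer = optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-2)

# Write a checkpoint in the same {'net', 'optimizer'} format as above ...
torch.save({'net': model.state_dict(), 'optimizer': optimizer.state_dict()}, 'demo_ckpt.pth')

# ... and restore both pieces of state from it.
checkpoint = torch.load('demo_ckpt.pth', map_location='cpu')
model.load_state_dict(checkpoint['net'])
optimizer.load_state_dict(checkpoint['optimizer'])
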
Example no. 6
def run(folds, df, transforms_train, transforms_val):
    if args.DEBUG:
        args.n_epochs = 3
        df_train = df[df['fold'].isin(folds)].sample(args.batch_size * 4)
        df_valid = df[~df['fold'].isin(folds)].sample(args.batch_size * 4)
    else:
        df_train = df[df['fold'].isin(folds)]
        df_valid = df[~df['fold'].isin(folds)]

    dataset_train = RetinalDataset(df_train,
                                   'train',
                                   transform=transforms_train)
    dataset_valid = RetinalDataset(df_valid, 'valid', transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)
    model = ModelClass(args.enet_type,
                       out_dim=args.out_dim,
                       pretrained=True,
                       freeze_cnn=args.freeze_cnn,
                       load_model=args.load_model,
                       pretrain_cnn=args.pretrain_cnn,
                       pretrain_file=args.pretrain_file)
    para_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    content = f'Number of trainable parameters:{para_num}\n'
    if DP:
        pass
    model = model.to(device)

    score_max = 0.
    if args.DEBUG:
        model_file_best = os.path.join(args.model_dir + '/debug',
                                       f'{args.kernel_type}_best.pth')
        model_file_final = os.path.join(args.model_dir + '/debug',
                                        f'{args.kernel_type}_final.pth')
    else:
        model_file_best = os.path.join(args.model_dir,
                                       f'{args.kernel_type}_best.pth')
        model_file_final = os.path.join(args.model_dir,
                                        f'{args.kernel_type}_final.pth')

    if args.freeze_cnn:
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.init_lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if DP:
        pass
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)
    nums = dataset_train.get_num()
    content += f'total num of train: {len(dataset_train)}, class nums: {nums}' + '\n'
    nums = dataset_valid.get_num()
    content += f'total num of val: {len(dataset_valid)}, class nums: {nums}' + '\n'
    if args.DEBUG:
        with open(
                os.path.join(args.log_dir + '/debug',
                             f'log_{args.kernel_type}.txt'), 'a') as appender:
            appender.write(content)
    else:
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content)
    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Epoch {epoch}')

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, mean_score, scores = val_epoch(model, valid_loader)

        content = time.ctime(
        ) + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {val_loss:.5f}, mean_score: {mean_score:.4f}, scores: {scores[0]:.4f} {scores[1]:.4f} {scores[2]:.4f} {scores[3]:.4f} {scores[4]:.4f} {scores[5]:.4f} {scores[6]:.4f}.'
        print(content)
        if args.DEBUG:
            with open(
                    os.path.join(args.log_dir + '/debug',
                                 f'log_{args.kernel_type}.txt'),
                    'a') as appender:
                appender.write(content + '\n')
        else:
            with open(
                    os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                    'a') as appender:
                appender.write(content + '\n')
        scheduler_warmup.step()
        if epoch == 2: scheduler_warmup.step()  # bug workaround

        if mean_score > score_max:
            print('score_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                score_max, mean_score))
            torch.save(model.state_dict(), model_file_best)
            score_max = mean_score

    torch.save(model.state_dict(), model_file_final)
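
When args.freeze_cnn is set, the example above passes only parameters with requires_grad=True to the optimizer and reports the trainable-parameter count. A toy illustration of that pattern (made-up layers, not the actual ModelClass):

from torch import nn, optim

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 3))
for p in model[0].parameters():      # pretend the first layer is the frozen CNN backbone
    p.requires_grad = False

para_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {para_num}')  # 51 here: 16*3 weights + 3 biases

optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=3e-5)
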
Example no. 7
def main(args):

    # get dataframe
    df = get_df(args.groups)

    # get adaptive margin
    tmp = np.sqrt(
        1 / np.sqrt(df['label_group'].value_counts().sort_index().values))
    margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05

    # get augmentations
    transforms_train, transforms_val = get_transforms(args.image_size,
                                                      args.stage)

    # get train and valid dataset
    df_train = (df[df['fold'] != args.fold] if not args.full else df).copy()  # copy to avoid SettingWithCopyWarning
    df_train['label_group'] = LabelEncoder().fit_transform(
        df_train.label_group)

    df_valid = df[df['fold'] == args.fold]

    out_dim = df_train.label_group.nunique()
    print(f"out_dim = {out_dim}")

    dataset_train = ShoppeDataset(df_train,
                                  'train',
                                  transform=transforms_train)
    dataset_valid = ShoppeDataset(df_valid, 'val', transform=transforms_val)

    print(
        f'Train on {len(df_train)} images, validate on {len(df_valid)} images')

    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               shuffle=True,
                                               drop_last=True)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    loss_config = decode_config(args.loss_config)
    # model
    if args.enet_type == 'resnest50':
        model = Resnest50(out_dim=out_dim, loss_config=loss_config, args=args)
    else:
        model = Model(args.enet_type,
                      out_dim=out_dim,
                      loss_config=loss_config,
                      args=args)
    model = model.cuda()

    # loss func
    criterion = get_criterion(args, out_dim, margins)

    # optimizer
    optimizer = optim.AdamW(model.parameters(), lr=args.init_lr)

    # load pretrained
    if args.load_from and args.load_from != 'none':
        checkpoint = torch.load(args.load_from, map_location='cuda:0')
        state_dict = checkpoint['model_state_dict']
        state_dict = {
            k[7:] if k.startswith('module.') else k: state_dict[k]
            for k in state_dict.keys()
        }
        model.load_state_dict(state_dict, strict=True)
        del checkpoint, state_dict
        torch.cuda.empty_cache()
        gc.collect()
        print(f"Loaded weight from {args.load_from}")

    # lr scheduler
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    warmup_epochs = args.warmup_epochs if args.stage == 1 else 1
    print(warmup_epochs)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=warmup_epochs,
        after_scheduler=scheduler_cosine)

    # train & valid loop
    best_score = -1
    model_file = os.path.join(
        args.model_dir,
        weight_file(args.kernel_type, args.fold, args.stage,
                    loss_config.loss_type, out_dim))
    for epoch in range(args.start_from_epoch, args.n_epochs + 1):

        print(time.ctime(), f'Epoch: {epoch}/{args.n_epochs}')
        scheduler_warmup.step(epoch - 1)

        train_loss, acc_list = train_epoch(model, train_loader, optimizer,
                                           criterion)
        f1score = val_epoch(model, valid_loader, criterion, df_valid, args)

        content = time.ctime() + ' ' + \
            (
                f'Fold {args.fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f},'
                f' train acc {np.mean(acc_list):.5f}, f1score: {(f1score):.6f}.')

        print(content)
        with open(os.path.join(args.log_dir, f'{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content + '\n')

        if f1score > best_score:
            print('best f1 score ({:.6f} --> {:.6f}). Saving model ...'.format(
                best_score, f1score))
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }, model_file)
            best_score = f1score

        if epoch == args.stop_at_epoch:
            print(time.ctime(), 'Training Finished!')
            break

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, model_file)
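
In this example out_dim is taken from the re-encoded label_group column: LabelEncoder maps arbitrary group ids to contiguous class indices 0..out_dim-1, which is what the classification head expects. A tiny illustration with made-up group ids:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df_train = pd.DataFrame({'label_group': [1001, 7, 7, 53000, 1001]})
df_train['label_group'] = LabelEncoder().fit_transform(df_train['label_group'])
out_dim = df_train['label_group'].nunique()
print(df_train['label_group'].tolist(), out_dim)  # [1, 0, 0, 2, 1] 3
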