Example #1
def run(config):
    train_dir = config.train.dir

    model = get_model(config).cuda()
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model.parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            model, optimizer, checkpoint)
    else:
        last_epoch, step = -1, -1

    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))
    scheduler = get_scheduler(config, optimizer, last_epoch)

    #     dataloaders = {split:get_dataloader(config, split, get_transform(config, split))
    #                    for split in ['train', 'val']}

    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config, get_transform(config)),
        'val': get_valid_dataloaders(config)[0]
    }
    writer = SummaryWriter(train_dir)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          last_epoch + 1)
Example #2
def run(config, folds_dir, balanced):
    model = get_model(config[MODEL_NAME], config[MODEL_PARAMS]).cuda()
    criterion = get_loss(config[LOSS_NAME], config[LOSS_PARAMS])
    optimizer = get_optimizer(config[OPTIM_NAME],
                              model.parameters(),
                              optimizer_params=config[OPTIM_PARAMS])

    last_epoch = -1
    scheduler = get_scheduler(config[SCHEDULER_NAME], optimizer, last_epoch,
                              config[SCHEDULER_PARAMS])

    datasets = {
        stage: CustomDataset(folds_dir, stage, config[FOLD_ID],
                             config[DATA_PREFIX], config[INPUT_SIZE])
        for stage in ['train', 'test']
    }

    print('Loading sampler')
    if balanced:
        train_sampler = BalancedBatchSampler(datasets['train'])
    else:
        train_sampler = None
    print('Sampler loaded')
    # The balanced sampler is built on the train split, so only the train loader uses it.
    dataloaders = {
        stage: get_dataloader(datasets[stage], config[BATCH_SIZE],
                              train_sampler if stage == 'train' else None)
        for stage in ['train', 'test']
    }

    writer = SummaryWriter(config[TRAIN_DIR])
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          last_epoch + 1)
Example #3
def run(config_file):
    config = load_config(config_file)

    os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            task='cls'
        )
        for phase in ['train', 'valid']
    }

    # create model
    model = CustomNet(config.model.encoder, config.data.num_classes)

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.base_params(), 'lr': config.optimizer.params.encoder_lr},
        {'params': model.fresh_params(), 'lr': config.optimizer.params.decoder_lr}
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [MultiClassAccuracyCallback(threshold=0.5), F1ScoreCallback()]
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(CheckpointCallback(resume=config.work_dir + '/checkpoints/best_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )
Example #4
def run(config):
    model = get_model(config[MODEL_NAME], config[MODEL_PARAMS]).cuda()
    criterion = get_loss(config[LOSS_NAME], config[LOSS_PARAMS])
    optimizer = get_optimizer(config[OPTIM_NAME],
                              model.parameters(),
                              optimizer_params=config[OPTIM_PARAMS])

    last_epoch = -1
    scheduler = get_scheduler(config[SCHEDULER_NAME], optimizer, last_epoch,
                              config[SCHEDULER_PARAMS])

    datasets = {
        stage: CustomDataset(DATA_DIR, stage, config[FOLD_ID],
                             config[DATA_PREFIX], config[INPUT_SIZE])
        for stage in ['train', 'test']
    }

    dataloaders = {
        stage: get_dataloader(datasets[stage], config[BATCH_SIZE])
        for stage in ['train', 'test']
    }

    writer = SummaryWriter(config[TRAIN_DIR])
    clip_grad_value_(model.parameters(), 2.0)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          last_epoch + 1)
Example #5
    def __init__(self, args, logger):
        self.args = args
        self.logger = logger
        self.writer = SummaryWriter(args.log_dir)
        cudnn.enabled = True

        # set up model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = get_aux_net(args.network.arch)(aux_classes=args.aux_classes + 1, classes=args.n_classes)
        self.model = self.model.to(self.device)
        wandb.watch(self.model)

        if args.mode == 'train':
            # set up optimizer, lr scheduler and loss functions
            optimizer = get_optimizer(self.args.training.optimizer)
            optimizer_params = {k: v for k, v in self.args.training.optimizer.items() if k != "name"}
            self.optimizer = optimizer(self.model.parameters(), **optimizer_params)
            self.scheduler = get_scheduler(self.optimizer, self.args.training.lr_scheduler)

            self.class_loss_func = nn.CrossEntropyLoss()

            self.start_iter = 0

            # resume
            if args.training.resume:
                self.load(args.model_dir + '/' + args.training.resume)

            cudnn.benchmark = True

        elif args.mode == 'val':
            self.load(os.path.join(args.model_dir, args.validation.model))
        else:
            self.load(os.path.join(args.model_dir, args.testing.model))
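Note: Example #5 builds its optimizer by looking up a class from the config's "name" key and passing the remaining keys as keyword arguments. A minimal self-contained sketch of that pattern (the config shape and the small name-to-class table are assumptions for illustration, not the repository's actual get_optimizer):

import torch

def get_optimizer_cls(name):
    # Hypothetical name-to-class table; a real factory may cover more optimizers.
    table = {'sgd': torch.optim.SGD, 'adam': torch.optim.Adam}
    return table[name.lower()]

# Assumed config shape: hyperparameters plus a 'name' key in one flat dict.
optimizer_cfg = {'name': 'sgd', 'lr': 0.01, 'momentum': 0.9, 'weight_decay': 1e-4}

model = torch.nn.Linear(10, 2)  # stand-in model
optimizer_cls = get_optimizer_cls(optimizer_cfg['name'])
optimizer_params = {k: v for k, v in optimizer_cfg.items() if k != 'name'}
optimizer = optimizer_cls(model.parameters(), **optimizer_params)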
Example #6
def run(config):
    train_dir = config.train.dir
    model = get_model(config, model_type).to(device)
    print('The number of parameters : %d' % count_parameters(model))
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model)

    checkpoint = utils.checkpoint.get_initial_checkpoint(config,
                                                         model_type=model_type)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            model, optimizer, checkpoint, model_type=model_type)
    else:
        last_epoch, step = -1, -1

    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))
    scheduler = get_scheduler(config, optimizer, last_epoch)

    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config),
        'val': get_valid_dataloader(config),
        'test': get_test_dataloader(config)
    }

    writer = SummaryWriter(config.train[model_type + '_dir'])
    visualizer = get_visualizer(config)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          visualizer, last_epoch + 1)
Example #7
    def run(self):
        # checkpoint
        self.scheduler = get_scheduler(self.config, self.optimizer,
                                       self.last_epoch)
        self.model.train()
        postfix_dic = {
            'lr': 0.0,
            'acc': 0.0,
            'loss': 0.0,
        }

        if self.config.data.sampler == "weight":
            self.train_weigh()
        else:
            for epoch in range(self.last_epoch, self.num_epochs):

                self.train_single_epoch(epoch)

                if epoch % 200 == 199:
                    save_checkpoint(self.config, self.model, self.optimizer,
                                    self.optimizer_center, epoch, self.step)

                self.scheduler.step()
                if epoch > self.config.train.num_epochs:
                    break
Example #8
def run(config):
    teacher_model = get_model(config, 'teacher').to(device)
    criterion = get_loss(config)

    # for teacher
    trainable_params = filter(lambda p: p.requires_grad,
                              teacher_model.parameters())
    optimizer_t = get_optimizer(config, trainable_params)
    checkpoint_t = utils.checkpoint.get_initial_checkpoint(
        config, model_type='teacher')
    if checkpoint_t is not None:
        last_epoch_t, step_t = utils.checkpoint.load_checkpoint(
            teacher_model, optimizer_t, checkpoint_t, model_type='teacher')
    else:
        last_epoch_t, step_t = -1, -1
    print('teacher model from checkpoint: {} last epoch:{}'.format(
        checkpoint_t, last_epoch_t))

    scheduler_t = get_scheduler(config, optimizer_t, last_epoch_t)

    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config),
        'val': get_valid_dataloader(config),
        'test': get_test_dataloader(config)
    }
    writer = SummaryWriter(config.train['teacher' + '_dir'])
    visualizer = get_visualizer(config)
    train(config, teacher_model, dataloaders, criterion, optimizer_t,
          scheduler_t, writer, visualizer, last_epoch_t + 1)
Example #9
def run(config):
    train_dir = config.train.dir

    task = get_task(config)
    optimizer = get_optimizer(config, task.get_model().parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            task.get_model(), optimizer, checkpoint)
    else:
        last_epoch, step = -1, -1

    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))
    scheduler = get_scheduler(config, optimizer, last_epoch)

    preprocess_opt = task.get_preprocess_opt()
    dataloaders = {
        split: get_dataloader(config, split,
                              get_transform(config, split, **preprocess_opt))
        for split in ['train', 'dev']
    }

    writer = SummaryWriter(config.train.dir)
    train(config, task, dataloaders, optimizer, scheduler, writer,
          last_epoch + 1)
Example #10
def run(config):
    train_dir = config.train.dir

    student_model = get_model(config, model_type).to(device)
    criterion = get_loss(config)
    trainable_params = filter(lambda p: p.requires_grad,
                              student_model.parameters())
    optimizer = get_optimizer(config, trainable_params)
    checkpoint = utils.checkpoint.get_initial_checkpoint(config,
                                                         model_type=model_type)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            student_model, optimizer, checkpoint, model_type=model_type)
    else:
        last_epoch, step = -1, -1
    print('student model from checkpoint: {} last epoch:{}'.format(
        checkpoint, last_epoch))

    scheduler = get_scheduler(config, optimizer, last_epoch)

    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config),
        'val': get_valid_dataloader(config),
        'test': get_test_dataloader(config)
    }
    writer = SummaryWriter(config.train['student' + '_dir'])
    visualizer = get_visualizer(config)
    train(config, student_model, dataloaders, criterion, optimizer, scheduler,
          writer, visualizer, last_epoch + 1)
Example #11
File: runner.py Project: 18445864529/s3prl
    def _get_scheduler(self, optimizer):
        scheduler = get_scheduler(
            optimizer,
            self.config['runner']['total_steps'],
            self.config['scheduler']
        )
        self._load_weight(scheduler, 'Scheduler')
        return scheduler
Example #12
File: trainer.py Project: tkd26/RCM
    def __init__(self, config):
        """Initialize Trainer

        Args:
            config (dict): Configuration dictionary
        """
        super(Trainer, self).__init__()

        # Define multi-task setting
        dataset = config['dataset']
        dataset_name = dataset['dataset_name']
        self.tasks_weighting = dataset['tasks_weighting']
        self.tasks = [k for k, v in self.tasks_weighting.items()]

        # Setup network
        model_config = config['model']
        self.model = get_module(model_config, dataset_name, self.tasks)
        print('Model constructed for {}'.format(' '.join(self.tasks)))

        if 'grouping' in model_config:
            print('groups = {}'.format(model_config['grouping']['groups']))
            print('grouping method = {}'.format(model_config['grouping']['method']))
            self.model = update_module(config, self.model, self.tasks)

        # Setup for a task-conditional setting
        model_params = config['model']['parameters']
        if 'common_mt_params' in model_params:
            self.task_conditional = not model_params['common_mt_params']
        else:
            self.task_conditional = False

        # Setup optimizers
        optimizer_config = config['optimizer']
        optimizer_cls = get_optimizer(optimizer_config['algorithm'])
        model_params = get_params(self.model, optimizer_config['parameters']['lr'], len(self.tasks),
                                  self.task_conditional, self.tasks)
        self.optimizer = optimizer_cls(model_params, **optimizer_config['parameters'])

        # Setup schedulers
        scheduler_config = config['scheduler']
        scheduler_cls = get_scheduler(scheduler_config['lr_policy'])
        self.scheduler = scheduler_cls(self.optimizer, **scheduler_config['parameters'])

        # Setup loss function
        losses_config = config['loss']
        self.criterions = get_loss_functions(self.tasks, losses_config)

        # Initialise performance meters
        self.best_val_loss = 1e9
        self.train_loss = {}
        self.val_loss = {}
        for task in self.tasks:
            self.train_loss[task] = get_running_meter()
            self.val_loss[task] = get_running_meter()

        # Initialize img logging for visualization
        self.img_logging = get_img_logging(dataset_name, self.tasks)
        self.pred_decoder = get_pred_decoder(dataset_name, self.tasks)
Example #13
File: runner.py Project: ftshijt/s3prl
    def _get_scheduler(self, optimizer):
        scheduler = get_scheduler(optimizer,
                                  self.config['runner']['total_steps'],
                                  self.config['scheduler'])

        init_scheduler = self.init_ckpt.get('Scheduler')
        if init_scheduler:
            print(
                '[Runner] - Loading scheduler weights from the previous experiment'
            )
            scheduler.load_state_dict(init_scheduler)
        return scheduler
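Note: Examples #11 and #13 restore the scheduler from a previously saved checkpoint; Examples #19-#22 show the matching save side. A minimal round-trip sketch of that pattern with plain PyTorch objects (the file name and the stand-in model are assumptions):

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Save scheduler state alongside model and optimizer state.
torch.save({
    'model_state': model.state_dict(),
    'optimizer_state': optimizer.state_dict(),
    'scheduler_state': scheduler.state_dict(),
}, 'checkpoint.pth')

# Resume: restore all three before continuing training.
ckpt = torch.load('checkpoint.pth')
model.load_state_dict(ckpt['model_state'])
optimizer.load_state_dict(ckpt['optimizer_state'])
scheduler.load_state_dict(ckpt['scheduler_state'])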
Example #14
def search_once(config, policy):
    model = get_model(config).cuda()
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model.parameters())
    scheduler = get_scheduler(config, optimizer, -1)

    transforms = {'train': get_transform(config, 'train', params={'policies': policy}),
                  'val': get_transform(config, 'val')}
    dataloaders = {split:get_dataloader(config, split, transforms[split])
                   for split in ['train', 'val']}

    score_dict = train(config, model, dataloaders, criterion, optimizer, scheduler, None, 0)
    return score_dict['f1_mavg']
Example #15
def run(config):
    teacher_model = get_model(config, 'teacher').to(device)
    student_model = get_model(config, 'student').to(device)
    print('The number of parameters : %d' % count_parameters(student_model))
    criterion = get_loss(config)


    # for teacher
    optimizer_t = None
    checkpoint_t = utils.checkpoint.get_initial_checkpoint(config,
                                                         model_type='teacher')
    if checkpoint_t is not None:
        last_epoch_t, step_t = utils.checkpoint.load_checkpoint(teacher_model,
                                 optimizer_t, checkpoint_t, model_type='teacher')
    else:
        last_epoch_t, step_t = -1, -1
    print('teacher model from checkpoint: {} last epoch:{}'.format(
        checkpoint_t, last_epoch_t))

    # for student
    optimizer_s = get_optimizer(config, student_model)
    checkpoint_s = utils.checkpoint.get_initial_checkpoint(config,
                                                         model_type='student')
    if checkpoint_s is not None:
        last_epoch_s, step_s = utils.checkpoint.load_checkpoint(student_model,
                                 optimizer_s, checkpoint_s, model_type='student')
    else:
        last_epoch_s, step_s = -1, -1
    print('student model from checkpoint: {} last epoch:{}'.format(
        checkpoint_s, last_epoch_s))

    scheduler_s = get_scheduler(config, optimizer_s, last_epoch_s)

    print(config.data)
    dataloaders = {'train':get_train_dataloader(config, get_transform(config)),
                   'val':get_valid_dataloader(config)}
                   #'test':get_test_dataloader(config)}
    writer = SummaryWriter(config.train['student' + '_dir'])
    visualizer = get_visualizer(config)
    result = train(config, student_model, teacher_model, dataloaders,
          criterion, optimizer_s, scheduler_s, writer,
          visualizer, last_epoch_s+1)
    
    print('best psnr : %.3f, best epoch: %d'%(result['best_psnr'], result['best_epoch']))
Example #16
File: train.py Project: rosaann/landmark
def run(config):
    train_group_csv_dir = './data/group_csv/'
    writer = SummaryWriter(config.train.dir)
    train_filenames = list(glob.glob(os.path.join(train_group_csv_dir, 'data_train_group_*')))[1:]
    
    for ti, train_file in tqdm.tqdm(enumerate(train_filenames)):
        gi_tr = train_file.replace('data_train_group_', '')
        gi_tr = gi_tr.split('/')[-1]
        gi_tr = gi_tr.replace('.csv', '')
        group_idx = int(gi_tr)
        
        utils.prepare_train_directories(config, group_idx)
        
        model = get_model(config, group_idx)
        if torch.cuda.is_available():
            model = model.cuda()
        criterion = get_loss(config)
        optimizer = get_optimizer(config, model.parameters())
        
    

        checkpoint = utils.checkpoint.get_initial_checkpoint(config, group_idx)
        if checkpoint is not None:
            last_epoch, step = utils.checkpoint.load_checkpoint(model, optimizer, checkpoint)
        else:
            last_epoch, step = -1, -1

        if last_epoch > config.train.num_epochs:
            print('group -- ', str(group_idx), '-- index-', ti, '  -- already trained, skipping')
            continue
        print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))
        print('group -- ', str(group_idx), '-- index-', ti)
        scheduler = get_scheduler(config, optimizer, last_epoch)

        dataloaders = {split: get_dataloader(config, group_idx, split, get_transform(config, split))
                       for split in ['train', 'val']}

        train(config, group_idx, model, dataloaders, criterion, optimizer, scheduler,
              writer, last_epoch + 1)
Example #17
    def __init__(self, cfg, writer, logger, use_pseudo_label=False, modal_num=3, multimodal_merger=multimodal_merger):
        self.cfg = cfg
        self.writer = writer
        self.class_numbers = 19
        self.logger = logger
        cfg_model = cfg['model']
        self.cfg_model = cfg_model
        self.best_iou = -100
        self.iter = 0
        self.nets = []
        self.split_gpu = 0
        self.default_gpu = cfg['model']['default_gpu']
        self.PredNet_Dir = None
        self.valid_classes = cfg['training']['valid_classes']
        self.G_train = True
        self.cls_feature_weight = cfg['training']['cls_feature_weight']
        self.use_pseudo_label = use_pseudo_label
        self.modal_num = modal_num

        # cluster vectors & cuda initialization
        self.objective_vectors_group = torch.zeros(self.modal_num + 1, 19, 256).cuda()
        self.objective_vectors_num_group = torch.zeros(self.modal_num + 1, 19).cuda()
        self.objective_vectors_dis_group = torch.zeros(self.modal_num + 1, 19, 19).cuda()
        self.class_threshold_group = torch.full([self.modal_num + 1, 19], 0.6).cuda()

        self.disc_T = torch.FloatTensor([0.0]).cuda()

        #self.metrics = CustomMetrics(self.class_numbers)
        self.metrics = CustomMetrics(self.class_numbers, modal_num=self.modal_num, model=self)

        # multimodal / multi-branch merger
        self.multimodal_merger = multimodal_merger

        bn = cfg_model['bn']
        if bn == 'sync_bn':
            BatchNorm = SynchronizedBatchNorm2d
        elif bn == 'bn':
            BatchNorm = nn.BatchNorm2d
        elif bn == 'gn':
            BatchNorm = nn.GroupNorm
        else:
            raise NotImplementedError('batch norm choice {} is not implemented'.format(bn))

        if True:
            self.PredNet = DeepLab(
                    num_classes=19,
                    backbone=cfg_model['basenet']['version'],
                    output_stride=16,
                    bn=cfg_model['bn'],
                    freeze_bn=True,
                    modal_num=self.modal_num
                    ).cuda()
            self.load_PredNet(cfg, writer, logger, dir=None, net=self.PredNet)
            self.PredNet_DP = self.init_device(self.PredNet, gpu_id=self.default_gpu, whether_DP=True) 
            self.PredNet.eval()
            self.PredNet_num = 0

            self.PredDnet = FCDiscriminator(inplanes=19)
            self.load_PredDnet(cfg, writer, logger, dir=None, net=self.PredDnet)
            self.PredDnet_DP = self.init_device(self.PredDnet, gpu_id=self.default_gpu, whether_DP=True)
            self.PredDnet.eval()

        self.BaseNet = DeepLab(
                            num_classes=19,
                            backbone=cfg_model['basenet']['version'],
                            output_stride=16,
                            bn=cfg_model['bn'],
                            freeze_bn=True, 
                            modal_num=self.modal_num
                            )

        logger.info('the backbone is {}'.format(cfg_model['basenet']['version']))

        self.BaseNet_DP = self.init_device(self.BaseNet, gpu_id=self.default_gpu, whether_DP=True)
        self.nets.extend([self.BaseNet])
        self.nets_DP = [self.BaseNet_DP]

        # Discriminator
        self.SOURCE_LABEL = 0
        self.TARGET_LABEL = 1
        self.DNets = []
        self.DNets_DP = []
        for _ in range(self.modal_num+1):
            _net_d = FCDiscriminator(inplanes=19)
            self.DNets.append(_net_d)
            _net_d_DP = self.init_device(_net_d, gpu_id=self.default_gpu, whether_DP=True)
            self.DNets_DP.append(_net_d_DP)

        self.nets.extend(self.DNets)
        self.nets_DP.extend(self.DNets_DP)

        self.optimizers = []
        self.schedulers = []        

        optimizer_cls = torch.optim.SGD
        optimizer_params = {k:v for k, v in cfg['training']['optimizer'].items() 
                            if k != 'name'}

        optimizer_cls_D = torch.optim.Adam
        optimizer_params_D = {k:v for k, v in cfg['training']['optimizer_D'].items() 
                            if k != 'name'}

        if False:
            self.BaseOpti = optimizer_cls(self.BaseNet.parameters(), **optimizer_params)
        else:
            self.BaseOpti = optimizer_cls(self.BaseNet.optim_parameters(cfg['training']['optimizer']['lr']), **optimizer_params)

        self.optimizers.extend([self.BaseOpti])

        self.DiscOptis = []
        for _d_net in self.DNets: 
            self.DiscOptis.append(
                optimizer_cls_D(_d_net.parameters(), **optimizer_params_D)
            )
        self.optimizers.extend(self.DiscOptis)

        self.schedulers = []        

        if False:
            self.BaseSchedule = get_scheduler(self.BaseOpti, cfg['training']['lr_schedule'])
            self.schedulers.extend([self.BaseSchedule])
        else:
            """BaseSchedule detail see FUNC: scheduler_step()"""
            self.learning_rate = cfg['training']['optimizer']['lr']
            self.gamma = cfg['training']['lr_schedule']['gamma']
            self.num_steps = cfg['training']['lr_schedule']['max_iter']
            self._BaseSchedule_nouse = get_scheduler(self.BaseOpti, cfg['training']['lr_schedule'])
            self.schedulers.extend([self._BaseSchedule_nouse])

        self.DiscSchedules = []
        for _disc_opt in self.DiscOptis:
            self.DiscSchedules.append(
                get_scheduler(_disc_opt, cfg['training']['lr_schedule'])
            )
        self.schedulers.extend(self.DiscSchedules)
        self.setup(cfg, writer, logger)

        self.adv_source_label = 0
        self.adv_target_label = 1
        self.bceloss = nn.BCEWithLogitsLoss(reduction='none')
        self.loss_fn = get_loss_function(cfg)
        pseudo_cfg = copy.deepcopy(cfg)
        pseudo_cfg['training']['loss']['name'] = 'cross_entropy4d'
        self.pseudo_loss_fn = get_loss_function(pseudo_cfg)
        self.mseloss = nn.MSELoss()
        self.l1loss = nn.L1Loss()
        self.smoothloss = nn.SmoothL1Loss()
        self.triplet_loss = nn.TripletMarginLoss()
        self.kl_distance = nn.KLDivLoss(reduction='none')
Example #18
def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir

    logger.info('=' * 50)
    # logger.info(f'hyperparameters: {params}')

    train_loader, val_loader, test_loader, label_encoder = load_data(args.fold)
    model = create_model()

    optimizer = get_optimizer(config, model.parameters())
    lr_scheduler = get_scheduler(config, optimizer)
    lr_scheduler2 = get_scheduler(
        config, optimizer) if config.scheduler2.name else None
    criterion = get_loss(config)

    if args.weights is None:
        last_epoch = 0
        logger.info(f'training will start from epoch {last_epoch+1}')
    else:
        last_checkpoint = torch.load(args.weights)
        assert last_checkpoint['arch'] == config.model.arch
        model.load_state_dict(last_checkpoint['state_dict'])
        optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint {args.weights} was loaded.')

        last_epoch = last_checkpoint['epoch']
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr_override != 0:
            set_lr(optimizer, float(args.lr_override))
        elif 'lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.lr)

    if args.gen_predict:
        print('inference mode')
        generate_submission(val_loader, test_loader, model, label_encoder,
                            last_epoch, args.weights)
        sys.exit(0)

    if args.gen_features:
        print('inference mode')
        generate_features(test_loader, model, args.weights)
        sys.exit(0)

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs + 1):
        logger.info('-' * 50)

        # if not is_scheduler_continuous(config.scheduler.name):
        #     # if we have just reduced LR, reload the best saved model
        #     lr = get_lr(optimizer)
        #     logger.info(f'learning rate {lr}')
        #
        #     if lr < last_lr - 1e-10 and best_model_path is not None:
        #         last_checkpoint = torch.load(os.path.join(model_dir, best_model_path))
        #         assert(last_checkpoint['arch']==config.model.arch)
        #         model.load_state_dict(last_checkpoint['state_dict'])
        #         optimizer.load_state_dict(last_checkpoint['optimizer'])
        #         logger.info(f'checkpoint {best_model_path} was loaded.')
        #         set_lr(optimizer, lr)
        #         last_lr = lr
        #
        #     if lr < config.train.min_lr * 1.01:
        #         logger.info('reached minimum LR, stopping')
        #         break

        get_lr(optimizer)

        train(train_loader, model, criterion, optimizer, epoch, lr_scheduler,
              lr_scheduler2)
        score = validate(val_loader, model, epoch)

        if not is_scheduler_continuous(config.scheduler.name):
            lr_scheduler.step(score)
        if lr_scheduler2 and not is_scheduler_continuous(
                config.scheduler.name):
            lr_scheduler2.step(score)

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        data_to_save = {
            'epoch': epoch,
            'arch': config.model.arch,
            'state_dict': model.state_dict(),
            'best_score': best_score,
            'score': score,
            'optimizer': optimizer.state_dict(),
            'options': config
        }

        filename = config.version
        if is_best:
            best_model_path = f'{filename}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth'
            save_checkpoint(data_to_save, best_model_path, model_dir)

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
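Note: Example #18 only calls lr_scheduler.step(score) for schedulers that react to a validation metric; "continuous" schedulers are assumed to be stepped inside train(). A minimal sketch of that split (the name set in is_scheduler_continuous and the example score are assumptions, not the repository's actual helper):

import torch

def is_scheduler_continuous(name):
    # Assumed convention: these are stepped every batch/epoch inside the training
    # loop and must not receive the validation score.
    return name in ('cosine', 'cyclic_lr', 'one_cycle')

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler_name = 'reduce_lr_on_plateau'
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2)

score = 0.87  # hypothetical validation score
if not is_scheduler_continuous(scheduler_name):
    lr_scheduler.step(score)  # metric-driven schedulers consume the validation score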
Example #19
File: train.py Project: syt2/CRA
def train(cfg, writer, logger):
    # CUDA_VISIBLE_DEVICES must be set before any CUDA call in PyTorch
    use_cuda = False
    if cfg.get("cuda", None) is not None:
        if cfg.get("cuda", None) != "all":
            os.environ["CUDA_VISIBLE_DEVICES"] = cfg.get("cuda", None)
        use_cuda = torch.cuda.is_available()

    # Setup random seed
    seed = cfg["training"].get("seed", random.randint(1, 10000))
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Setup Dataloader
    train_loader, val_loader = get_loader(cfg)

    # Setup Model
    model = get_model(cfg)
    # writer.add_graph(model, torch.rand([1, 3, 224, 224]))
    if use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=list(
                                          range(torch.cuda.device_count())))

    # Setup optimizer, lr_scheduler and loss function
    optimizer = get_optimizer(model.parameters(), cfg)
    scheduler = get_scheduler(optimizer, cfg)
    loss_fn = get_loss_fn(cfg)

    # Setup Metrics
    epochs = cfg["training"]["epochs"]
    recorder = RecorderMeter(epochs)
    start_epoch = 0

    # save model parameters every <n> epochs
    save_interval = cfg["training"]["save_interval"]

    if use_cuda:
        model.cuda()
        loss_fn.cuda()

    # Resume Trained Model
    resume_path = os.path.join(writer.file_writer.get_logdir(),
                               cfg["training"]["resume"])
    best_path = os.path.join(writer.file_writer.get_logdir(),
                             cfg["training"]["best_model"])

    if cfg["training"]["resume"] is not None:
        if os.path.isfile(resume_path):
            logger.info(
                "Loading model and optimizer from checkpoint '{}'".format(
                    resume_path))
            checkpoint = torch.load(resume_path)
            state = checkpoint["state_dict"]
            if torch.cuda.device_count() <= 1:
                state = convert_state_dict(state)
            model.load_state_dict(state)
            optimizer.load_state_dict(checkpoint["optimizer"])
            scheduler.load_state_dict(checkpoint["scheduler"])
            start_epoch = checkpoint["epoch"]
            recorder = checkpoint['recorder']
            logger.info("Loaded checkpoint '{}' (epoch {})".format(
                resume_path, checkpoint["epoch"]))
        else:
            logger.info("No checkpoint found at '{}'".format(resume_path))

    epoch_time = AverageMeter()
    for epoch in range(start_epoch, epochs):
        start_time = time.time()
        need_hour, need_mins, need_secs = convert_secs2time(epoch_time.avg *
                                                            (epochs - epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(
            need_hour, need_mins, need_secs)
        logger.info(
            '\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:8.6f}]'.
            format(time_string(), epoch, epochs, need_time, optimizer.
                   param_groups[0]['lr']) +  # scheduler.get_last_lr() >=1.4
            ' [Best : Accuracy={:.2f}]'.format(recorder.max_accuracy(False)))
        train_acc, train_los = train_epoch(train_loader, model, loss_fn,
                                           optimizer, use_cuda, logger)
        val_acc, val_los = validate_epoch(val_loader, model, loss_fn, use_cuda,
                                          logger)
        scheduler.step()

        is_best = recorder.update(epoch, train_los, train_acc, val_los,
                                  val_acc)
        if is_best or epoch % save_interval == 0 or epoch == epochs - 1:  # save model (resume model and best model)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'recorder': recorder,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                }, is_best, best_path, resume_path)

            for name, param in model.named_parameters():  # save histogram
                writer.add_histogram(name,
                                     param.clone().cpu().data.numpy(), epoch)

        writer.add_scalar('Train/loss', train_los, epoch)  # save curves
        writer.add_scalar('Train/acc', train_acc, epoch)
        writer.add_scalar('Val/loss', val_los, epoch)
        writer.add_scalar('Val/acc', val_acc, epoch)

        epoch_time.update(time.time() - start_time)

    writer.close()
Example #20
def train(cfg):
    
    # Setup seeds
    torch.manual_seed(cfg.get('seed', 1337))
    torch.cuda.manual_seed(cfg.get('seed', 1337))
    np.random.seed(cfg.get('seed', 1337))
    random.seed(cfg.get('seed', 1337))

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup Augmentations
    augmentations = cfg['training'].get('augmentations', None)
    data_aug = get_composed_augmentations(augmentations)

    # Setup Dataloader
    data_loader = get_loader(cfg['data']['dataset'])
    data_path = cfg['data']['path']

    t_loader = data_loader(
        data_path,
        is_transform=True,
        split=cfg['data']['train_split'],
        #img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']),
        augmentations=data_aug)

    v_loader = data_loader(
        data_path,
        is_transform=True,
        split=cfg['data']['val_split'],
        #img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']),
        )

    n_classes = t_loader.n_classes
    trainloader = data.DataLoader(t_loader,
                                  batch_size=cfg['training']['batch_size'], 
                                  num_workers=cfg['training']['n_workers'], 
                                  shuffle=True)

    valloader = data.DataLoader(v_loader, 
                                batch_size=cfg['training']['batch_size'], 
                                num_workers=cfg['training']['n_workers'])

    # Setup Metrics
    running_metrics_val = runningScore(n_classes)

    # Setup Model
    model = get_model(cfg['model'], n_classes).to(device)

    model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k:v for k, v in cfg['training']['optimizer'].items() 
                        if k != 'name'}

    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    scheduler = get_scheduler(optimizer, cfg['training']['lr_schedule'])

    loss_fn = get_loss_function(cfg)

    start_iter = 0
    if cfg['training']['resume'] is not None:
        if os.path.isfile(cfg['training']['resume']):
 
            checkpoint = torch.load(cfg['training']['resume'])
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            start_iter = checkpoint["epoch"]
            print("=====>",
                "Loaded checkpoint '{}' (iter {})".format(
                    cfg['training']['resume'], checkpoint["epoch"]
                )
            )
        else:
            print("=====>","No checkpoint found at '{}'".format(cfg['training']['resume']))

    val_loss_meter = averageMeter()
    time_meter = averageMeter()

    best_iou = -100.0
    i = start_iter
    flag = True

    while i <= cfg['training']['train_iters'] and flag:
        for (images, labels) in trainloader:
            i += 1
            start_ts = time.time()
            scheduler.step()
            model.train()
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = loss_fn(input=outputs, target=labels)

            loss.backward()
            optimizer.step()
            
            time_meter.update(time.time() - start_ts)

            if (i + 1) % cfg['training']['print_interval'] == 0:
                fmt_str = "Iter [{:d}/{:d}]  Loss: {:.4f}  Time/Image: {:.4f}"
                print_str = fmt_str.format(i + 1,
                                           cfg['training']['train_iters'], 
                                           loss.item(),
                                           time_meter.avg / cfg['training']['batch_size'])

                print(print_str)
                time_meter.reset()

            if (i + 1) % cfg['training']['val_interval'] == 0 or \
               (i + 1) == cfg['training']['train_iters']:
                model.eval()
                with torch.no_grad():
                    for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)):
                        images_val = images_val.to(device)
                        labels_val = labels_val.to(device)

                        outputs = model(images_val)
                        val_loss = loss_fn(input=outputs, target=labels_val)

                        pred = outputs.data.max(1)[1].cpu().numpy()
                        gt = labels_val.data.cpu().numpy()


                        running_metrics_val.update(gt, pred)
                        val_loss_meter.update(val_loss.item())


                print("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg))

                score, class_iou = running_metrics_val.get_scores()
                for k, v in score.items():
                    print(k,':',v)

                for k, v in class_iou.items():
                    print('{}: {}'.format(k, v))

                val_loss_meter.reset()
                running_metrics_val.reset()

                if score["Mean IoU : \t"] >= best_iou:
                    best_iou = score["Mean IoU : \t"]
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "scheduler_state": scheduler.state_dict(),
                        "best_iou": best_iou,
                    }
                    save_path = os.path.join('./checkpoint',
                                             "{}_{}_best_model.pkl".format(
                                                 cfg['model']['arch'],
                                                 cfg['data']['dataset']))
                    print("saving···")
                    torch.save(state, save_path)

            if (i + 1) == cfg['training']['train_iters']:
                flag = False
                break
Example #21
def train(cfg, logger):

    # Setup Seeds
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup Device
    device = torch.device("cuda:{}".format(cfg["training"]["gpu_idx"])
                          if torch.cuda.is_available() else "cpu")

    # Setup Augmentations
    augmentations = cfg["training"].get("augmentations", None)

    # Setup Dataloader
    data_loader = get_loader(cfg["data"]["dataset"])
    data_path = cfg["data"]["path"]

    t_loader = data_loader(
        data_path,
        split=cfg["data"]["train_split"],
    )

    v_loader = data_loader(
        data_path,
        split=cfg["data"]["val_split"],
    )

    n_classes = t_loader.n_classes
    n_val = len(v_loader.files['val'])

    trainloader = data.DataLoader(
        t_loader,
        batch_size=cfg["training"]["batch_size"],
        num_workers=cfg["training"]["n_workers"],
        shuffle=True,
    )

    valloader = data.DataLoader(v_loader,
                                batch_size=cfg["training"]["batch_size"],
                                num_workers=cfg["training"]["n_workers"])

    # Setup Metrics
    running_metrics_val = runningScore(n_classes, n_val)

    # Setup Model
    model = get_model(cfg["model"], n_classes).to(device)
    model = torch.nn.DataParallel(model,
                                  device_ids=[cfg["training"]["gpu_idx"]])

    # Setup Optimizer, lr_scheduler and Loss Function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {
        k: v
        for k, v in cfg["training"]["optimizer"].items() if k != "name"
    }

    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    logger.info("Using optimizer {}".format(optimizer))

    scheduler = get_scheduler(optimizer, cfg["training"]["lr_schedule"])

    loss_fn = get_loss_function(cfg)
    logger.info("Using loss {}".format(loss_fn))

    # Resume Trained Model
    start_iter = 0
    if cfg["training"]["resume"] is not None:
        if os.path.isfile(cfg["training"]["resume"]):
            logger.info(
                "Loading model and optimizer from checkpoint '{}'".format(
                    cfg["training"]["resume"]))
            checkpoint = torch.load(cfg["training"]["resume"])
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            start_iter = checkpoint["epoch"]
            logger.info("Loaded checkpoint '{}' (iter {})".format(
                cfg["training"]["resume"], checkpoint["epoch"]))
        else:
            logger.info("No checkpoint found at '{}'".format(
                cfg["training"]["resume"]))

    # Start Training
    val_loss_meter = averageMeter()
    time_meter = averageMeter()

    best_dice = -100.0
    i = start_iter
    flag = True

    while i <= cfg["training"]["train_iters"] and flag:
        for (images, labels, img_name) in trainloader:
            i += 1
            start_ts = time.time()
            scheduler.step()
            model.train()
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = loss_fn(input=outputs, target=labels)

            loss.backward()
            optimizer.step()

            time_meter.update(time.time() - start_ts)

            # print train loss
            if (i + 1) % cfg["training"]["print_interval"] == 0:
                fmt_str = "Iter [{:d}/{:d}]  Loss: {:.4f}  Time/Image: {:.4f}"
                print_str = fmt_str.format(
                    i + 1,
                    cfg["training"]["train_iters"],
                    loss.item(),
                    time_meter.avg / cfg["training"]["batch_size"],
                )

                print(print_str)
                logger.info(print_str)
                time_meter.reset()

            # validation
            if (i + 1) % cfg["training"]["val_interval"] == 0 or (
                    i + 1) == cfg["training"]["train_iters"]:
                model.eval()
                with torch.no_grad():
                    for i_val, (images_val, labels_val,
                                img_name_val) in tqdm(enumerate(valloader)):
                        images_val = images_val.to(device)
                        labels_val = labels_val.to(device)

                        outputs = model(images_val)
                        val_loss = loss_fn(input=outputs, target=labels_val)

                        pred = outputs.data.max(1)[1].cpu().numpy()
                        gt = labels_val.data.cpu().numpy()

                        running_metrics_val.update(gt, pred, i_val)
                        val_loss_meter.update(val_loss.item())

                logger.info("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg))

                # print val metrics
                score, class_dice = running_metrics_val.get_scores()
                for k, v in score.items():
                    print(k, v)
                    logger.info("{}: {}".format(k, v))

                for k, v in class_dice.items():
                    logger.info("{}: {}".format(k, v))

                val_loss_meter.reset()
                running_metrics_val.reset()

                # save model
                if score["Dice : \t"] >= best_dice:
                    best_dice = score["Dice : \t"]
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "scheduler_state": scheduler.state_dict(),
                        "best_dice": best_dice,
                    }
                    save_path = os.path.join(
                        cfg["training"]["model_dir"],
                        "{}_{}.pkl".format(cfg["model"]["arch"],
                                           cfg["data"]["dataset"]),
                    )
                    torch.save(state, save_path)

            if (i + 1) == cfg["training"]["train_iters"]:
                flag = False
                break
Example #22
def train(cfg, writer, logger):

    # Setup seeds
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup Dataloader
    trainloader = get_loader(cfg, "train")
    valloader = get_loader(cfg, "val")

    n_classes = cfg["data"]["n_classes"]
    n_channels = cfg["data"]["channels"]

    # Setup Metrics
    running_metrics_val = runningScore(n_classes)

    # Setup Model
    model = get_model(cfg, n_classes, n_channels).to(device)
    model = torch.nn.DataParallel(model,
                                  device_ids=range(torch.cuda.device_count()))

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {
        k: v
        for k, v in cfg["training"]["optimizer"].items() if k != "name"
    }

    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    logger.info("Using optimizer {}".format(optimizer))

    scheduler = get_scheduler(optimizer, cfg["training"]["lr_schedule"])

    loss_fn = get_loss_function(cfg)
    logger.info("Using loss {}".format(loss_fn))

    start_iter = 0
    if cfg["training"]["resume"] is not None:
        if os.path.isfile(cfg["training"]["resume"]):
            logger.info(
                "Loading model and optimizer from checkpoint '{}'".format(
                    cfg["training"]["resume"]))
            checkpoint = torch.load(cfg["training"]["resume"])
            model.module.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            start_iter = checkpoint["epoch"]
            logger.info("Loaded checkpoint '{}' (iter {})".format(
                cfg["training"]["resume"], checkpoint["epoch"]))
        else:
            logger.info("No checkpoint found at '{}'".format(
                cfg["training"]["resume"]))

    val_loss_meter = averageMeter()
    time_meter = averageMeter()

    best_iou = -100.0
    i = start_iter
    flag = True

    # fig = plt.figure()
    # plt.rcParams['xtick.major.pad'] = '15'
    # fig.show()
    # fig.canvas.draw()

    while i <= cfg["training"]["train_iters"] and flag:
        for (images, labels) in trainloader:
            i += 1
            start_ts = time.time()
            model.train()
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = loss_fn(input=outputs, target=labels)

            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            # plot_grad_flow(model.named_parameters(), fig)

            # zero mean conv for layer 1 of dsm encoder
            optimizer.step()
            scheduler.step()
            # m = model._modules['module'].encoderDSM._modules['0']._modules['0']
            # model._modules['module'].encoderDSM._modules['0']._modules['0'].weight = m.weight - torch.mean(m.weight)
            model = zero_mean(model, all=False)

            time_meter.update(time.time() - start_ts)

            if (i + 1) % cfg["training"]["print_interval"] == 0:
                fmt_str = "Iter [{:d}/{:d}]  Loss: {:.4f}  Time/Image: {:.4f}"
                print_str = fmt_str.format(
                    i + 1,
                    cfg["training"]["train_iters"],
                    loss.item(),
                    time_meter.avg / cfg["training"]["batch_size"],
                )

                print(print_str)
                logger.info(print_str)
                writer.add_scalar("loss/train_loss", loss.item(), i + 1)
                time_meter.reset()

            if (i + 1) % cfg["training"]["val_interval"] == 0 or (
                    i + 1) == cfg["training"]["train_iters"]:
                model.eval()
                with torch.no_grad():
                    for i_val, (images_val,
                                labels_val) in tqdm(enumerate(valloader)):
                        images_val = images_val.to(device)
                        labels_val = labels_val.to(device)

                        outputs = model(images_val)
                        val_loss = loss_fn(input=outputs, target=labels_val)

                        pred = outputs.data.max(1)[1].cpu().numpy()
                        gt = labels_val.data.cpu().numpy()
                        # plt.imshow(v_loader.decode_segmap(gt[0,:,:]))
                        # plt.imshow(v_loader.decode_segmap(pred[0, :, :]))
                        running_metrics_val.update(gt, pred)
                        val_loss_meter.update(val_loss.item())

                writer.add_scalar("loss/val_loss", val_loss_meter.avg, i + 1)
                logger.info("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg))

                score, class_iou = running_metrics_val.get_scores()
                for k, v in score.items():
                    #print(k, v)
                    logger.info("{}: {}".format(k, v))
                    writer.add_scalar("val_metrics/{}".format(k), v, i + 1)

                for k, v in class_iou.items():
                    logger.info("{}: {}".format(k, v))
                    writer.add_scalar("val_metrics/cls_{}".format(k), v, i + 1)

                val_loss_meter.reset()
                running_metrics_val.reset()

                if score["Mean IoU : \t"] >= best_iou:
                    best_iou = score["Mean IoU : \t"]
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "optimizer_state": optimizer.state_dict(),
                        "scheduler_state": scheduler.state_dict(),
                        "best_iou": best_iou,
                    }
                    save_path = os.path.join(
                        writer.file_writer.get_logdir(),
                        "{}_{}_best_model.pkl".format(cfg["model"]["arch"],
                                                      cfg["data"]["dataset"]),
                    )
                    torch.save(state, save_path)

            if (i + 1) == cfg["training"]["train_iters"]:
                flag = False
                break
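Note: Examples #19-#22 all obtain their scheduler from get_scheduler(optimizer, cfg['training']['lr_schedule']). A minimal sketch of what such a factory could look like, assuming a config such as {'name': 'step_lr', 'step_size': 30, 'gamma': 0.1} (the key names and the lookup table are assumptions, not any of these repositories' actual implementations):

import copy
import torch

def get_scheduler(optimizer, scheduler_cfg):
    # Assumed behaviour: no scheduler config means a constant learning rate.
    if scheduler_cfg is None:
        return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1.0)

    cfg = copy.deepcopy(scheduler_cfg)
    name = cfg.pop('name')
    table = {
        'step_lr': torch.optim.lr_scheduler.StepLR,
        'multi_step': torch.optim.lr_scheduler.MultiStepLR,
        'cosine_annealing': torch.optim.lr_scheduler.CosineAnnealingLR,
    }
    return table[name](optimizer, **cfg)

# Usage with a hypothetical config:
model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = get_scheduler(optimizer, {'name': 'step_lr', 'step_size': 30, 'gamma': 0.1})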
Example #23
def main():
    #     args = parse_args()
    IMAGE_PATH = 'data/images/'
    num_classes_1 = 168
    num_classes_2 = 11
    num_classes_3 = 7
    stats = (0.0692, 0.2051)

    train_df = pd.read_csv('data/train_with_folds.csv')
    # train_df = train_df.set_index(['image_id'])
    # train_df = train_df.drop(['grapheme'], axis=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Data Loaders
    # df_train, df_val = train_test_split(train_df, test_size=0.2, random_state=2021)

    # train_transform = get_transform(128)
    train_transform = A.Compose([
        A.CoarseDropout(max_holes=1, max_width=64, max_height=64, p=0.9),
        A.ShiftScaleRotate(rotate_limit=5, p=0.9),
        A.Normalize(mean=stats[0], std=stats[1], always_apply=True)
    ])
    val_transform = A.Compose(
        [A.Normalize(mean=stats[0], std=stats[1], always_apply=True)])

    BATCH_SIZE = 50
    folds = [{
        'train': [1, 2, 3, 4],
        'val': [0]
    }, {
        'train': [0, 2, 3, 4],
        'val': [1]
    }, {
        'train': [1, 0, 3, 4],
        'val': [2]
    }, {
        'train': [1, 2, 0, 4],
        'val': [3]
    }, {
        'train': [1, 2, 3, 0],
        'val': [4]
    }]

    # Loop over folds
    for fld in range(1):
        fld = 4
        print(f'Train fold: {fld}')

        train_loader = get_loader(train_df,
                                  IMAGE_PATH,
                                  folds=folds[fld]['train'],
                                  batch_size=BATCH_SIZE,
                                  workers=4,
                                  shuffle=True,
                                  transform=train_transform)
        val_loader = get_loader(train_df,
                                IMAGE_PATH,
                                folds=folds[fld]['val'],
                                batch_size=BATCH_SIZE,
                                workers=4,
                                shuffle=False,
                                transform=val_transform)

        # Build Model
        model = load_model('seresnext50_32x4d', pretrained=True)
        model = model.cuda()

        # Optimizer
        optimizer = get_optimizer(model, lr=.00016)

        # Loss
        criterion1 = get_criterion()

        # Training
        history = pd.DataFrame()
        history2 = pd.DataFrame()

        torch.cuda.empty_cache()
        gc.collect()

        best = 0
        best2 = 1e10
        n_epochs = 100
        early_epoch = 0

        # Scheduler
        scheduler = get_scheduler(optimizer,
                                  train_loader=train_loader,
                                  epochs=n_epochs)

        # print('Loading previous training...')
        # state = torch.load('model.pth')
        # model.load_state_dict(state['model_state'])
        # best = state['kaggle']
        # best2 = state['loss']
        # print(f'Loaded model with kaggle score: {best}, loss: {best2}')
        # optimizer.load_state_dict(state['opt_state'])
        # scheduler.load_state_dict(state['scheduler_state'])
        # early_epoch = state['epoch'] + 1
        # print(f'Beginning at epoch {early_epoch}')
        # print('')

        for epoch in range(n_epochs - early_epoch):
            epoch += early_epoch
            torch.cuda.empty_cache()
            gc.collect()

            # ###################################################################
            # ############## TRAINING ###########################################
            # ###################################################################

            model.train()
            total_loss = 0
            total_loss_1 = 0
            total_loss_2 = 0
            total_loss_3 = 0

            # ratio = pow(.5,epoch/50)
            # ratio = 0.7
            ratio = 1.0

            t = tqdm(train_loader)
            for batch_idx, (img_batch, y_batch) in enumerate(t):
                img_batch = img_batch.cuda().float()
                y_batch = y_batch.cuda().long()

                optimizer.zero_grad()

                label1 = y_batch[:, 0]
                label2 = y_batch[:, 1]
                label3 = y_batch[:, 2]
                # apply mixup or cutmix with equal probability (the plain, un-augmented branch below is disabled)
                rand = np.random.rand()
                if rand < 0.5:
                    images, targets = mixup(img_batch, label1, label2, label3,
                                            0.4)
                    output1, output2, output3 = model(images)
                    l1, l2, l3 = mixup_criterion(output1,
                                                 output2,
                                                 output3,
                                                 targets,
                                                 rate=ratio)
                elif rand < 1:
                    images, targets = cutmix(img_batch, label1, label2, label3,
                                             0.4)
                    output1, output2, output3 = model(images)
                    l1, l2, l3 = cutmix_criterion(output1,
                                                  output2,
                                                  output3,
                                                  targets,
                                                  rate=ratio)
                # else:
                #     output1,output2,output3 = model(img_batch)
                #     l1, l2, l3 = criterion1(output1,output2,output3, y_batch)

                loss = l1 * .4 + l2 * .3 + l3 * .3
                # accumulate plain Python floats so the autograd graph is not retained
                total_loss += loss.item()
                total_loss_1 += l1.item() * .4
                total_loss_2 += l2.item() * .3
                total_loss_3 += l3.item() * .3
                t.set_description(
                    f"Epoch {epoch+1}/{n_epochs}, "
                    f"LR: {optimizer.state_dict()['param_groups'][0]['lr']:.6f}, "
                    f"Ratio: {ratio:.4f}, "
                    f"Loss: {total_loss / (batch_idx + 1):.4f}, "
                    f"Root loss: {total_loss_1 / (batch_idx + 1):.4f}, "
                    f"Vowel loss: {total_loss_2 / (batch_idx + 1):.4f}, "
                    f"Consonant loss: {total_loss_3 / (batch_idx + 1):.4f}")
                # t.set_description(f'Epoch {epoch}/{n_epochs}, LR: %6f, Loss: %.4f'%(optimizer.state_dict()['param_groups'][0]['lr'],total_loss/(batch_idx+1)))

                if history is not None:
                    history.loc[epoch + batch_idx / len(train_loader),
                                'train_loss'] = loss.data.cpu().numpy()
                    history.loc[
                        epoch + batch_idx / len(train_loader),
                        'lr'] = optimizer.state_dict()['param_groups'][0]['lr']

                loss.backward()
                optimizer.step()
                # if scheduler is not None:
                #     scheduler.step()

            # ###################################################################
            # ############## VALIDATION #########################################
            # ###################################################################

            model.eval()
            loss = 0

            preds_1 = []
            preds_2 = []
            preds_3 = []
            tars_1 = []
            tars_2 = []
            tars_3 = []
            with torch.no_grad():
                for img_batch, y_batch in val_loader:
                    img_batch = img_batch.cuda().float()
                    y_batch = y_batch.cuda().long()

                    o1, o2, o3 = model(img_batch)

                    l1, l2, l3 = criterion1(o1, o2, o3, y_batch)
                    loss += l1 * .4 + l2 * .3 + l3 * .3

                    # softmax is monotonic, so the argmax of the probabilities equals the argmax of the logits
                    for j in range(len(o1)):
                        preds_1.append(torch.argmax(F.softmax(o1[j], dim=-1), -1))
                        preds_2.append(torch.argmax(F.softmax(o2[j], dim=-1), -1))
                        preds_3.append(torch.argmax(F.softmax(o3[j], dim=-1), -1))
                    for i in y_batch:
                        tars_1.append(i[0].data.cpu().numpy())
                        tars_2.append(i[1].data.cpu().numpy())
                        tars_3.append(i[2].data.cpu().numpy())

            preds_1 = [p.data.cpu().numpy() for p in preds_1]
            preds_2 = [p.data.cpu().numpy() for p in preds_2]
            preds_3 = [p.data.cpu().numpy() for p in preds_3]
            preds_1 = np.array(preds_1).T.reshape(-1)
            preds_2 = np.array(preds_2).T.reshape(-1)
            preds_3 = np.array(preds_3).T.reshape(-1)

            # macro recall per target, combined with double weight on the root component
            scores = []
            scores.append(
                sklearn.metrics.recall_score(tars_1, preds_1, average='macro'))
            scores.append(
                sklearn.metrics.recall_score(tars_2, preds_2, average='macro'))
            scores.append(
                sklearn.metrics.recall_score(tars_3, preds_3, average='macro'))
            final_score = np.average(scores, weights=[2, 1, 1])

            loss /= len(val_loader)

            if history2 is not None:
                history2.loc[epoch, 'val_loss'] = loss.cpu().numpy()
                history2.loc[epoch, 'acc'] = final_score
                history2.loc[epoch, 'root_acc'] = scores[0]
                history2.loc[epoch, 'vowel_acc'] = scores[1]
                history2.loc[epoch, 'consonant_acc'] = scores[2]

            if scheduler is not None:
                scheduler.step(final_score)

            print(
                f'Dev loss: {loss:.4f}, Kaggle: {final_score}, Root acc: {scores[0]}, '
                f'Vowel acc: {scores[1]}, Consonant acc: {scores[2]}')

            if epoch > 0:
                history2['acc'].plot()
                plt.savefig(f'epoch{epoch + 1:03d}_{fld}_acc.png')
                plt.clf()

            if loss < best2:
                best2 = loss
                print(f'Saving best model... (loss)')
                torch.save(
                    {
                        'epoch': epoch,
                        'loss': loss,
                        'kaggle': final_score,
                        'model_state': model.state_dict(),
                        'opt_state': optimizer.state_dict(),
                        'scheduler_state': scheduler.state_dict()
                    }, f'model-1_{fld}.pth')

            if final_score > best:
                best = final_score
                print(f'Saving best model... (acc)')
                torch.save(
                    {
                        'epoch': epoch,
                        'loss': loss,
                        'kaggle': final_score,
                        'model_state': model.state_dict(),
                        'opt_state': optimizer.state_dict(),
                        'scheduler_state': scheduler.state_dict()
                    }, f'model_{fld}.pth')
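# mixup / mixup_criterion (and the analogous cutmix pair) are helpers that are not
# defined in this excerpt. The sketch below is only an illustration that matches the
# call signature used above; it assumes three cross-entropy heads and treats `rate`
# as a simple scale factor, which may differ from the original implementation.
import numpy as np
import torch


def mixup(data, targets1, targets2, targets3, alpha):
    # blend each image with a randomly chosen partner from the same batch
    lam = np.random.beta(alpha, alpha)
    indices = torch.randperm(data.size(0), device=data.device)
    mixed = lam * data + (1 - lam) * data[indices]
    targets = (targets1, targets1[indices],
               targets2, targets2[indices],
               targets3, targets3[indices], lam)
    return mixed, targets


def mixup_criterion(out1, out2, out3, targets, rate=1.0):
    # each head's loss is the lam-weighted mix of the original and shuffled labels
    t1, t1s, t2, t2s, t3, t3s, lam = targets
    ce = torch.nn.CrossEntropyLoss()
    l1 = lam * ce(out1, t1) + (1 - lam) * ce(out1, t1s)
    l2 = lam * ce(out2, t2) + (1 - lam) * ce(out2, t2s)
    l3 = lam * ce(out3, t3) + (1 - lam) * ce(out3, t3s)
    return l1 * rate, l2 * rate, l3 * rate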
Example #24
0
    def __init__(self, config_path, run_dir):
        self.config_path = coerce_to_path_and_check_exist(config_path)
        self.run_dir = coerce_to_path_and_create_dir(run_dir)
        self.logger = get_logger(self.run_dir, name="trainer")
        self.print_and_log_info(
            "Trainer initialisation: run directory is {}".format(run_dir))

        shutil.copy(self.config_path, self.run_dir)
        self.print_and_log_info("Config {} copied to run directory".format(
            self.config_path))

        with open(self.config_path) as fp:
            cfg = yaml.load(fp, Loader=yaml.FullLoader)

        if torch.cuda.is_available():
            type_device = "cuda"
            nb_device = torch.cuda.device_count()
            # XXX: set to False when input image sizes are not fixed
            torch.backends.cudnn.benchmark = cfg["training"].get(
                "cudnn_benchmark", True)

        else:
            type_device = "cpu"
            nb_device = None
        self.device = torch.device(type_device)
        self.print_and_log_info("Using {} device, nb_device is {}".format(
            type_device, nb_device))

        # Datasets and dataloaders
        self.dataset_kwargs = cfg["dataset"]
        self.dataset_name = self.dataset_kwargs.pop("name")
        train_dataset = get_dataset(self.dataset_name)("train",
                                                       **self.dataset_kwargs)
        val_dataset = get_dataset(self.dataset_name)("val",
                                                     **self.dataset_kwargs)
        self.restricted_labels = sorted(
            self.dataset_kwargs["restricted_labels"])
        self.n_classes = len(self.restricted_labels) + 1
        self.is_val_empty = len(val_dataset) == 0
        self.print_and_log_info("Dataset {} instantiated with {}".format(
            self.dataset_name, self.dataset_kwargs))
        self.print_and_log_info(
            "Found {} classes, {} train samples, {} val samples".format(
                self.n_classes, len(train_dataset), len(val_dataset)))

        self.batch_size = cfg["training"]["batch_size"]
        self.n_workers = cfg["training"]["n_workers"]
        self.train_loader = DataLoader(train_dataset,
                                       batch_size=self.batch_size,
                                       num_workers=self.n_workers,
                                       shuffle=True)
        self.val_loader = DataLoader(val_dataset,
                                     batch_size=self.batch_size,
                                     num_workers=self.n_workers)
        self.print_and_log_info(
            "Dataloaders instantiated with batch_size={} and n_workers={}".
            format(self.batch_size, self.n_workers))

        self.n_batches = len(self.train_loader)
        self.n_iterations, self.n_epoches = cfg["training"].get(
            "n_iterations"), cfg["training"].get("n_epoches")
        assert not (self.n_iterations is not None
                    and self.n_epoches is not None)
        if self.n_iterations is not None:
            self.n_epoches = max(self.n_iterations // self.n_batches, 1)
        else:
            self.n_iterations = self.n_epoches * len(self.train_loader)

        # Model
        self.model_kwargs = cfg["model"]
        self.model_name = self.model_kwargs.pop("name")
        model = get_model(self.model_name)(self.n_classes,
                                           **self.model_kwargs).to(self.device)
        self.model = torch.nn.DataParallel(model,
                                           device_ids=range(
                                               torch.cuda.device_count()))
        self.print_and_log_info("Using model {} with kwargs {}".format(
            self.model_name, self.model_kwargs))
        self.print_and_log_info('Number of trainable parameters: {}'.format(
            f'{count_parameters(self.model):,}'))

        # Optimizer
        optimizer_params = cfg["training"]["optimizer"] or {}
        optimizer_name = optimizer_params.pop("name", None)
        self.optimizer = get_optimizer(optimizer_name)(model.parameters(),
                                                       **optimizer_params)
        self.print_and_log_info("Using optimizer {} with kwargs {}".format(
            optimizer_name, optimizer_params))

        # Scheduler
        scheduler_params = cfg["training"].get("scheduler", {}) or {}
        scheduler_name = scheduler_params.pop("name", None)
        self.scheduler_update_range = scheduler_params.pop(
            "update_range", "epoch")
        assert self.scheduler_update_range in ["epoch", "batch"]
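        # milestones given as fractions are converted to absolute epoch / iteration indices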
        if scheduler_name == "multi_step" and isinstance(
                scheduler_params["milestones"][0], float):
            n_tot = self.n_epoches if self.scheduler_update_range == "epoch" else self.n_iterations
            scheduler_params["milestones"] = [
                round(m * n_tot) for m in scheduler_params["milestones"]
            ]
        self.scheduler = get_scheduler(scheduler_name)(self.optimizer,
                                                       **scheduler_params)
        self.cur_lr = -1
        self.print_and_log_info("Using scheduler {} with parameters {}".format(
            scheduler_name, scheduler_params))

        # Loss
        loss_name = cfg["training"]["loss"]
        self.criterion = get_loss(loss_name)()
        self.print_and_log_info("Using loss {}".format(self.criterion))

        # Pretrained / Resume
        checkpoint_path = cfg["training"].get("pretrained")
        checkpoint_path_resume = cfg["training"].get("resume")
        assert not (checkpoint_path is not None
                    and checkpoint_path_resume is not None)
        if checkpoint_path is not None:
            self.load_from_tag(checkpoint_path)
        elif checkpoint_path_resume is not None:
            self.load_from_tag(checkpoint_path_resume, resume=True)
        else:
            self.start_epoch, self.start_batch = 1, 1

        # Train metrics
        train_iter_interval = cfg["training"].get(
            "train_stat_interval", self.n_epoches * self.n_batches // 200)
        self.train_stat_interval = train_iter_interval
        self.train_time = AverageMeter()
        self.train_loss = AverageMeter()
        self.train_metrics_path = self.run_dir / TRAIN_METRICS_FILE
        with open(self.train_metrics_path, mode="w") as f:
            f.write(
                "iteration\tepoch\tbatch\ttrain_loss\ttrain_time_per_img\n")

        # Val metrics
        val_iter_interval = cfg["training"].get(
            "val_stat_interval", self.n_epoches * self.n_batches // 100)
        self.val_stat_interval = val_iter_interval
        self.val_loss = AverageMeter()
        self.val_metrics = RunningMetrics(self.restricted_labels)
        self.val_current_score = None
        self.val_metrics_path = self.run_dir / VAL_METRICS_FILE
        with open(self.val_metrics_path, mode="w") as f:
            f.write("iteration\tepoch\tbatch\tval_loss\t" +
                    "\t".join(self.val_metrics.names) + "\n")
Example #25
0
    def __init__(self, cfg, writer, logger):
        # super(CustomModel, self).__init__()
        self.cfg = cfg
        self.writer = writer
        self.class_numbers = 19
        self.logger = logger
        cfg_model = cfg['model']
        self.cfg_model = cfg_model
        self.best_iou = -100
        self.iter = 0
        self.nets = []
        self.split_gpu = 0
        self.default_gpu = cfg['model']['default_gpu']
        self.PredNet_Dir = None
        self.valid_classes = cfg['training']['valid_classes']
        self.G_train = True
        self.objective_vectors = np.zeros([19, 256])
        self.objective_vectors_num = np.zeros([19])
        self.objective_vectors_dis = np.zeros([19, 19])
        self.class_threshold = np.full([19], 0.95)  # per-class confidence threshold
        self.metrics = CustomMetrics(self.class_numbers)
        self.cls_feature_weight = cfg['training']['cls_feature_weight']

        bn = cfg_model['bn']
        if bn == 'sync_bn':
            BatchNorm = SynchronizedBatchNorm2d
        # elif bn == 'sync_abn':
        #     BatchNorm = InPlaceABNSync
        elif bn == 'bn':
            BatchNorm = nn.BatchNorm2d
        # elif bn == 'abn':
        #     BatchNorm = InPlaceABN
        elif bn == 'gn':
            BatchNorm = nn.GroupNorm
        else:
            raise NotImplementedError(
                'batch norm choice {} is not implemented'.format(bn))
        self.PredNet = DeepLab(
            num_classes=19,
            backbone=cfg_model['basenet']['version'],
            output_stride=16,
            bn=cfg_model['bn'],
            freeze_bn=True,
        ).cuda()
        self.load_PredNet(cfg, writer, logger, dir=None, net=self.PredNet)
        self.PredNet_DP = self.init_device(self.PredNet,
                                           gpu_id=self.default_gpu,
                                           whether_DP=True)
        self.PredNet.eval()
        self.PredNet_num = 0

        self.BaseNet = DeepLab(
            num_classes=19,
            backbone=cfg_model['basenet']['version'],
            output_stride=16,
            bn=cfg_model['bn'],
            freeze_bn=False,
        )

        logger.info('the backbone is {}'.format(
            cfg_model['basenet']['version']))

        self.BaseNet_DP = self.init_device(self.BaseNet,
                                           gpu_id=self.default_gpu,
                                           whether_DP=True)
        self.nets.extend([self.BaseNet])
        self.nets_DP = [self.BaseNet_DP]

        self.optimizers = []
        self.schedulers = []
        # optimizer_cls = get_optimizer(cfg)
        optimizer_cls = torch.optim.SGD
        optimizer_params = {
            k: v
            for k, v in cfg['training']['optimizer'].items() if k != 'name'
        }
        # optimizer_cls_D = torch.optim.SGD
        # optimizer_params_D = {k:v for k, v in cfg['training']['optimizer_D'].items()
        #                     if k != 'name'}
        self.BaseOpti = optimizer_cls(self.BaseNet.parameters(),
                                      **optimizer_params)
        self.optimizers.extend([self.BaseOpti])

        self.BaseSchedule = get_scheduler(self.BaseOpti,
                                          cfg['training']['lr_schedule'])
        self.schedulers.extend([self.BaseSchedule])
        self.setup(cfg, writer, logger)

        self.adv_source_label = 0
        self.adv_target_label = 1
        self.bceloss = nn.BCEWithLogitsLoss(reduction='mean')
        self.loss_fn = get_loss_function(cfg)
        self.mseloss = nn.MSELoss()
        self.l1loss = nn.L1Loss()
        self.smoothloss = nn.SmoothL1Loss()
        self.triplet_loss = nn.TripletMarginLoss()
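# init_device and load_PredNet are methods of this class that are not shown here.
# A rough, standalone sketch of the device setup the calls above appear to assume
# (illustrative only):
def init_device(net, gpu_id=0, whether_DP=False):
    # move the network to the selected GPU and optionally wrap it in DataParallel
    net = net.cuda(gpu_id)
    if whether_DP:
        net = torch.nn.DataParallel(net,
                                    device_ids=range(torch.cuda.device_count()))
    return net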
Example #26
0
File: train.py Project: gittigxuy/imet
def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir

    logger.info('=' * 50)

    train_loader, val_loader, test_loader = load_data(args.fold)
    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(model, (3, config.model.input_size, config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)

        freeze_layers(model)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)
        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)

    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)
        model_arch = last_checkpoint['arch'].replace('se_', 'se')

        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint.keys():
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint['epoch'] if 'epoch' in last_checkpoint.keys() else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr != 0:
            set_lr(optimizer, float(args.lr))
        elif 'lr' in config.optimizer.params:
            set_lr(optimizer, config.optimizer.params.lr)
        elif 'base_lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.base_lr)

    if not args.cosine:
        lr_scheduler = get_scheduler(config.scheduler, optimizer, last_epoch=
                                     (last_epoch if config.scheduler.name != 'cyclic_lr' else -1))
        assert config.scheduler2.name == ''  # a second scheduler is not used on this path, so lr_scheduler2 below stays None
        lr_scheduler2 = get_scheduler(config.scheduler2, optimizer, last_epoch=last_epoch) \
                        if config.scheduler2.name else None
    else:
        epoch_size = min(len(train_loader), config.train.max_steps_per_epoch) \
                     * config.train.batch_size

        set_lr(optimizer, float(config.cosine.start_lr))
        lr_scheduler = CosineLRWithRestarts(optimizer,
                                            batch_size=config.train.batch_size,
                                            epoch_size=epoch_size,
                                            restart_period=config.cosine.period,
                                            period_inc=config.cosine.period_inc,
                                            max_period=config.cosine.max_period)
        lr_scheduler2 = None

    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

        if not is_scheduler_continuous(lr_scheduler) and lr_scheduler2 is None:
            # if we have just reduced LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert last_checkpoint['arch'] == config.model.arch
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)
                last_lr = lr

        if config.train.lr_decay_coeff != 0 and epoch in config.train.lr_decay_milestones:
            n_cycles = config.train.lr_decay_milestones.index(epoch) + 1
            total_coeff = config.train.lr_decay_coeff ** n_cycles
            logger.info(f'artificial LR scheduler: made {n_cycles} cycles, decreasing LR by {total_coeff}')

            set_lr(optimizer, config.scheduler.params.base_lr * total_coeff)
            lr_scheduler = get_scheduler(config.scheduler, optimizer,
                                         coeff=total_coeff, last_epoch=-1)
                                         # (last_epoch if config.scheduler.name != 'cyclic_lr' else -1))

        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info('cosine annealing restarted, resetting the best metric')
                best_score = min(config.cosine.min_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch,
                    lr_scheduler, lr_scheduler2, config.train.max_steps_per_epoch)
        score, _, _ = validate(val_loader, model, epoch)

        if type(lr_scheduler) == ReduceLROnPlateau:
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        if type(lr_scheduler2) == ReduceLROnPlateau:
            lr_scheduler2.step(metrics=score)
        elif lr_scheduler2 and not is_scheduler_continuous(lr_scheduler2):
            lr_scheduler2.step()

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        if is_best:
            best_model_path = os.path.join(model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }

            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
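# get_lr / set_lr / is_scheduler_continuous are project helpers not shown in this
# excerpt. Minimal sketches of the two LR helpers the loop above relies on
# (illustrative only):
def get_lr(optimizer):
    # read the learning rate of the first parameter group
    return optimizer.param_groups[0]['lr']


def set_lr(optimizer, lr):
    # overwrite the learning rate of every parameter group
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr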
Example #27
0
    def __init__(self, cfg):
        """Construct a Unet generator
        Parameters in cfg:
            input_nc (int)  -- the number of channels in input images
            output_nc (int) -- the number of channels in output images
            num_downs (int) -- the number of downsamplings in UNet. For example, if |num_downs| == 7,
                                an image of size 128x128 will become of size 1x1 at the bottleneck
            ngf (int)       -- the number of filters in the last conv layer
            norm_layer      -- normalization layer

        We construct the U-Net from the innermost layer to the outermost layer.
        It is a recursive process.
        """
        super(Unet, self).__init__()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        input_nc = cfg['model']['input_nc']
        output_nc = cfg['model']['output_nc']
        num_downs = cfg['model']['num_downs']
        ngf = cfg['model']['ngf']
        norm_layer = nn.BatchNorm2d if cfg['model'][
            'norm_layer'] == 'batch' else nn.InstanceNorm2d
        use_dropout = cfg['model']['use_dropout']
        self.hook = []
        # construct unet structure
        unet_block = UnetSkipConnectionBlock(
            ngf * 8,
            ngf * 8,
            input_nc=None,
            submodule=None,
            norm_layer=norm_layer,
            innermost=True)  # add the innermost layer
        for i in range(num_downs -
                       5):  # add intermediate layers with ngf * 8 filters
            unet_block = UnetSkipConnectionBlock(ngf * 8,
                                                 ngf * 8,
                                                 input_nc=None,
                                                 submodule=unet_block,
                                                 norm_layer=norm_layer,
                                                 use_dropout=use_dropout)
        # gradually reduce the number of filters from ngf * 8 to ngf
        unet_block = UnetSkipConnectionBlock(ngf * 4,
                                             ngf * 8,
                                             input_nc=None,
                                             submodule=unet_block,
                                             norm_layer=norm_layer)
        unet_block = UnetSkipConnectionBlock(ngf * 2,
                                             ngf * 4,
                                             input_nc=None,
                                             submodule=unet_block,
                                             norm_layer=norm_layer)
        unet_block = UnetSkipConnectionBlock(ngf,
                                             ngf * 2,
                                             input_nc=None,
                                             submodule=unet_block,
                                             norm_layer=norm_layer)
        self.model = UnetSkipConnectionBlock(
            output_nc,
            ngf,
            input_nc=input_nc,
            submodule=unet_block,
            outermost=True)  # add the outermost layer
        self.inputsF = [
            Hook(layer) for layer in list(self.modules())
            if isinstance(layer, nn.Conv2d)
        ]
        self.out = None
        self.criterion = cross_entropy2d
        self.loss = None
        self.optimizer = get_optimizer(self.parameters(), cfg)
        if cfg["training"]["lr_schedule"] is not None:
            self.scheduler = get_scheduler(self.optimizer,
                                           cfg["training"]["lr_schedule"])
Example #28
0
def run(config_file, device_id, idx_fold):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
    print('info: use gpu No.{}'.format(device_id))

    config = load_config(config_file)

    # for n-folds loop
    if config.data.params.idx_fold == -1:
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir + '_fold{}'.format(idx_fold)
    elif config.data.params.idx_fold == 0:
        original_fold = int(config.work_dir.split('_fold')[1])
        if original_fold == idx_fold:
            raise Exception(
                'if you specify fold 0, you should use train.py or resume from fold 1.'
            )
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir.split('_fold')[0] + '_fold{}'.format(
            idx_fold)
    else:
        raise Exception('you should use train.py if idx_fold is specified.')
    print('info: training for fold {}'.format(idx_fold))

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            df_path=config.data.train_df_path,
            data_dir=config.data.train_dir,
            features=config.data.features,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            horizontal_flip=config.train.horizontal_flip,
            model_scale=config.data.model_scale,
            debug=config.debug,
            pseudo_path=config.data.pseudo_path,
        )
        for phase in ['train', 'valid']
    }

    # create a CenterNet-FPN model with a pre-trained encoder
    num_features = len(config.data.features)
    print('info: num_features =', num_features)
    model = CenterNetFPN(
        slug=config.model.encoder,
        num_classes=num_features,
    )

    optimizer = get_optimizer(model, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    # train setting
    criterion, callbacks = get_criterion_and_callback(config)

    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [OptimizerCallback(accumulation_steps=accumulation_steps)])

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=config.train.fp16,
    )
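# With OptimizerCallback(accumulation_steps=N), the optimizer only steps every N
# batches, so the effective batch size is roughly batch_size * N
# (here N = accumulation_size // batch_size). A manual equivalent of that pattern,
# for illustration only:
def train_with_accumulation(model, loader, criterion, optimizer, accumulation_steps):
    model.train()
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(loader):
        loss = criterion(model(inputs), targets) / accumulation_steps
        loss.backward()  # gradients accumulate across iterations
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()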
Example #29
0
def run(config_file):
    config = load_config(config_file)
    #adjust the working directory depending on whether we are running on Colab or Kaggle
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    #save the configuration to the working dir
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    #list the GPU ids to expose to this run
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    #our dataset has an explicit validation folder, use that later.
    all_transforms['valid'] = get_transforms(config.transforms.test)

    print("before rajat config", config.data.height, config.data.width)
    #fetch the dataloaders we need
    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           img_size=(config.data.height, config.data.width),
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    #creating the segmentation model with pre-trained encoder
    '''
    dumping the parameters for smp library
    encoder_name: str = "resnet34",
    encoder_depth: int = 5,
    encoder_weights: str = "imagenet",
    decoder_use_batchnorm: bool = True,
    decoder_channels: List[int] = (256, 128, 64, 32, 16),
    decoder_attention_type: Optional[str] = None,
    in_channels: int = 3,
    classes: int = 1,
    activation: Optional[Union[str, callable]] = None,
    aux_params: Optional[dict] = None,
    '''
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    #fetch the loss
    criterion = get_loss(config)
    params = [
        {
            'params': model.decoder.parameters(),
            'lr': config.optimizer.params.decoder_lr
        },
        {
            'params': model.encoder.parameters(),
            'lr': config.optimizer.params.encoder_lr
        },
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)
    '''
    dumping the catalyst supervised runner
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py

    model (Model): Torch model object
    device (Device): Torch device
    input_key (str): Key in batch dict mapping for model input
    output_key (str): Key in output dict model output
        will be stored under
    input_target_key (str): Key in batch dict mapping for target
    '''

    runner = SupervisedRunner(model=model, device=get_device())

    #Dice and IoU callbacks report segmentation metrics during training

    callbacks = [DiceCallback(), IouCallback()]

    #adding patience
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    '''
    optimizer.zero_grad() is applied after every accumulation_steps batches (gradient accumulation)
    '''
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend([
            CriterionCallback(),
            OptimizerCallback(accumulation_steps=accumulation_steps)
        ])

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))
    '''
    pudae, please add this callback:
    https://arxiv.org/pdf/1710.09412.pdf
    **srk adding the mixup callback
    '''
    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())
    '''@rajat implemented cutmix, a weighted combination of cutout and mixup'''
    '''
    rajat introducing the training loop
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    take care of NVIDIA fp16 mixed precision
    '''
    print(config.work_dir)
    print(config.train.minimize_metric)
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )
Example #30
0
def run(config_file):
    config = load_config(config_file)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre-trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    params = [
        {
            'params': model.decoder.parameters(),
            'lr': config.optimizer.params.decoder_lr
        },
        {
            'params': model.encoder.parameters(),
            'lr': config.optimizer.params.encoder_lr
        },
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [DiceCallback(), IouCallback()]

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/best_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())

    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )