def run(config):
    train_dir = config.train.dir

    model = get_model(config).cuda()
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model.parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            model, optimizer, checkpoint)
    else:
        last_epoch, step = -1, -1
    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))

    scheduler = get_scheduler(config, optimizer, last_epoch)

    # dataloaders = {split: get_dataloader(config, split, get_transform(config, split))
    #                for split in ['train', 'val']}
    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config, get_transform(config)),
        'val': get_valid_dataloaders(config)[0]
    }

    writer = SummaryWriter(train_dir)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          last_epoch + 1)

def run(config, folds_dir, balanced):
    model = get_model(config[MODEL_NAME], config[MODEL_PARAMS]).cuda()
    criterion = get_loss(config[LOSS_NAME], config[LOSS_PARAMS])
    optimizer = get_optimizer(config[OPTIM_NAME], model.parameters(),
                              optimizer_params=config[OPTIM_PARAMS])
    last_epoch = -1
    scheduler = get_scheduler(config[SCHEDULER_NAME], optimizer, last_epoch,
                              config[SCHEDULER_PARAMS])

    datasets = {
        stage: CustomDataset(folds_dir, stage, config[FOLD_ID],
                             config[DATA_PREFIX], config[INPUT_SIZE])
        for stage in ['train', 'test']
    }

    print('Loading sampler')
    if balanced:
        train_sampler = BalancedBatchSampler(datasets['train'])
    else:
        train_sampler = None
    print('Sampler loaded')

    dataloaders = {
        stage: get_dataloader(datasets[stage], config[BATCH_SIZE], train_sampler)
        for stage in ['train', 'test']
    }

    writer = SummaryWriter(config[TRAIN_DIR])
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          last_epoch + 1)

def run(config_file):
    config = load_config(config_file)

    os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            task='cls'
        )
        for phase in ['train', 'valid']
    }

    # create model
    model = CustomNet(config.model.encoder, config.data.num_classes)

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.base_params(), 'lr': config.optimizer.params.encoder_lr},
        {'params': model.fresh_params(), 'lr': config.optimizer.params.decoder_lr}
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [MultiClassAccuracyCallback(threshold=0.5), F1ScoreCallback()]
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir + '/checkpoints/best_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )

def run(config):
    model = get_model(config[MODEL_NAME], config[MODEL_PARAMS]).cuda()
    criterion = get_loss(config[LOSS_NAME], config[LOSS_PARAMS])
    optimizer = get_optimizer(config[OPTIM_NAME], model.parameters(),
                              optimizer_params=config[OPTIM_PARAMS])
    last_epoch = -1
    scheduler = get_scheduler(config[SCHEDULER_NAME], optimizer, last_epoch,
                              config[SCHEDULER_PARAMS])

    datasets = {
        stage: CustomDataset(DATA_DIR, stage, config[FOLD_ID],
                             config[DATA_PREFIX], config[INPUT_SIZE])
        for stage in ['train', 'test']
    }
    dataloaders = {
        stage: get_dataloader(datasets[stage], config[BATCH_SIZE])
        for stage in ['train', 'test']
    }

    writer = SummaryWriter(config[TRAIN_DIR])
    clip_grad_value_(model.parameters(), 2.0)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          last_epoch + 1)

def __init__(self, args, logger):
    self.args = args
    self.logger = logger
    self.writer = SummaryWriter(args.log_dir)
    cudnn.enabled = True

    # set up model
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = get_aux_net(args.network.arch)(aux_classes=args.aux_classes + 1,
                                                classes=args.n_classes)
    self.model = self.model.to(self.device)
    wandb.watch(self.model)

    if args.mode == 'train':
        # set up optimizer, lr scheduler and loss functions
        optimizer = get_optimizer(self.args.training.optimizer)
        optimizer_params = {k: v for k, v in self.args.training.optimizer.items()
                            if k != "name"}
        self.optimizer = optimizer(self.model.parameters(), **optimizer_params)
        self.scheduler = get_scheduler(self.optimizer,
                                       self.args.training.lr_scheduler)

        self.class_loss_func = nn.CrossEntropyLoss()

        self.start_iter = 0

        # resume
        if args.training.resume:
            self.load(args.model_dir + '/' + args.training.resume)

        cudnn.benchmark = True
    elif args.mode == 'val':
        self.load(os.path.join(args.model_dir, args.validation.model))
    else:
        self.load(os.path.join(args.model_dir, args.testing.model))

def run(config):
    train_dir = config.train.dir

    model = get_model(config, model_type).to(device)
    print('The number of parameters : %d' % count_parameters(model))
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model)

    checkpoint = utils.checkpoint.get_initial_checkpoint(config,
                                                         model_type=model_type)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            model, optimizer, checkpoint, model_type=model_type)
    else:
        last_epoch, step = -1, -1
    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))

    scheduler = get_scheduler(config, optimizer, last_epoch)

    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config),
        'val': get_valid_dataloader(config),
        'test': get_test_dataloader(config)
    }

    writer = SummaryWriter(config.train[model_type + '_dir'])
    visualizer = get_visualizer(config)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer,
          visualizer, last_epoch + 1)

def run(self):
    # checkpoint
    self.scheduler = get_scheduler(self.config, self.optimizer, self.last_epoch)
    self.model.train()
    postfix_dic = {
        'lr': 0.0,
        'acc': 0.0,
        'loss': 0.0,
    }
    if self.config.data.sampler == "weight":
        self.train_weigh()
    else:
        for epoch in range(self.last_epoch, self.num_epochs):
            self.train_single_epoch(epoch)
            if epoch % 200 == 199:
                save_checkpoint(self.config, self.model, self.optimizer,
                                self.optimizer_center, epoch, self.step)
            self.scheduler.step()
            if epoch > self.config.train.num_epochs:
                break

def run(config):
    teacher_model = get_model(config, 'teacher').to(device)
    criterion = get_loss(config)

    # for teacher
    trainable_params = filter(lambda p: p.requires_grad, teacher_model.parameters())
    optimizer_t = get_optimizer(config, teacher_model.parameters())
    checkpoint_t = utils.checkpoint.get_initial_checkpoint(
        config, model_type='teacher')
    if checkpoint_t is not None:
        last_epoch_t, step_t = utils.checkpoint.load_checkpoint(
            teacher_model, optimizer_t, checkpoint_t, model_type='teacher')
    else:
        last_epoch_t, step_t = -1, -1
    print('teacher model from checkpoint: {} last epoch:{}'.format(
        checkpoint_t, last_epoch_t))

    scheduler_t = get_scheduler(config, optimizer_t, last_epoch_t)

    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config),
        'val': get_valid_dataloader(config),
        'test': get_test_dataloader(config)
    }

    writer = SummaryWriter(config.train['teacher' + '_dir'])
    visualizer = get_visualizer(config)
    train(config, teacher_model, dataloaders, criterion, optimizer_t, scheduler_t,
          writer, visualizer, last_epoch_t + 1)

def run(config):
    train_dir = config.train.dir

    task = get_task(config)
    optimizer = get_optimizer(config, task.get_model().parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            task.get_model(), optimizer, checkpoint)
    else:
        last_epoch, step = -1, -1
    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))

    scheduler = get_scheduler(config, optimizer, last_epoch)

    preprocess_opt = task.get_preprocess_opt()
    dataloaders = {
        split: get_dataloader(config, split,
                              get_transform(config, split, **preprocess_opt))
        for split in ['train', 'dev']
    }

    writer = SummaryWriter(config.train.dir)
    train(config, task, dataloaders, optimizer, scheduler, writer, last_epoch + 1)

def run(config):
    train_dir = config.train.dir

    student_model = get_model(config, model_type).to(device)
    criterion = get_loss(config)

    trainable_params = filter(lambda p: p.requires_grad, student_model.parameters())
    optimizer = get_optimizer(config, trainable_params)

    checkpoint = utils.checkpoint.get_initial_checkpoint(config,
                                                         model_type=model_type)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(
            student_model, optimizer, checkpoint, model_type=model_type)
    else:
        last_epoch, step = -1, -1
    print('student model from checkpoint: {} last epoch:{}'.format(
        checkpoint, last_epoch))

    scheduler = get_scheduler(config, optimizer, last_epoch)

    print(config.data)
    dataloaders = {
        'train': get_train_dataloader(config),
        'val': get_valid_dataloader(config),
        'test': get_test_dataloader(config)
    }

    writer = SummaryWriter(config.train['student' + '_dir'])
    visualizer = get_visualizer(config)
    train(config, student_model, dataloaders, criterion, optimizer, scheduler,
          writer, visualizer, last_epoch + 1)

def _get_scheduler(self, optimizer):
    scheduler = get_scheduler(
        optimizer,
        self.config['runner']['total_steps'],
        self.config['scheduler']
    )
    self._load_weight(scheduler, 'Scheduler')
    return scheduler

def __init__(self, config):
    """Initialize Trainer

    Args:
        config (dict): Configuration dictionary
    """
    super(Trainer, self).__init__()

    # Define multi-task setting
    dataset = config['dataset']
    dataset_name = dataset['dataset_name']
    self.tasks_weighting = dataset['tasks_weighting']
    self.tasks = [k for k, v in self.tasks_weighting.items()]

    # Setup network
    model_config = config['model']
    self.model = get_module(model_config, dataset_name, self.tasks)
    print('Model constructed for {}'.format(' '.join(self.tasks)))
    if 'grouping' in model_config:
        print('groups = {}'.format(model_config['grouping']['groups']))
        print('grouping method = {}'.format(model_config['grouping']['method']))
    self.model = update_module(config, self.model, self.tasks)

    # Setup for a task-conditional setting
    model_params = config['model']['parameters']
    if 'common_mt_params' in model_params:
        self.task_conditional = not model_params['common_mt_params']
    else:
        self.task_conditional = False

    # Setup optimizers
    optimizer_config = config['optimizer']
    optimizer_cls = get_optimizer(optimizer_config['algorithm'])
    model_params = get_params(self.model, optimizer_config['parameters']['lr'],
                              len(self.tasks), self.task_conditional, self.tasks)
    self.optimizer = optimizer_cls(model_params, **optimizer_config['parameters'])

    # Setup schedulers
    scheduler_config = config['scheduler']
    scheduler_cls = get_scheduler(scheduler_config['lr_policy'])
    self.scheduler = scheduler_cls(self.optimizer, **scheduler_config['parameters'])

    # Setup loss function
    losses_config = config['loss']
    self.criterions = get_loss_functions(self.tasks, losses_config)

    # Initialise performance meters
    self.best_val_loss = 1e9
    self.train_loss = {}
    self.val_loss = {}
    for task in self.tasks:
        self.train_loss[task] = get_running_meter()
        self.val_loss[task] = get_running_meter()

    # Initialize img logging for visualization
    self.img_logging = get_img_logging(dataset_name, self.tasks)
    self.pred_decoder = get_pred_decoder(dataset_name, self.tasks)

def _get_scheduler(self, optimizer):
    scheduler = get_scheduler(optimizer,
                              self.config['runner']['total_steps'],
                              self.config['scheduler'])
    init_scheduler = self.init_ckpt.get('Scheduler')
    if init_scheduler:
        print('[Runner] - Loading scheduler weights from the previous experiment')
        scheduler.load_state_dict(init_scheduler)
    return scheduler

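# NOTE: hypothetical helper, not taken from any of the snippets above. A minimal
# sketch of the get_scheduler(optimizer, total_steps, scheduler_config) factory that
# the two _get_scheduler methods assume: linear warmup followed by linear decay,
# built on torch's LambdaLR. The config key 'warmup_steps' is an illustrative
# assumption, not a documented interface.
from torch.optim.lr_scheduler import LambdaLR


def get_scheduler(optimizer, total_steps, scheduler_config):
    warmup_steps = scheduler_config.get('warmup_steps', 0)

    def lr_lambda(step):
        # ramp up linearly during warmup, then decay linearly towards zero
        if step < warmup_steps:
            return float(step) / max(1, warmup_steps)
        return max(0.0, float(total_steps - step) / max(1, total_steps - warmup_steps))

    return LambdaLR(optimizer, lr_lambda)
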
def search_once(config, policy):
    model = get_model(config).cuda()
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model.parameters())
    scheduler = get_scheduler(config, optimizer, -1)
    transforms = {'train': get_transform(config, 'train',
                                         params={'policies': policy}),
                  'val': get_transform(config, 'val')}
    dataloaders = {split: get_dataloader(config, split, transforms[split])
                   for split in ['train', 'val']}
    score_dict = train(config, model, dataloaders, criterion, optimizer, scheduler,
                       None, 0)
    return score_dict['f1_mavg']

def run(config):
    teacher_model = get_model(config, 'teacher').to(device)
    student_model = get_model(config, 'student').to(device)
    print('The number of parameters : %d' % count_parameters(student_model))
    criterion = get_loss(config)

    # for teacher
    optimizer_t = None
    checkpoint_t = utils.checkpoint.get_initial_checkpoint(config,
                                                           model_type='teacher')
    if checkpoint_t is not None:
        last_epoch_t, step_t = utils.checkpoint.load_checkpoint(
            teacher_model, optimizer_t, checkpoint_t, model_type='teacher')
    else:
        last_epoch_t, step_t = -1, -1
    print('teacher model from checkpoint: {} last epoch:{}'.format(
        checkpoint_t, last_epoch_t))

    # for student
    optimizer_s = get_optimizer(config, student_model)
    checkpoint_s = utils.checkpoint.get_initial_checkpoint(config,
                                                           model_type='student')
    if checkpoint_s is not None:
        last_epoch_s, step_s = utils.checkpoint.load_checkpoint(
            student_model, optimizer_s, checkpoint_s, model_type='student')
    else:
        last_epoch_s, step_s = -1, -1
    print('student model from checkpoint: {} last epoch:{}'.format(
        checkpoint_s, last_epoch_s))

    scheduler_s = get_scheduler(config, optimizer_s, last_epoch_s)

    print(config.data)
    dataloaders = {'train': get_train_dataloader(config, get_transform(config)),
                   'val': get_valid_dataloader(config)}
    # 'test': get_test_dataloader(config)}

    writer = SummaryWriter(config.train['student' + '_dir'])
    visualizer = get_visualizer(config)
    result = train(config, student_model, teacher_model, dataloaders, criterion,
                   optimizer_s, scheduler_s, writer, visualizer, last_epoch_s + 1)
    print('best psnr : %.3f, best epoch: %d' % (result['best_psnr'],
                                                result['best_epoch']))

def run(config):
    train_group_csv_dir = './data/group_csv/'
    writer = SummaryWriter(config.train.dir)
    train_filenames = list(glob.glob(os.path.join(train_group_csv_dir,
                                                  'data_train_group_*')))[1:]
    for ti, train_file in tqdm.tqdm(enumerate(train_filenames)):
        gi_tr = train_file.replace('data_train_group_', '')
        gi_tr = gi_tr.split('/')[-1]
        gi_tr = gi_tr.replace('.csv', '')
        group_idx = int(gi_tr)

        utils.prepare_train_directories(config, group_idx)
        model = get_model(config, group_idx)
        if torch.cuda.is_available():
            model = model.cuda()
        criterion = get_loss(config)
        optimizer = get_optimizer(config, model.parameters())

        checkpoint = utils.checkpoint.get_initial_checkpoint(config, group_idx)
        if checkpoint is not None:
            last_epoch, step = utils.checkpoint.load_checkpoint(model, optimizer,
                                                                checkpoint)
        else:
            last_epoch, step = -1, -1
        if last_epoch > config.train.num_epochs:
            print('group -- ', str(group_idx), '-- index-', ti,
                  ' ---- already trained, skipping')
            continue
        print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))
        print('group -- ', str(group_idx), '-- index-', ti)

        scheduler = get_scheduler(config, optimizer, last_epoch)
        dataloaders = {split: get_dataloader(config, group_idx, split,
                                             get_transform(config, split))
                       for split in ['train', 'val']}
        train(config, group_idx, model, dataloaders, criterion, optimizer,
              scheduler, writer, last_epoch + 1)

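# NOTE: hypothetical helper, not taken from the snippets above, whose get_optimizer
# signatures actually vary. A minimal sketch of the get_optimizer(config, parameters)
# variant used by several run() functions, assuming an attribute-style config with
# config.optimizer.name and config.optimizer.params (illustrative keys only).
import torch.optim as optim


def get_optimizer(config, parameters):
    name = config.optimizer.name.lower()
    params = dict(config.optimizer.params)
    optimizers = {
        'sgd': optim.SGD,      # expects lr, momentum, weight_decay, ...
        'adam': optim.Adam,    # expects lr, betas, weight_decay, ...
        'adamw': optim.AdamW,
    }
    return optimizers[name](parameters, **params)
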
def __init__(self, cfg, writer, logger, use_pseudo_label=False, modal_num=3, multimodal_merger=multimodal_merger): self.cfg = cfg self.writer = writer self.class_numbers = 19 self.logger = logger cfg_model = cfg['model'] self.cfg_model = cfg_model self.best_iou = -100 self.iter = 0 self.nets = [] self.split_gpu = 0 self.default_gpu = cfg['model']['default_gpu'] self.PredNet_Dir = None self.valid_classes = cfg['training']['valid_classes'] self.G_train = True self.cls_feature_weight = cfg['training']['cls_feature_weight'] self.use_pseudo_label = use_pseudo_label self.modal_num = modal_num # cluster vectors & cuda initialization self.objective_vectors_group = torch.zeros(self.modal_num + 1, 19, 256).cuda() self.objective_vectors_num_group = torch.zeros(self.modal_num + 1, 19).cuda() self.objective_vectors_dis_group = torch.zeros(self.modal_num + 1, 19, 19).cuda() self.class_threshold_group = torch.full([self.modal_num + 1, 19], 0.6).cuda() self.disc_T = torch.FloatTensor([0.0]).cuda() #self.metrics = CustomMetrics(self.class_numbers) self.metrics = CustomMetrics(self.class_numbers, modal_num=self.modal_num, model=self) # multimodal / multi-branch merger self.multimodal_merger = multimodal_merger bn = cfg_model['bn'] if bn == 'sync_bn': BatchNorm = SynchronizedBatchNorm2d elif bn == 'bn': BatchNorm = nn.BatchNorm2d elif bn == 'gn': BatchNorm = nn.GroupNorm else: raise NotImplementedError('batch norm choice {} is not implemented'.format(bn)) if True: self.PredNet = DeepLab( num_classes=19, backbone=cfg_model['basenet']['version'], output_stride=16, bn=cfg_model['bn'], freeze_bn=True, modal_num=self.modal_num ).cuda() self.load_PredNet(cfg, writer, logger, dir=None, net=self.PredNet) self.PredNet_DP = self.init_device(self.PredNet, gpu_id=self.default_gpu, whether_DP=True) self.PredNet.eval() self.PredNet_num = 0 self.PredDnet = FCDiscriminator(inplanes=19) self.load_PredDnet(cfg, writer, logger, dir=None, net=self.PredDnet) self.PredDnet_DP = self.init_device(self.PredDnet, gpu_id=self.default_gpu, whether_DP=True) self.PredDnet.eval() self.BaseNet = DeepLab( num_classes=19, backbone=cfg_model['basenet']['version'], output_stride=16, bn=cfg_model['bn'], freeze_bn=True, modal_num=self.modal_num ) logger.info('the backbone is {}'.format(cfg_model['basenet']['version'])) self.BaseNet_DP = self.init_device(self.BaseNet, gpu_id=self.default_gpu, whether_DP=True) self.nets.extend([self.BaseNet]) self.nets_DP = [self.BaseNet_DP] # Discriminator self.SOURCE_LABEL = 0 self.TARGET_LABEL = 1 self.DNets = [] self.DNets_DP = [] for _ in range(self.modal_num+1): _net_d = FCDiscriminator(inplanes=19) self.DNets.append(_net_d) _net_d_DP = self.init_device(_net_d, gpu_id=self.default_gpu, whether_DP=True) self.DNets_DP.append(_net_d_DP) self.nets.extend(self.DNets) self.nets_DP.extend(self.DNets_DP) self.optimizers = [] self.schedulers = [] optimizer_cls = torch.optim.SGD optimizer_params = {k:v for k, v in cfg['training']['optimizer'].items() if k != 'name'} optimizer_cls_D = torch.optim.Adam optimizer_params_D = {k:v for k, v in cfg['training']['optimizer_D'].items() if k != 'name'} if False: self.BaseOpti = optimizer_cls(self.BaseNet.parameters(), **optimizer_params) else: self.BaseOpti = optimizer_cls(self.BaseNet.optim_parameters(cfg['training']['optimizer']['lr']), **optimizer_params) self.optimizers.extend([self.BaseOpti]) self.DiscOptis = [] for _d_net in self.DNets: self.DiscOptis.append( optimizer_cls_D(_d_net.parameters(), **optimizer_params_D) ) self.optimizers.extend(self.DiscOptis) self.schedulers 
= [] if False: self.BaseSchedule = get_scheduler(self.BaseOpti, cfg['training']['lr_schedule']) self.schedulers.extend([self.BaseSchedule]) else: """BaseSchedule detail see FUNC: scheduler_step()""" self.learning_rate = cfg['training']['optimizer']['lr'] self.gamma = cfg['training']['lr_schedule']['gamma'] self.num_steps = cfg['training']['lr_schedule']['max_iter'] self._BaseSchedule_nouse = get_scheduler(self.BaseOpti, cfg['training']['lr_schedule']) self.schedulers.extend([self._BaseSchedule_nouse]) self.DiscSchedules = [] for _disc_opt in self.DiscOptis: self.DiscSchedules.append( get_scheduler(_disc_opt, cfg['training']['lr_schedule']) ) self.schedulers.extend(self.DiscSchedules) self.setup(cfg, writer, logger) self.adv_source_label = 0 self.adv_target_label = 1 self.bceloss = nn.BCEWithLogitsLoss(reduce=False) self.loss_fn = get_loss_function(cfg) pseudo_cfg = copy.deepcopy(cfg) pseudo_cfg['training']['loss']['name'] = 'cross_entropy4d' self.pseudo_loss_fn = get_loss_function(pseudo_cfg) self.mseloss = nn.MSELoss() self.l1loss = nn.L1Loss() self.smoothloss = nn.SmoothL1Loss() self.triplet_loss = nn.TripletMarginLoss() self.kl_distance = nn.KLDivLoss(reduction='none')
def run() -> float: np.random.seed(0) model_dir = config.experiment_dir logger.info('=' * 50) # logger.info(f'hyperparameters: {params}') train_loader, val_loader, test_loader, label_encoder = load_data(args.fold) model = create_model() optimizer = get_optimizer(config, model.parameters()) lr_scheduler = get_scheduler(config, optimizer) lr_scheduler2 = get_scheduler( config, optimizer) if config.scheduler2.name else None criterion = get_loss(config) if args.weights is None: last_epoch = 0 logger.info(f'training will start from epoch {last_epoch+1}') else: last_checkpoint = torch.load(args.weights) assert last_checkpoint['arch'] == config.model.arch model.load_state_dict(last_checkpoint['state_dict']) optimizer.load_state_dict(last_checkpoint['optimizer']) logger.info(f'checkpoint {args.weights} was loaded.') last_epoch = last_checkpoint['epoch'] logger.info(f'loaded the model from epoch {last_epoch}') if args.lr_override != 0: set_lr(optimizer, float(args.lr_override)) elif 'lr' in config.scheduler.params: set_lr(optimizer, config.scheduler.params.lr) if args.gen_predict: print('inference mode') generate_submission(val_loader, test_loader, model, label_encoder, last_epoch, args.weights) sys.exit(0) if args.gen_features: print('inference mode') generate_features(test_loader, model, args.weights) sys.exit(0) best_score = 0.0 best_epoch = 0 last_lr = get_lr(optimizer) best_model_path = args.weights for epoch in range(last_epoch + 1, config.train.num_epochs + 1): logger.info('-' * 50) # if not is_scheduler_continuous(config.scheduler.name): # # if we have just reduced LR, reload the best saved model # lr = get_lr(optimizer) # logger.info(f'learning rate {lr}') # # if lr < last_lr - 1e-10 and best_model_path is not None: # last_checkpoint = torch.load(os.path.join(model_dir, best_model_path)) # assert(last_checkpoint['arch']==config.model.arch) # model.load_state_dict(last_checkpoint['state_dict']) # optimizer.load_state_dict(last_checkpoint['optimizer']) # logger.info(f'checkpoint {best_model_path} was loaded.') # set_lr(optimizer, lr) # last_lr = lr # # if lr < config.train.min_lr * 1.01: # logger.info('reached minimum LR, stopping') # break get_lr(optimizer) train(train_loader, model, criterion, optimizer, epoch, lr_scheduler, lr_scheduler2) score = validate(val_loader, model, epoch) if not is_scheduler_continuous(config.scheduler.name): lr_scheduler.step(score) if lr_scheduler2 and not is_scheduler_continuous( config.scheduler.name): lr_scheduler2.step(score) is_best = score > best_score best_score = max(score, best_score) if is_best: best_epoch = epoch data_to_save = { 'epoch': epoch, 'arch': config.model.arch, 'state_dict': model.state_dict(), 'best_score': best_score, 'score': score, 'optimizer': optimizer.state_dict(), 'options': config } filename = config.version if is_best: best_model_path = f'{filename}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth' save_checkpoint(data_to_save, best_model_path, model_dir) logger.info(f'best score: {best_score:.04f}') return -best_score
def train(cfg, writer, logger): # This statement must be declared before using pytorch use_cuda = False if cfg.get("cuda", None) is not None: if cfg.get("cuda", None) != "all": os.environ["CUDA_VISIBLE_DEVICES"] = cfg.get("cuda", None) use_cuda = torch.cuda.is_available() # Setup random seed seed = cfg["training"].get("seed", random.randint(1, 10000)) torch.manual_seed(seed) if use_cuda: torch.cuda.manual_seed(seed) np.random.seed(seed) random.seed(seed) # Setup Dataloader train_loader, val_loader = get_loader(cfg) # Setup Model model = get_model(cfg) # writer.add_graph(model, torch.rand([1, 3, 224, 224])) if use_cuda and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model, device_ids=list( range(torch.cuda.device_count()))) # Setup optimizer, lr_scheduler and loss function optimizer = get_optimizer(model.parameters(), cfg) scheduler = get_scheduler(optimizer, cfg) loss_fn = get_loss_fn(cfg) # Setup Metrics epochs = cfg["training"]["epochs"] recorder = RecorderMeter(epochs) start_epoch = 0 # save model parameters every <n> epochs save_interval = cfg["training"]["save_interval"] if use_cuda: model.cuda() loss_fn.cuda() # Resume Trained Model resume_path = os.path.join(writer.file_writer.get_logdir(), cfg["training"]["resume"]) best_path = os.path.join(writer.file_writer.get_logdir(), cfg["training"]["best_model"]) if cfg["training"]["resume"] is not None: if os.path.isfile(resume_path): logger.info( "Loading model and optimizer from checkpoint '{}'".format( resume_path)) checkpoint = torch.load(resume_path) state = checkpoint["state_dict"] if torch.cuda.device_count() <= 1: state = convert_state_dict(state) model.load_state_dict(state) optimizer.load_state_dict(checkpoint["optimizer"]) scheduler.load_state_dict(checkpoint["scheduler"]) start_epoch = checkpoint["epoch"] recorder = checkpoint['recorder'] logger.info("Loaded checkpoint '{}' (epoch {})".format( resume_path, checkpoint["epoch"])) else: logger.info("No checkpoint found at '{}'".format(resume_path)) epoch_time = AverageMeter() for epoch in range(start_epoch, epochs): start_time = time.time() need_hour, need_mins, need_secs = convert_secs2time(epoch_time.avg * (epochs - epoch)) need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format( need_hour, need_mins, need_secs) logger.info( '\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:8.6f}]'. format(time_string(), epoch, epochs, need_time, optimizer. param_groups[0]['lr']) + # scheduler.get_last_lr() >=1.4 ' [Best : Accuracy={:.2f}]'.format(recorder.max_accuracy(False))) train_acc, train_los = train_epoch(train_loader, model, loss_fn, optimizer, use_cuda, logger) val_acc, val_los = validate_epoch(val_loader, model, loss_fn, use_cuda, logger) scheduler.step() is_best = recorder.update(epoch, train_los, train_acc, val_los, val_acc) if is_best or epoch % save_interval == 0 or epoch == epochs - 1: # save model (resume model and best model) save_checkpoint( { 'epoch': epoch + 1, 'recorder': recorder, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }, is_best, best_path, resume_path) for name, param in model.named_parameters(): # save histogram writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch) writer.add_scalar('Train/loss', train_los, epoch) # save curves writer.add_scalar('Train/acc', train_acc, epoch) writer.add_scalar('Val/loss', val_los, epoch) writer.add_scalar('Val/acc', val_acc, epoch) epoch_time.update(time.time() - start_time) writer.close()
def train(cfg): # Setup seeds torch.manual_seed(cfg.get('seed', 1337)) torch.cuda.manual_seed(cfg.get('seed', 1337)) np.random.seed(cfg.get('seed', 1337)) random.seed(cfg.get('seed', 1337)) # Setup device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Setup Augmentations augmentations = cfg['training'].get('augmentations', None) data_aug = get_composed_augmentations(augmentations) # Setup Dataloader data_loader = get_loader(cfg['data']['dataset']) data_path = cfg['data']['path'] t_loader = data_loader( data_path, is_transform=True, split=cfg['data']['train_split'], #img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']), augmentations=data_aug) v_loader = data_loader( data_path, is_transform=True, split=cfg['data']['val_split'], #img_size=(cfg['data']['img_rows'], cfg['data']['img_cols']), ) n_classes = t_loader.n_classes trainloader = data.DataLoader(t_loader, batch_size=cfg['training']['batch_size'], num_workers=cfg['training']['n_workers'], shuffle=True) valloader = data.DataLoader(v_loader, batch_size=cfg['training']['batch_size'], num_workers=cfg['training']['n_workers']) # Setup Metrics running_metrics_val = runningScore(n_classes) # Setup Model model = get_model(cfg['model'], n_classes).to(device) model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count())) # Setup optimizer, lr_scheduler and loss function optimizer_cls = get_optimizer(cfg) optimizer_params = {k:v for k, v in cfg['training']['optimizer'].items() if k != 'name'} optimizer = optimizer_cls(model.parameters(), **optimizer_params) scheduler = get_scheduler(optimizer, cfg['training']['lr_schedule']) loss_fn = get_loss_function(cfg) start_iter = 0 if cfg['training']['resume'] is not None: if os.path.isfile(cfg['training']['resume']): checkpoint = torch.load(cfg['training']['resume']) model.load_state_dict(checkpoint["model_state"]) optimizer.load_state_dict(checkpoint["optimizer_state"]) scheduler.load_state_dict(checkpoint["scheduler_state"]) start_iter = checkpoint["epoch"] print("=====>", "Loaded checkpoint '{}' (iter {})".format( cfg['training']['resume'], checkpoint["epoch"] ) ) else: print("=====>","No checkpoint found at '{}'".format(cfg['training']['resume'])) val_loss_meter = averageMeter() time_meter = averageMeter() best_iou = -100.0 i = start_iter flag = True while i <= cfg['training']['train_iters'] and flag: for (images, labels) in trainloader: i += 1 start_ts = time.time() scheduler.step() model.train() images = images.to(device) labels = labels.to(device) optimizer.zero_grad() outputs = model(images) loss = loss_fn(input=outputs, target=labels) loss.backward() optimizer.step() time_meter.update(time.time() - start_ts) if (i + 1) % cfg['training']['print_interval'] == 0: fmt_str = "Iter [{:d}/{:d}] Loss: {:.4f} Time/Image: {:.4f}" print_str = fmt_str.format(i + 1, cfg['training']['train_iters'], loss.item(), time_meter.avg / cfg['training']['batch_size']) print(print_str) time_meter.reset() if (i + 1) % cfg['training']['val_interval'] == 0 or \ (i + 1) == cfg['training']['train_iters']: model.eval() with torch.no_grad(): for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)): images_val = images_val.to(device) labels_val = labels_val.to(device) outputs = model(images_val) val_loss = loss_fn(input=outputs, target=labels_val) pred = outputs.data.max(1)[1].cpu().numpy() gt = labels_val.data.cpu().numpy() running_metrics_val.update(gt, pred) val_loss_meter.update(val_loss.item()) print("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg)) score, 
class_iou = running_metrics_val.get_scores() for k, v in score.items(): print(k,':',v) for k, v in class_iou.items(): print('{}: {}'.format(k, v)) val_loss_meter.reset() running_metrics_val.reset() if score["Mean IoU : \t"] >= best_iou: best_iou = score["Mean IoU : \t"] state = { "epoch": i + 1, "model_state": model.state_dict(), "optimizer_state": optimizer.state_dict(), "scheduler_state": scheduler.state_dict(), "best_iou": best_iou, } save_path = os.path.join('./checkpoint', "{}_{}_best_model.pkl".format( cfg['model']['arch'], cfg['data']['dataset'])) print("saving···") torch.save(state, save_path) if (i + 1) == cfg['training']['train_iters']: flag = False break
def train(cfg, logger): # Setup Seeds torch.manual_seed(cfg.get("seed", 1337)) torch.cuda.manual_seed(cfg.get("seed", 1337)) np.random.seed(cfg.get("seed", 1337)) random.seed(cfg.get("seed", 1337)) # Setup Device device = torch.device("cuda:{}".format(cfg["training"]["gpu_idx"]) if torch.cuda.is_available() else "cpu") # Setup Augmentations augmentations = cfg["training"].get("augmentations", None) # Setup Dataloader data_loader = get_loader(cfg["data"]["dataset"]) data_path = cfg["data"]["path"] t_loader = data_loader( data_path, split=cfg["data"]["train_split"], ) v_loader = data_loader( data_path, split=cfg["data"]["val_split"], ) n_classes = t_loader.n_classes n_val = len(v_loader.files['val']) trainloader = data.DataLoader( t_loader, batch_size=cfg["training"]["batch_size"], num_workers=cfg["training"]["n_workers"], shuffle=True, ) valloader = data.DataLoader(v_loader, batch_size=cfg["training"]["batch_size"], num_workers=cfg["training"]["n_workers"]) # Setup Metrics running_metrics_val = runningScore(n_classes, n_val) # Setup Model model = get_model(cfg["model"], n_classes).to(device) model = torch.nn.DataParallel(model, device_ids=[cfg["training"]["gpu_idx"]]) # Setup Optimizer, lr_scheduler and Loss Function optimizer_cls = get_optimizer(cfg) optimizer_params = { k: v for k, v in cfg["training"]["optimizer"].items() if k != "name" } optimizer = optimizer_cls(model.parameters(), **optimizer_params) logger.info("Using optimizer {}".format(optimizer)) scheduler = get_scheduler(optimizer, cfg["training"]["lr_schedule"]) loss_fn = get_loss_function(cfg) logger.info("Using loss {}".format(loss_fn)) # Resume Trained Model if cfg["training"]["resume"] is not None: if os.path.isfile(cfg["training"]["resume"]): logger.info( "Loading model and optimizer from checkpoint '{}'".format( cfg["training"]["resume"])) checkpoint = torch.load(cfg["training"]["resume"]) model.load_state_dict(checkpoint["model_state"]) optimizer.load_state_dict(checkpoint["optimizer_state"]) scheduler.load_state_dict(checkpoint["scheduler_state"]) start_iter = checkpoint["epoch"] logger.info("Loaded checkpoint '{}' (iter {})".format( cfg["training"]["resume"], checkpoint["epoch"])) else: logger.info("No checkpoint found at '{}'".format( cfg["training"]["resume"])) # Start Training val_loss_meter = averageMeter() time_meter = averageMeter() start_iter = 0 best_dice = -100.0 i = start_iter flag = True while i <= cfg["training"]["train_iters"] and flag: for (images, labels, img_name) in trainloader: i += 1 start_ts = time.time() scheduler.step() model.train() images = images.to(device) labels = labels.to(device) optimizer.zero_grad() outputs = model(images) loss = loss_fn(input=outputs, target=labels) loss.backward() optimizer.step() time_meter.update(time.time() - start_ts) # print train loss if (i + 1) % cfg["training"]["print_interval"] == 0: fmt_str = "Iter [{:d}/{:d}] Loss: {:.4f} Time/Image: {:.4f}" print_str = fmt_str.format( i + 1, cfg["training"]["train_iters"], loss.item(), time_meter.avg / cfg["training"]["batch_size"], ) print(print_str) logger.info(print_str) time_meter.reset() # validation if (i + 1) % cfg["training"]["val_interval"] == 0 or ( i + 1) == cfg["training"]["train_iters"]: model.eval() with torch.no_grad(): for i_val, (images_val, labels_val, img_name_val) in tqdm(enumerate(valloader)): images_val = images_val.to(device) labels_val = labels_val.to(device) outputs = model(images_val) val_loss = loss_fn(input=outputs, target=labels_val) pred = outputs.data.max(1)[1].cpu().numpy() gt = 
labels_val.data.cpu().numpy() running_metrics_val.update(gt, pred, i_val) val_loss_meter.update(val_loss.item()) logger.info("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg)) # print val metrics score, class_dice = running_metrics_val.get_scores() for k, v in score.items(): print(k, v) logger.info("{}: {}".format(k, v)) for k, v in class_dice.items(): logger.info("{}: {}".format(k, v)) val_loss_meter.reset() running_metrics_val.reset() # save model if score["Dice : \t"] >= best_dice: best_dice = score["Dice : \t"] state = { "epoch": i + 1, "model_state": model.state_dict(), "optimizer_state": optimizer.state_dict(), "scheduler_state": scheduler.state_dict(), "best_dice": best_dice, } save_path = os.path.join( cfg["training"]["model_dir"], "{}_{}.pkl".format(cfg["model"]["arch"], cfg["data"]["dataset"]), ) torch.save(state, save_path) if (i + 1) == cfg["training"]["train_iters"]: flag = False break
def train(cfg, writer, logger): # Setup seeds torch.manual_seed(cfg.get("seed", 1337)) torch.cuda.manual_seed(cfg.get("seed", 1337)) np.random.seed(cfg.get("seed", 1337)) random.seed(cfg.get("seed", 1337)) # Setup device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Setup Dataloader trainloader = get_loader(cfg, "train") valloader = get_loader(cfg, "val") n_classes = cfg["data"]["n_classes"] n_channels = cfg["data"]["channels"] # Setup Metrics running_metrics_val = runningScore(n_classes) # Setup Model model = get_model(cfg, n_classes, n_channels).to(device) model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count())) # Setup optimizer, lr_scheduler and loss function optimizer_cls = get_optimizer(cfg) optimizer_params = { k: v for k, v in cfg["training"]["optimizer"].items() if k != "name" } optimizer = optimizer_cls(model.parameters(), **optimizer_params) logger.info("Using optimizer {}".format(optimizer)) scheduler = get_scheduler(optimizer, cfg["training"]["lr_schedule"]) loss_fn = get_loss_function(cfg) logger.info("Using loss {}".format(loss_fn)) start_iter = 0 if cfg["training"]["resume"] is not None: if os.path.isfile(cfg["training"]["resume"]): logger.info( "Loading model and optimizer from checkpoint '{}'".format( cfg["training"]["resume"])) checkpoint = torch.load(cfg["training"]["resume"]) model.module.load_state_dict(checkpoint["model_state"]) optimizer.load_state_dict(checkpoint["optimizer_state"]) scheduler.load_state_dict(checkpoint["scheduler_state"]) start_iter = checkpoint["epoch"] logger.info("Loaded checkpoint '{}' (iter {})".format( cfg["training"]["resume"], checkpoint["epoch"])) else: logger.info("No checkpoint found at '{}'".format( cfg["training"]["resume"])) val_loss_meter = averageMeter() time_meter = averageMeter() best_iou = -100.0 i = start_iter flag = True # fig = plt.figure() # plt.rcParams['xtick.major.pad'] = '15' # fig.show() # fig.canvas.draw() while i <= cfg["training"]["train_iters"] and flag: for (images, labels) in trainloader: i += 1 start_ts = time.time() model.train() images = images.to(device) labels = labels.to(device) optimizer.zero_grad() outputs = model(images) loss = loss_fn(input=outputs, target=labels) loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 5) # plot_grad_flow(model.named_parameters(), fig) # zero mean conv for layer 1 of dsm encoder optimizer.step() scheduler.step() # m = model._modules['module'].encoderDSM._modules['0']._modules['0'] # model._modules['module'].encoderDSM._modules['0']._modules['0'].weight = m.weight - torch.mean(m.weight) model = zero_mean(model, all=False) time_meter.update(time.time() - start_ts) if (i + 1) % cfg["training"]["print_interval"] == 0: fmt_str = "Iter [{:d}/{:d}] Loss: {:.4f} Time/Image: {:.4f}" print_str = fmt_str.format( i + 1, cfg["training"]["train_iters"], loss.item(), time_meter.avg / cfg["training"]["batch_size"], ) print(print_str) logger.info(print_str) writer.add_scalar("loss/train_loss", loss.item(), i + 1) time_meter.reset() if (i + 1) % cfg["training"]["val_interval"] == 0 or ( i + 1) == cfg["training"]["train_iters"]: model.eval() with torch.no_grad(): for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)): images_val = images_val.to(device) labels_val = labels_val.to(device) outputs = model(images_val) val_loss = loss_fn(input=outputs, target=labels_val) pred = outputs.data.max(1)[1].cpu().numpy() gt = labels_val.data.cpu().numpy() # plt.imshow(v_loader.decode_segmap(gt[0,:,:])) # 
plt.imshow(v_loader.decode_segmap(pred[0, :, :])) running_metrics_val.update(gt, pred) val_loss_meter.update(val_loss.item()) writer.add_scalar("loss/val_loss", val_loss_meter.avg, i + 1) logger.info("Iter %d Loss: %.4f" % (i + 1, val_loss_meter.avg)) score, class_iou = running_metrics_val.get_scores() for k, v in score.items(): #print(k, v) logger.info("{}: {}".format(k, v)) writer.add_scalar("val_metrics/{}".format(k), v, i + 1) for k, v in class_iou.items(): logger.info("{}: {}".format(k, v)) writer.add_scalar("val_metrics/cls_{}".format(k), v, i + 1) val_loss_meter.reset() running_metrics_val.reset() if score["Mean IoU : \t"] >= best_iou: best_iou = score["Mean IoU : \t"] state = { "epoch": i + 1, "model_state": model.state_dict(), "optimizer_state": optimizer.state_dict(), "scheduler_state": scheduler.state_dict(), "best_iou": best_iou, } save_path = os.path.join( writer.file_writer.get_logdir(), "{}_{}_best_model.pkl".format(cfg["model"]["arch"], cfg["data"]["dataset"]), ) torch.save(state, save_path) if (i + 1) == cfg["training"]["train_iters"]: flag = False break
def main(): # args = parse_args() IMAGE_PATH = 'data/images/' num_classes_1 = 168 num_classes_2 = 11 num_classes_3 = 7 stats = (0.0692, 0.2051) train_df = pd.read_csv('data/train_with_folds.csv') # train_df = train_df.set_index(['image_id']) # train_df = train_df.drop(['grapheme'], axis=1) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(device) # Data Loaders # df_train, df_val = train_test_split(train_df, test_size=0.2, random_state=2021) # train_transform = get_transform(128) train_transform = A.Compose([ A.CoarseDropout(max_holes=1, max_width=64, max_height=64, p=0.9), A.ShiftScaleRotate(rotate_limit=5, p=0.9), A.Normalize(mean=stats[0], std=stats[1], always_apply=True) ]) val_transform = A.Compose( [A.Normalize(mean=stats[0], std=stats[1], always_apply=True)]) BATCH_SIZE = 50 folds = [{ 'train': [1, 2, 3, 4], 'val': [0] }, { 'train': [0, 2, 3, 4], 'val': [1] }, { 'train': [1, 0, 3, 4], 'val': [2] }, { 'train': [1, 2, 0, 4], 'val': [3] }, { 'train': [1, 2, 3, 0], 'val': [4] }] # Loop over folds for fld in range(1): fld = 4 print(f'Train fold: {fld}') train_loader = get_loader(train_df, IMAGE_PATH, folds=folds[fld]['train'], batch_size=BATCH_SIZE, workers=4, shuffle=True, transform=train_transform) val_loader = get_loader(train_df, IMAGE_PATH, folds=folds[fld]['val'], batch_size=BATCH_SIZE, workers=4, shuffle=False, transform=val_transform) # Build Model model = load_model('seresnext50_32x4d', pretrained=True) model = model.cuda() # Optimizer optimizer = get_optimizer(model, lr=.00016) # Loss criterion1 = get_criterion() # Training history = pd.DataFrame() history2 = pd.DataFrame() torch.cuda.empty_cache() gc.collect() best = 0 best2 = 1e10 n_epochs = 100 early_epoch = 0 # Scheduler scheduler = get_scheduler(optimizer, train_loader=train_loader, epochs=n_epochs) # print('Loading previous training...') # state = torch.load('model.pth') # model.load_state_dict(state['model_state']) # best = state['kaggle'] # best2 = state['loss'] # print(f'Loaded model with kaggle score: {best}, loss: {best2}') # optimizer.load_state_dict(state['opt_state']) # scheduler.load_state_dict(state['scheduler_state']) # early_epoch = state['epoch'] + 1 # print(f'Beginning at epoch {early_epoch}') # print('') for epoch in range(n_epochs - early_epoch): epoch += early_epoch torch.cuda.empty_cache() gc.collect() # ################################################################### # ############## TRAINING ########################################### # ################################################################### model.train() total_loss = 0 total_loss_1 = 0 total_loss_2 = 0 total_loss_3 = 0 # ratio = pow(.5,epoch/50) # ratio = 0.7 ratio = 1.0 t = tqdm(train_loader) for batch_idx, (img_batch, y_batch) in enumerate(t): img_batch = img_batch.cuda().float() y_batch = y_batch.cuda().long() optimizer.zero_grad() label1 = y_batch[:, 0] label2 = y_batch[:, 1] label3 = y_batch[:, 2] rand = np.random.rand() if rand < 0.5: images, targets = mixup(img_batch, label1, label2, label3, 0.4) output1, output2, output3 = model(images) l1, l2, l3 = mixup_criterion(output1, output2, output3, targets, rate=ratio) elif rand < 1: images, targets = cutmix(img_batch, label1, label2, label3, 0.4) output1, output2, output3 = model(images) l1, l2, l3 = cutmix_criterion(output1, output2, output3, targets, rate=ratio) # else: # output1,output2,output3 = model(img_batch) # l1, l2, l3 = criterion1(output1,output2,output3, y_batch) loss = l1 * .4 + l2 * .3 + l3 * .3 total_loss += loss total_loss_1 += l1 * .4 
total_loss_2 += l2 * .3 total_loss_3 += l3 * .3 t.set_description( f'Epoch {epoch+1}/{n_epochs}, LR: %6f, Ratio: %.4f, Loss: %.4f, Root loss: %.4f, Vowel loss: %.4f, Consonant loss: %.4f' % (optimizer.state_dict()['param_groups'][0]['lr'], ratio, total_loss / (batch_idx + 1), total_loss_1 / (batch_idx + 1), total_loss_2 / (batch_idx + 1), total_loss_3 / (batch_idx + 1))) # t.set_description(f'Epoch {epoch}/{n_epochs}, LR: %6f, Loss: %.4f'%(optimizer.state_dict()['param_groups'][0]['lr'],total_loss/(batch_idx+1))) if history is not None: history.loc[epoch + batch_idx / len(train_loader), 'train_loss'] = loss.data.cpu().numpy() history.loc[ epoch + batch_idx / len(train_loader), 'lr'] = optimizer.state_dict()['param_groups'][0]['lr'] loss.backward() optimizer.step() # if scheduler is not None: # scheduler.step() # ################################################################### # ############## VALIDATION ######################################### # ################################################################### model.eval() loss = 0 preds_1 = [] preds_2 = [] preds_3 = [] tars_1 = [] tars_2 = [] tars_3 = [] with torch.no_grad(): for img_batch, y_batch in val_loader: img_batch = img_batch.cuda().float() y_batch = y_batch.cuda().long() o1, o2, o3 = model(img_batch) l1, l2, l3 = criterion1(o1, o2, o3, y_batch) loss += l1 * .4 + l2 * .3 + l3 * .3 for j in range(len(o1)): preds_1.append(torch.argmax(F.softmax(o1[j]), -1)) preds_2.append(torch.argmax(F.softmax(o2[j]), -1)) preds_3.append(torch.argmax(F.softmax(o3[j]), -1)) for i in y_batch: tars_1.append(i[0].data.cpu().numpy()) tars_2.append(i[1].data.cpu().numpy()) tars_3.append(i[2].data.cpu().numpy()) preds_1 = [p.data.cpu().numpy() for p in preds_1] preds_2 = [p.data.cpu().numpy() for p in preds_2] preds_3 = [p.data.cpu().numpy() for p in preds_3] preds_1 = np.array(preds_1).T.reshape(-1) preds_2 = np.array(preds_2).T.reshape(-1) preds_3 = np.array(preds_3).T.reshape(-1) scores = [] scores.append( sklearn.metrics.recall_score(tars_1, preds_1, average='macro')) scores.append( sklearn.metrics.recall_score(tars_2, preds_2, average='macro')) scores.append( sklearn.metrics.recall_score(tars_3, preds_3, average='macro')) final_score = np.average(scores, weights=[2, 1, 1]) loss /= len(val_loader) if history2 is not None: history2.loc[epoch, 'val_loss'] = loss.cpu().numpy() history2.loc[epoch, 'acc'] = final_score history2.loc[epoch, 'root_acc'] = scores[0] history2.loc[epoch, 'vowel_acc'] = scores[1] history2.loc[epoch, 'consonant_acc'] = scores[2] if scheduler is not None: scheduler.step(final_score) print( f'Dev loss: %.4f, Kaggle: {final_score}, Root acc: {scores[0]}, Vowel acc: {scores[1]}, Consonant acc: {scores[2]}' % (loss)) if epoch > 0: history2['acc'].plot() plt.savefig(f'epoch%03d_{fld}_acc.png' % (epoch + 1)) plt.clf() if loss < best2: best2 = loss print(f'Saving best model... (loss)') torch.save( { 'epoch': epoch, 'loss': loss, 'kaggle': final_score, 'model_state': model.state_dict(), 'opt_state': optimizer.state_dict(), 'scheduler_state': scheduler.state_dict() }, f'model-1_{fld}.pth') if final_score > best: best = final_score print(f'Saving best model... (acc)') torch.save( { 'epoch': epoch, 'loss': loss, 'kaggle': final_score, 'model_state': model.state_dict(), 'opt_state': optimizer.state_dict(), 'scheduler_state': scheduler.state_dict() }, f'model_{fld}.pth')
def __init__(self, config_path, run_dir): self.config_path = coerce_to_path_and_check_exist(config_path) self.run_dir = coerce_to_path_and_create_dir(run_dir) self.logger = get_logger(self.run_dir, name="trainer") self.print_and_log_info( "Trainer initialisation: run directory is {}".format(run_dir)) shutil.copy(self.config_path, self.run_dir) self.print_and_log_info("Config {} copied to run directory".format( self.config_path)) with open(self.config_path) as fp: cfg = yaml.load(fp, Loader=yaml.FullLoader) if torch.cuda.is_available(): type_device = "cuda" nb_device = torch.cuda.device_count() # XXX: set to False when input image sizes are not fixed torch.backends.cudnn.benchmark = cfg["training"].get( "cudnn_benchmark", True) else: type_device = "cpu" nb_device = None self.device = torch.device(type_device) self.print_and_log_info("Using {} device, nb_device is {}".format( type_device, nb_device)) # Datasets and dataloaders self.dataset_kwargs = cfg["dataset"] self.dataset_name = self.dataset_kwargs.pop("name") train_dataset = get_dataset(self.dataset_name)("train", **self.dataset_kwargs) val_dataset = get_dataset(self.dataset_name)("val", **self.dataset_kwargs) self.restricted_labels = sorted( self.dataset_kwargs["restricted_labels"]) self.n_classes = len(self.restricted_labels) + 1 self.is_val_empty = len(val_dataset) == 0 self.print_and_log_info("Dataset {} instantiated with {}".format( self.dataset_name, self.dataset_kwargs)) self.print_and_log_info( "Found {} classes, {} train samples, {} val samples".format( self.n_classes, len(train_dataset), len(val_dataset))) self.batch_size = cfg["training"]["batch_size"] self.n_workers = cfg["training"]["n_workers"] self.train_loader = DataLoader(train_dataset, batch_size=self.batch_size, num_workers=self.n_workers, shuffle=True) self.val_loader = DataLoader(val_dataset, batch_size=self.batch_size, num_workers=self.n_workers) self.print_and_log_info( "Dataloaders instantiated with batch_size={} and n_workers={}". 
format(self.batch_size, self.n_workers)) self.n_batches = len(self.train_loader) self.n_iterations, self.n_epoches = cfg["training"].get( "n_iterations"), cfg["training"].get("n_epoches") assert not (self.n_iterations is not None and self.n_epoches is not None) if self.n_iterations is not None: self.n_epoches = max(self.n_iterations // self.n_batches, 1) else: self.n_iterations = self.n_epoches * len(self.train_loader) # Model self.model_kwargs = cfg["model"] self.model_name = self.model_kwargs.pop("name") model = get_model(self.model_name)(self.n_classes, **self.model_kwargs).to(self.device) self.model = torch.nn.DataParallel(model, device_ids=range( torch.cuda.device_count())) self.print_and_log_info("Using model {} with kwargs {}".format( self.model_name, self.model_kwargs)) self.print_and_log_info('Number of trainable parameters: {}'.format( f'{count_parameters(self.model):,}')) # Optimizer optimizer_params = cfg["training"]["optimizer"] or {} optimizer_name = optimizer_params.pop("name", None) self.optimizer = get_optimizer(optimizer_name)(model.parameters(), **optimizer_params) self.print_and_log_info("Using optimizer {} with kwargs {}".format( optimizer_name, optimizer_params)) # Scheduler scheduler_params = cfg["training"].get("scheduler", {}) or {} scheduler_name = scheduler_params.pop("name", None) self.scheduler_update_range = scheduler_params.pop( "update_range", "epoch") assert self.scheduler_update_range in ["epoch", "batch"] if scheduler_name == "multi_step" and isinstance( scheduler_params["milestones"][0], float): n_tot = self.n_epoches if self.scheduler_update_range == "epoch" else self.n_iterations scheduler_params["milestones"] = [ round(m * n_tot) for m in scheduler_params["milestones"] ] self.scheduler = get_scheduler(scheduler_name)(self.optimizer, **scheduler_params) self.cur_lr = -1 self.print_and_log_info("Using scheduler {} with parameters {}".format( scheduler_name, scheduler_params)) # Loss loss_name = cfg["training"]["loss"] self.criterion = get_loss(loss_name)() self.print_and_log_info("Using loss {}".format(self.criterion)) # Pretrained / Resume checkpoint_path = cfg["training"].get("pretrained") checkpoint_path_resume = cfg["training"].get("resume") assert not (checkpoint_path is not None and checkpoint_path_resume is not None) if checkpoint_path is not None: self.load_from_tag(checkpoint_path) elif checkpoint_path_resume is not None: self.load_from_tag(checkpoint_path_resume, resume=True) else: self.start_epoch, self.start_batch = 1, 1 # Train metrics train_iter_interval = cfg["training"].get( "train_stat_interval", self.n_epoches * self.n_batches // 200) self.train_stat_interval = train_iter_interval self.train_time = AverageMeter() self.train_loss = AverageMeter() self.train_metrics_path = self.run_dir / TRAIN_METRICS_FILE with open(self.train_metrics_path, mode="w") as f: f.write( "iteration\tepoch\tbatch\ttrain_loss\ttrain_time_per_img\n") # Val metrics val_iter_interval = cfg["training"].get( "val_stat_interval", self.n_epoches * self.n_batches // 100) self.val_stat_interval = val_iter_interval self.val_loss = AverageMeter() self.val_metrics = RunningMetrics(self.restricted_labels) self.val_current_score = None self.val_metrics_path = self.run_dir / VAL_METRICS_FILE with open(self.val_metrics_path, mode="w") as f: f.write("iteration\tepoch\tbatch\tval_loss\t" + "\t".join(self.val_metrics.names) + "\n")
def __init__(self, cfg, writer, logger): # super(CustomModel, self).__init__() self.cfg = cfg self.writer = writer self.class_numbers = 19 self.logger = logger cfg_model = cfg['model'] self.cfg_model = cfg_model self.best_iou = -100 self.iter = 0 self.nets = [] self.split_gpu = 0 self.default_gpu = cfg['model']['default_gpu'] self.PredNet_Dir = None self.valid_classes = cfg['training']['valid_classes'] self.G_train = True self.objective_vectors = np.zeros([19, 256]) self.objective_vectors_num = np.zeros([19]) self.objective_vectors_dis = np.zeros([19, 19]) self.class_threshold = np.zeros(self.class_numbers) self.class_threshold = np.full([19], 0.95) self.metrics = CustomMetrics(self.class_numbers) self.cls_feature_weight = cfg['training']['cls_feature_weight'] bn = cfg_model['bn'] if bn == 'sync_bn': BatchNorm = SynchronizedBatchNorm2d # elif bn == 'sync_abn': # BatchNorm = InPlaceABNSync elif bn == 'bn': BatchNorm = nn.BatchNorm2d # elif bn == 'abn': # BatchNorm = InPlaceABN elif bn == 'gn': BatchNorm = nn.GroupNorm else: raise NotImplementedError( 'batch norm choice {} is not implemented'.format(bn)) self.PredNet = DeepLab( num_classes=19, backbone=cfg_model['basenet']['version'], output_stride=16, bn=cfg_model['bn'], freeze_bn=True, ).cuda() self.load_PredNet(cfg, writer, logger, dir=None, net=self.PredNet) self.PredNet_DP = self.init_device(self.PredNet, gpu_id=self.default_gpu, whether_DP=True) self.PredNet.eval() self.PredNet_num = 0 self.BaseNet = DeepLab( num_classes=19, backbone=cfg_model['basenet']['version'], output_stride=16, bn=cfg_model['bn'], freeze_bn=False, ) logger.info('the backbone is {}'.format( cfg_model['basenet']['version'])) self.BaseNet_DP = self.init_device(self.BaseNet, gpu_id=self.default_gpu, whether_DP=True) self.nets.extend([self.BaseNet]) self.nets_DP = [self.BaseNet_DP] self.optimizers = [] self.schedulers = [] # optimizer_cls = get_optimizer(cfg) optimizer_cls = torch.optim.SGD optimizer_params = { k: v for k, v in cfg['training']['optimizer'].items() if k != 'name' } # optimizer_cls_D = torch.optim.SGD # optimizer_params_D = {k:v for k, v in cfg['training']['optimizer_D'].items() # if k != 'name'} self.BaseOpti = optimizer_cls(self.BaseNet.parameters(), **optimizer_params) self.optimizers.extend([self.BaseOpti]) self.BaseSchedule = get_scheduler(self.BaseOpti, cfg['training']['lr_schedule']) self.schedulers.extend([self.BaseSchedule]) self.setup(cfg, writer, logger) self.adv_source_label = 0 self.adv_target_label = 1 self.bceloss = nn.BCEWithLogitsLoss(size_average=True) self.loss_fn = get_loss_function(cfg) self.mseloss = nn.MSELoss() self.l1loss = nn.L1Loss() self.smoothloss = nn.SmoothL1Loss() self.triplet_loss = nn.TripletMarginLoss()
def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir
    logger.info('=' * 50)

    train_loader, val_loader, test_loader = load_data(args.fold)

    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(model, (3, config.model.input_size, config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)

        freeze_layers(model)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)
        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)

    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)
        model_arch = last_checkpoint['arch'].replace('se_', 'se')

        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint.keys():
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint['epoch'] if 'epoch' in last_checkpoint.keys() else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

    if args.lr != 0:
        set_lr(optimizer, float(args.lr))
    elif 'lr' in config.optimizer.params:
        set_lr(optimizer, config.optimizer.params.lr)
    elif 'base_lr' in config.scheduler.params:
        set_lr(optimizer, config.scheduler.params.base_lr)

    if not args.cosine:
        lr_scheduler = get_scheduler(
            config.scheduler, optimizer,
            last_epoch=(last_epoch if config.scheduler.name != 'cyclic_lr' else -1))
        assert config.scheduler2.name == ''
        lr_scheduler2 = get_scheduler(config.scheduler2, optimizer, last_epoch=last_epoch) \
            if config.scheduler2.name else None
    else:
        epoch_size = min(len(train_loader), config.train.max_steps_per_epoch) \
                     * config.train.batch_size

        set_lr(optimizer, float(config.cosine.start_lr))
        lr_scheduler = CosineLRWithRestarts(optimizer,
                                            batch_size=config.train.batch_size,
                                            epoch_size=epoch_size,
                                            restart_period=config.cosine.period,
                                            period_inc=config.cosine.period_inc,
                                            max_period=config.cosine.max_period)
        lr_scheduler2 = None

    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

        if not is_scheduler_continuous(lr_scheduler) and lr_scheduler2 is None:
            # if we have just reduced LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert last_checkpoint['arch'] == config.model.arch
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)

            last_lr = lr

        if config.train.lr_decay_coeff != 0 and epoch in config.train.lr_decay_milestones:
            n_cycles = config.train.lr_decay_milestones.index(epoch) + 1
            total_coeff = config.train.lr_decay_coeff ** n_cycles
            logger.info(f'artificial LR scheduler: made {n_cycles} cycles, '
                        f'decreasing LR by {total_coeff}')

            set_lr(optimizer, config.scheduler.params.base_lr * total_coeff)
            lr_scheduler = get_scheduler(config.scheduler, optimizer,
                                         coeff=total_coeff, last_epoch=-1)
            # (last_epoch if config.scheduler.name != 'cyclic_lr' else -1))

        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info('cosine annealing restarted, resetting the best metric')
                best_score = min(config.cosine.min_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch,
                    lr_scheduler, lr_scheduler2, config.train.max_steps_per_epoch)
        score, _, _ = validate(val_loader, model, epoch)

        if isinstance(lr_scheduler, ReduceLROnPlateau):
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        if isinstance(lr_scheduler2, ReduceLROnPlateau):
            lr_scheduler2.step(metrics=score)
        elif lr_scheduler2 and not is_scheduler_continuous(lr_scheduler2):
            lr_scheduler2.step()

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        if is_best:
            best_model_path = os.path.join(
                model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }
            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
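# is_scheduler_continuous() is referenced in the loop above but defined
# elsewhere. A minimal sketch, assuming "continuous" means the scheduler is
# stepped every batch (cyclic / cosine-with-restarts style) rather than once
# per epoch; CosineLRWithRestarts is the project-specific class used in run().
from torch.optim.lr_scheduler import (CyclicLR, CosineAnnealingLR,
                                      CosineAnnealingWarmRestarts)

def is_scheduler_continuous(scheduler) -> bool:
    batch_level_schedulers = (CyclicLR, CosineAnnealingLR,
                              CosineAnnealingWarmRestarts, CosineLRWithRestarts)
    return isinstance(scheduler, batch_level_schedulers)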
def __init__(self, cfg):
    """Construct a U-Net generator.

    Parameters in cfg:
        input_nc (int)  -- the number of channels in input images
        output_nc (int) -- the number of channels in output images
        num_downs (int) -- the number of downsamplings in the U-Net. For example,
                           if |num_downs| == 7, an image of size 128x128 becomes
                           1x1 at the bottleneck
        ngf (int)       -- the number of filters in the last conv layer
        norm_layer      -- normalization layer

    We construct the U-Net from the innermost layer to the outermost layer.
    It is a recursive process.
    """
    super(Unet, self).__init__()
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_nc = cfg['model']['input_nc']
    output_nc = cfg['model']['output_nc']
    num_downs = cfg['model']['num_downs']
    ngf = cfg['model']['ngf']
    norm_layer = (nn.BatchNorm2d if cfg['model']['norm_layer'] == 'batch'
                  else nn.InstanceNorm2d)
    use_dropout = cfg['model']['use_dropout']
    self.hook = []

    # construct the U-Net structure
    unet_block = UnetSkipConnectionBlock(
        ngf * 8, ngf * 8, input_nc=None, submodule=None,
        norm_layer=norm_layer, innermost=True)  # add the innermost layer
    for i in range(num_downs - 5):
        # add intermediate layers with ngf * 8 filters
        unet_block = UnetSkipConnectionBlock(
            ngf * 8, ngf * 8, input_nc=None, submodule=unet_block,
            norm_layer=norm_layer, use_dropout=use_dropout)
    # gradually reduce the number of filters from ngf * 8 to ngf
    unet_block = UnetSkipConnectionBlock(
        ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
    unet_block = UnetSkipConnectionBlock(
        ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
    unet_block = UnetSkipConnectionBlock(
        ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer)
    self.model = UnetSkipConnectionBlock(
        output_nc, ngf, input_nc=input_nc, submodule=unet_block,
        outermost=True)  # add the outermost layer

    # register forward hooks on every Conv2d layer
    self.inputsF = [
        Hook(layer) for layer in list(self.modules())
        if isinstance(layer, nn.Conv2d)
    ]
    self.out = None
    self.criterion = cross_entropy2d
    self.loss = None
    self.optimizer = get_optimizer(self.parameters(), cfg)
    if cfg["training"]["lr_schedule"] is not None:
        self.scheduler = get_scheduler(self.optimizer,
                                       cfg["training"]["lr_schedule"])
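# A hypothetical usage sketch for the generator above. The cfg keys mirror the
# ones read in __init__, but the concrete values and the layout expected by
# get_optimizer are placeholders, not the project's real settings.
import torch

if __name__ == '__main__':
    cfg = {
        'model': {'input_nc': 3, 'output_nc': 19, 'num_downs': 7,
                  'ngf': 64, 'norm_layer': 'batch', 'use_dropout': False},
        'training': {'optimizer': {'name': 'sgd', 'lr': 1e-3},
                     'lr_schedule': None},
    }
    net = Unet(cfg)
    dummy = torch.randn(1, cfg['model']['input_nc'], 128, 128)
    out = net.model(dummy)   # forward through the outermost skip-connection block
    print(out.shape)         # expected: torch.Size([1, 19, 128, 128])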
def run(config_file, device_id, idx_fold):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
    print('info: use gpu No.{}'.format(device_id))

    config = load_config(config_file)

    # for the n-fold loop
    if config.data.params.idx_fold == -1:
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir + '_fold{}'.format(idx_fold)
    elif config.data.params.idx_fold == 0:
        original_fold = int(config.work_dir.split('_fold')[1])
        if original_fold == idx_fold:
            raise Exception(
                'if you specify fold 0, you should use train.py or resume from fold 1.')
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir.split('_fold')[0] + '_fold{}'.format(idx_fold)
    else:
        raise Exception('you should use train.py if idx_fold is specified.')
    print('info: training for fold {}'.format(idx_fold))

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            df_path=config.data.train_df_path,
            data_dir=config.data.train_dir,
            features=config.data.features,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            horizontal_flip=config.train.horizontal_flip,
            model_scale=config.data.model_scale,
            debug=config.debug,
            pseudo_path=config.data.pseudo_path,
        )
        for phase in ['train', 'valid']
    }

    # create the model with a pre-trained encoder
    num_features = len(config.data.features)
    print('info: num_features =', num_features)
    model = CenterNetFPN(
        slug=config.model.encoder,
        num_classes=num_features,
    )

    optimizer = get_optimizer(model, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    # train setting
    criterion, callbacks = get_criterion_and_callback(config)

    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [OptimizerCallback(accumulation_steps=accumulation_steps)])

    # resume from a checkpoint if one exists
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir + '/checkpoints/last_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=config.train.fp16,
    )
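# A hypothetical command-line wrapper for the fold-wise runner above. The
# argument names (--config, --device, --start-fold, --n-folds) are assumptions,
# not part of the original script, and the fold-handling logic inside run()
# constrains which fold values are valid.
import argparse

def main():
    parser = argparse.ArgumentParser(description='train CenterNetFPN over n folds')
    parser.add_argument('--config', required=True)
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--start-fold', type=int, default=1)
    parser.add_argument('--n-folds', type=int, default=5)
    args = parser.parse_args()

    # train one fold after another on the selected GPU
    for fold in range(args.start_fold, args.n_folds):
        run(args.config, args.device, fold)

if __name__ == '__main__':
    main()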
def run(config_file):
    config = load_config(config_file)

    # set up the working directory for the Kaggle or Colab environment
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    # save the configuration to the working dir
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    # enter the GPUs you have
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    # our dataset has an explicit validation folder; use that later
    all_transforms['valid'] = get_transforms(config.transforms.test)

    print('input size:', config.data.height, config.data.width)

    # fetch the dataloaders we need
    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           img_size=(config.data.height, config.data.width),
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # create the segmentation model with a pre-trained encoder.
    # relevant smp constructor parameters:
    #   encoder_name: str = "resnet34"
    #   encoder_depth: int = 5
    #   encoder_weights: str = "imagenet"
    #   decoder_use_batchnorm: bool = True
    #   decoder_channels: List[int] = (256, 128, 64, 32, 16)
    #   decoder_attention_type: Optional[str] = None
    #   in_channels: int = 3
    #   classes: int = 1
    #   activation: Optional[Union[str, callable]] = None
    #   aux_params: Optional[dict] = None
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # fetch the loss
    criterion = get_loss(config)

    # separate learning rates for the decoder and the pre-trained encoder
    params = [
        {'params': model.decoder.parameters(), 'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(), 'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # catalyst SupervisedRunner:
    # https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    #   model (Model): Torch model object
    #   device (Device): Torch device
    #   input_key (str): key in the batch dict mapping for model input
    #   output_key (str): key under which the model output is stored in the output dict
    #   input_target_key (str): key in the batch dict mapping for the target
    runner = SupervisedRunner(model=model, device=get_device())

    # track Dice and IoU during training
    callbacks = [DiceCallback(), IouCallback()]

    # early stopping
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # gradient accumulation: zero_grad is called only every accumulation_steps batches
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend([
            CriterionCallback(),
            OptimizerCallback(accumulation_steps=accumulation_steps)
        ])

    # resume from a checkpoint if one exists
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir + '/checkpoints/last_full.pth'))

    # mixup (https://arxiv.org/pdf/1710.09412.pdf) and cutmix (a weighted
    # combination of cutout and mixup) are added only when enabled in the config
    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # training loop:
    # https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    # (fp16 mixed precision is disabled here)
    print(config.work_dir)
    print(config.train.minimize_metric)
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )
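# get_optimizer() above receives the two parameter groups (decoder at a higher
# LR, encoder at a lower LR). A minimal sketch of what such a factory might do,
# assuming config.optimizer.name matches a torch.optim class and
# config.optimizer.params holds its keyword arguments; the project's real
# helper may differ.
import torch

def get_optimizer(params, config):
    opt_cls = getattr(torch.optim, config.optimizer.name)   # e.g. 'Adam', 'SGD'
    # the per-group learning rates were already attached to the param groups above
    opt_kwargs = {k: v for k, v in config.optimizer.params.items()
                  if k not in ('encoder_lr', 'decoder_lr')}
    return opt_cls(params, **opt_kwargs)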
def run(config_file):
    config = load_config(config_file)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # create the segmentation model with a pre-trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(), 'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(), 'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)
    callbacks = [DiceCallback(), IouCallback()]

    # resume from a checkpoint if one exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir + '/checkpoints/best_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )
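# An illustrative config for run() above, written out as YAML. The keys mirror
# the attributes the function reads; the values are placeholders, and the loss
# and scheduler sections are omitted because get_loss()/get_scheduler() read
# keys that are not visible here.
import yaml

example_config = {
    'work_dir': './runs/seg_baseline',
    'num_workers': 4,
    'debug': False,
    'data': {
        'train_dir': './input/train_images',
        'train_df_path': './input/train.csv',
        'num_classes': 4,
        'params': {'idx_fold': 0},
    },
    'transforms': {
        'train': [{'name': 'HorizontalFlip', 'params': {'p': 0.5}}],
        'test': [],
    },
    'model': {'arch': 'Unet', 'encoder': 'resnet34', 'pretrained': 'imagenet'},
    'optimizer': {'params': {'encoder_lr': 1e-4, 'decoder_lr': 1e-3}},
    'train': {
        'batch_size': 16,
        'num_epochs': 30,
        'pseudo_label_path': None,
        'mixup': False,
        'cutmix': False,
    },
}

with open('config.yml', 'w') as f:
    yaml.safe_dump(example_config, f)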