def main():
    INPUT_DIM = 20
    OUTPUT_DIM = 2
    BATCH_SIZE = 10
    EPOCH = 60

    criterion = nn.CrossEntropyLoss(reduction='sum').to(DEVICE)
    channelFilter = ChannelFilter(INPUT_DIM, OUTPUT_DIM).to(DEVICE)
    optim = Adam(channelFilter.parameters(), lr=1e-3, betas=(0.5, 0.99))

    dataloader = get_dataloader(dataset_path='S1', train=True, batch_size=BATCH_SIZE, full=True)
    valid_loader = get_dataloader(dataset_path='S1', train=False, batch_size=5000, shuffle=False)

    for i in range(EPOCH):
        # training
        total_loss = 0
        for index, (data, label) in enumerate(dataloader):
            channelFilter.train()
            data, label = data.to(DEVICE), label.to(DEVICE)
            logit = channelFilter(data)
            loss = criterion(logit, label) / BATCH_SIZE
            optim.zero_grad()
            loss.backward()
            optim.step()
            total_loss += loss.item()

        # validation
        channelFilter.eval()
        with torch.no_grad():
            valid_data, valid_labels = valid_loader.dataset.data, valid_loader.dataset.label
            valid_data = torch.tensor(valid_data, dtype=torch.float32).to(DEVICE)
            valid_labels = torch.tensor(valid_labels).to(DEVICE)
            t_logit = channelFilter(valid_data)
            t_loss = criterion(t_logit, valid_labels)
            valid_pred = torch.argmax(t_logit, dim=1).data.cpu().numpy()
            valid_labels = valid_labels.data.cpu().numpy()
            acc, p, r, f1 = accuracy_score(valid_labels, valid_pred), precision_score(valid_labels, valid_pred), \
                recall_score(valid_labels, valid_pred), f1_score(valid_labels, valid_pred)
            print('iter: {}, train_loss: {:.3f}, valid_loss: {:.3f}, acc: {:.3f}, p: {:.3f}, r: {:.3f}, f1: {:.3f}'
                  .format(i, total_loss / len(dataloader), t_loss, acc, p, r, f1))

    torch.save(channelFilter.state_dict(), './channelFilter.pk')
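# `ChannelFilter` is not defined in this snippet. A minimal sketch, assuming it is a
# small fully connected classifier mapping INPUT_DIM features to OUTPUT_DIM logits;
# the exact architecture (and the `hidden_dim` parameter) is an assumption, not the
# original model:
import torch
import torch.nn as nn

class ChannelFilter(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        # returns raw logits; CrossEntropyLoss applies log-softmax internally
        return self.net(x)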
def __init__(self, config):
    self.config = config
    self.mode = self.config.mode
    self.model = get_model(self.config)
    self.model_name = self.config.model_name
    self.load_checkpoint()
    self.use_val = self.config.use_val
    self.lr = self.config.learning_rate
    self.epochs = self.config.epochs
    self.start_epoch = self.config.start_epoch
    self.batch_size = self.config.batch_size
    self.checkpoint_path = self.config.checkpoint_path
    self.pose_mode = self.config.is_load_pose

    if self.mode == 'train':
        if self.use_val:
            self.train_loader, self.val_loader = get_dataloader(self.config)
            self.n_batch_train = len(self.train_loader)
            self.n_batch_val = len(self.val_loader)
        else:
            self.train_loader = get_dataloader(self.config)
            self.n_batch_train = len(self.train_loader)
    elif self.mode == 'test':
        self.test_loader = get_dataloader(self.config)
        self.n_batch_test = len(self.test_loader)

    self.criterion = torch.nn.L1Loss()
    # self.criterion = CosineLoss()
    optimizer_cls = getattr(importlib.import_module('torch.optim'), self.config.optimizer)
    self.optimizer = optimizer_cls(self.model.parameters(), lr=self.lr)
    # self.scheduler = Lookahead(self.optimizer, k=5, alpha=0.5)
    # NOTE: the original passed `step_size=self`, which is a bug; StepLR expects an
    # integer. `self.config.step_size` is an assumed config key.
    self.scheduler = StepLR(self.optimizer, step_size=self.config.step_size)
    self.best_ae = float('inf')
    self.best_loss = float('inf')

    if self.config.wandb:
        print("Using wandb to log results")
        name = self.config.prefix + "_" + self.model_name + "_" + self.config.solver
        wandb.init(project='gaze-estimation', entity=self.config.wandb_entity, name=name)
        wandb.config.update(config)

    # use accelerator to run on different devices easily
    self.model, self.optimizer, self.train_loader = accelerator.prepare(
        self.model, self.optimizer, self.train_loader)
def __init__(self, args, **kwargs):
    super(DecTrainer, self).__init__(args, **kwargs)

    # dataloader
    self.trainloader = get_dataloader(args, cfg, 'train')
    # self.trainloader_val = get_dataloader(args, cfg, 'train_voc')
    self.valloader = get_dataloader(args, cfg, 'val')
    self.denorm = self.trainloader.dataset.denorm

    self.use_triplet = args.use_triplet
    self.loss_3d = args.loss_3d
    self.normalize_feature = args.normalize_feature

    self.nclass = get_num_classes(args)
    self.classNames = get_class_names(args)
    assert self.nclass == len(self.classNames) - 1

    self.classIndex = {}
    for i, cname in enumerate(self.classNames):
        self.classIndex[cname] = i

    # model
    self.enc = get_model(cfg.NET, num_classes=self.nclass)
    self.criterion_cls = get_criterion(cfg.NET.LOSS)

    # optimizer using different LR
    enc_params = self.enc.parameter_groups(cfg.NET.LR, cfg.NET.WEIGHT_DECAY)
    self.optim_enc = self.get_optim(enc_params, cfg.NET)

    # checkpoint management
    self._define_checkpoint('enc', self.enc, self.optim_enc)
    self._load_checkpoint(args.resume)

    self.fixed_batch = None
    self.fixed_batch_path = args.fixed_batch_path
    if os.path.isfile(self.fixed_batch_path):
        print("Loading fixed batch from {}".format(self.fixed_batch_path))
        self.fixed_batch = torch.load(self.fixed_batch_path)

    # using cuda
    if cfg.NUM_GPUS != 0:
        self.enc = nn.DataParallel(self.enc)
        self.criterion_cls = nn.DataParallel(self.criterion_cls)
        self.enc = self.enc.cuda()
        self.criterion_cls = self.criterion_cls.cuda()

    # CHANGE: visual
    self.visual_times = 0
    self.dataset = args.dataset.lower()
def run(config, num_checkpoint, epoch_end, output_filename):
    task = get_task(config)
    preprocess_opt = task.get_preprocess_opt()
    dataloader = get_dataloader(config, 'train',
                                get_transform(config, 'dev', **preprocess_opt))

    model = task.get_model()
    checkpoints = get_checkpoints(config, num_checkpoint, epoch_end)
    print('checkpoints:')
    print('\n'.join(checkpoints))

    utils.checkpoint.load_checkpoint(model, None, checkpoints[0])
    for i, checkpoint in enumerate(checkpoints[1:]):
        model2 = get_task(config).get_model()
        last_epoch, _ = utils.checkpoint.load_checkpoint(model2, None, checkpoint)
        swa.moving_average(model, model2, 1. / (i + 2))

    with torch.no_grad():
        swa.bn_update(dataloader, model)

    output_name = '{}.{}.{:03d}'.format(output_filename, num_checkpoint, last_epoch)
    print('save {}'.format(output_name))
    utils.checkpoint.save_checkpoint(config, model, None, 0, 0, name=output_name,
                                     weights_dict={'state_dict': model.state_dict()})
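# The project-local `swa` helpers used above are not shown. A minimal sketch of what
# stochastic weight averaging (SWA) helpers of this shape typically do; the originals
# may differ in detail:
import torch

def moving_average(model, model2, alpha):
    # blend model2's weights into model: w = (1 - alpha) * w + alpha * w2;
    # calling this with alpha = 1/(i+2) over a checkpoint list yields a running mean
    for p1, p2 in zip(model.parameters(), model2.parameters()):
        p1.data.mul_(1.0 - alpha).add_(p2.data, alpha=alpha)

def bn_update(dataloader, model):
    # recompute BatchNorm running statistics under the averaged weights by
    # forwarding the training data once in train mode
    model.train()
    for batch in dataloader:
        images = batch[0] if isinstance(batch, (list, tuple)) else batch
        model(images.cuda() if torch.cuda.is_available() else images)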
def train(self, train_dataset, test_data, output_dir):
    tracker = LossTracker(output_dir)
    global_steps = 0
    for res_idx, res in enumerate(self.cfg['resolutions']):
        self.set_optimizers_lr(self.cfg['learning_rates'][res_idx])
        batchs_in_phase = self.cfg['phase_lengths'][res_idx] // self.cfg['batch_sizes'][res_idx]
        dataloader = EndlessDataloader(get_dataloader(train_dataset,
                                                      self.cfg['batch_sizes'][res_idx],
                                                      resize=res, device=self.device))
        progress_bar = tqdm(range(batchs_in_phase * 2))
        for i in progress_bar:
            alpha = min(1.0, i / batchs_in_phase)  # < 1 in the first half and 1 in the second
            progress_bar.set_description(f"gs-{global_steps}_res-{res_idx}={res}x{res}_alpha-{alpha:.3f}")
            batch_real_data = dataloader.next()

            # train discriminator
            self.D_optimizer.zero_grad()
            loss_d = self.get_D_loss(batch_real_data, res_idx, alpha)
            loss_d.backward()
            self.D_optimizer.step()
            tracker.update(dict(loss_d=loss_d))

            if (1 + i) % self.cfg['n_critic'] == 0:
                # train generator
                self.G_optimizer.zero_grad()
                loss_g = self.get_G_loss(batch_real_data, res_idx, alpha)
                loss_g.backward()
                self.G_optimizer.step()
                tracker.update(dict(loss_g=loss_g))

            global_steps += 1
            if global_steps % self.cfg['dump_imgs_freq'] == 0:
                self.save_sample(global_steps, tracker, test_data, output_dir, res_idx, alpha)

        self.save_train_state(os.path.join(output_dir, 'checkpoints',
                                           f"ckpt_res-{res_idx}={res}x{res}-end.pt"))
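# `EndlessDataloader` is referenced above but not defined in these snippets. A minimal
# sketch, assuming it simply cycles a finite DataLoader forever behind a `next()` method:
class EndlessDataloader:
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.iterator = iter(dataloader)

    def next(self):
        # restart the underlying iterator once it is exhausted
        try:
            return next(self.iterator)
        except StopIteration:
            self.iterator = iter(self.dataloader)
            return next(self.iterator)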
def __init__(self, args):
    self.args = args
    self.device = args.device
    self.start_iter = 1
    self.train_iters = args.train_iters

    # coeffs
    self.lambda_A = args.lambda_A
    self.lambda_B = args.lambda_B
    self.lambda_idt = args.lambda_idt

    self.dataloader_A, self.dataloader_B = get_dataloader(args)

    self.D_B, self.G_AB = get_model(args)
    self.D_A, self.G_BA = get_model(args)

    self.criterion_GAN = GANLoss(use_lsgan=args.use_lsgan).to(args.device)
    self.criterion_cycle = nn.L1Loss()
    self.criterion_idt = nn.L1Loss()

    self.optimizer_D = torch.optim.Adam(
        itertools.chain(self.D_B.parameters(), self.D_A.parameters()),
        lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay)
    self.optimizer_G = torch.optim.Adam(
        itertools.chain(self.G_AB.parameters(), self.G_BA.parameters()),
        lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay)

    self.logger = self.get_logger(args)
    self.writer = SummaryWriter(args.log_dir)
    save_args(args.log_dir, args)
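# `GANLoss(use_lsgan=...)` follows the CycleGAN convention: MSE loss for LSGAN,
# BCE-with-logits otherwise, against an all-real or all-fake target. A sketch of that
# idea; the original implementation may differ in detail:
import torch
import torch.nn as nn

class GANLoss(nn.Module):
    def __init__(self, use_lsgan=True):
        super().__init__()
        self.loss = nn.MSELoss() if use_lsgan else nn.BCEWithLogitsLoss()

    def forward(self, prediction, target_is_real):
        # build a target tensor of ones (real) or zeros (fake) shaped like the prediction
        target = torch.ones_like(prediction) if target_is_real else torch.zeros_like(prediction)
        return self.loss(prediction, target)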
def run(config):
    train_dir = config.train.dir

    task = get_task(config)
    optimizer = get_optimizer(config, task.get_model().parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(task.get_model(), optimizer, checkpoint)
    else:
        last_epoch, step = -1, -1
    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))

    scheduler = get_scheduler(config, optimizer, last_epoch)

    preprocess_opt = task.get_preprocess_opt()
    dataloaders = {split: get_dataloader(config, split,
                                         get_transform(config, split, **preprocess_opt))
                   for split in ['train', 'dev']}

    writer = SummaryWriter(config.train.dir)
    train(config, task, dataloaders, optimizer, scheduler, writer, last_epoch + 1)
def run(config):
    train_dir = config.train.dir

    model = get_model(config)
    if torch.cuda.is_available():
        model = model.cuda()
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model.parameters())

    checkpoint = utils.checkpoint.get_initial_checkpoint(config)
    if checkpoint is not None:
        last_epoch, step = utils.checkpoint.load_checkpoint(model, optimizer, checkpoint)
    else:
        last_epoch, step = -1, -1
    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))

    scheduler = get_scheduler(config, optimizer, last_epoch)

    dataloaders = {split: get_dataloader(config, split, get_transform(config, split))
                   for split in ['train', 'val']}

    writer = SummaryWriter(config.train.dir)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer, last_epoch + 1)
def inference(config, model, split, src_file, output_path=None):
    if split == 'test':
        data_path = '../data/dicom-images-test'
    else:
        data_path = '../data/dicom-images-train'
    dataset = CustomTestDataset(data_path, src_file, split, config[INPUT_SIZE])
    dataloader = get_dataloader(dataset, 1)

    model = model.cuda()
    model.eval()
    model = TTAWrapper(model, fliplr_image2mask)

    with torch.no_grad():
        total_step = len(dataloader)
        for i, (images, id) in tqdm.tqdm(enumerate(dataloader), total=total_step):
            images = torch.cat(images, dim=0)
            images = images.cuda()
            merged_out = model(images)
            mean_logits = torch.mean(merged_out, dim=0, keepdim=True)
            np.save(os.path.join(output_path, id[0] + '.npy'), mean_logits.cpu().numpy())
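# `TTAWrapper(model, fliplr_image2mask)` is a test-time-augmentation wrapper (the names
# match pytorch-toolbelt's flip TTA for segmentation). A minimal sketch of the idea,
# not necessarily the library's exact implementation: average the plain prediction with
# the flipped-back prediction of a horizontally flipped input.
import torch
import torch.nn as nn

class TTAWrapper(nn.Module):
    def __init__(self, model, tta_fn):
        super().__init__()
        self.model = model
        self.tta_fn = tta_fn

    def forward(self, x):
        return self.tta_fn(self.model, x)

def fliplr_image2mask(model, image):
    # masks predicted from flipped inputs must be flipped back before averaging
    output = model(image)
    flipped = model(torch.flip(image, dims=[-1]))
    return 0.5 * (output + torch.flip(flipped, dims=[-1]))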
def run(config):
    model = get_model(config[MODEL_NAME], config[MODEL_PARAMS]).cuda()
    criterion = get_loss(config[LOSS_NAME], config[LOSS_PARAMS])
    optimizer = get_optimizer(config[OPTIM_NAME], model.parameters(),
                              optimizer_params=config[OPTIM_PARAMS])
    last_epoch = -1
    scheduler = get_scheduler(config[SCHEDULER_NAME], optimizer, last_epoch,
                              config[SCHEDULER_PARAMS])

    datasets = {stage: CustomDataset(DATA_DIR, stage, config[FOLD_ID],
                                     config[DATA_PREFIX], config[INPUT_SIZE])
                for stage in ['train', 'test']}
    dataloaders = {stage: get_dataloader(datasets[stage], config[BATCH_SIZE])
                   for stage in ['train', 'test']}

    writer = SummaryWriter(config[TRAIN_DIR])
    # NOTE: calling clip_grad_value_ once here only clips gradients that already exist;
    # to have an effect it would need to run after each backward() inside train()
    clip_grad_value_(model.parameters(), 2.0)
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer, last_epoch + 1)
def main():
    args = arguments()

    num_templates = 25  # aka the number of clusters

    train_loader, weights_dir = get_dataloader(args.traindata, args, num_templates)

    model = DetectionModel(num_objects=1, num_templates=num_templates)
    loss_fn = DetectionCriterion(num_templates)

    optimizer = optim.SGD(model.learnable_parameters(args.lr), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)
    # optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    if args.resume:
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # Set the start epoch if it has not been set
        if not args.start_epoch:
            args.start_epoch = checkpoint['epoch']

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20,
                                          last_epoch=args.start_epoch - 1)

    # train and evaluate for `epochs`
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step()
        trainer.train(model, loss_fn, optimizer, train_loader, epoch, save_path=weights_dir)
def run(config, folds_dir, balanced):
    model = get_model(config[MODEL_NAME], config[MODEL_PARAMS]).cuda()
    criterion = get_loss(config[LOSS_NAME], config[LOSS_PARAMS])
    optimizer = get_optimizer(config[OPTIM_NAME], model.parameters(),
                              optimizer_params=config[OPTIM_PARAMS])
    last_epoch = -1
    scheduler = get_scheduler(config[SCHEDULER_NAME], optimizer, last_epoch,
                              config[SCHEDULER_PARAMS])

    datasets = {stage: CustomDataset(folds_dir, stage, config[FOLD_ID],
                                     config[DATA_PREFIX], config[INPUT_SIZE])
                for stage in ['train', 'test']}

    print('Loading sampler')
    if balanced:
        train_sampler = BalancedBatchSampler(datasets['train'])
    else:
        train_sampler = None
    print('Sampler loaded')

    # NOTE: the original passed train_sampler to both splits; a sampler built from the
    # train set should not be applied to the test loader
    dataloaders = {stage: get_dataloader(datasets[stage], config[BATCH_SIZE],
                                         train_sampler if stage == 'train' else None)
                   for stage in ['train', 'test']}

    writer = SummaryWriter(config[TRAIN_DIR])
    train(config, model, dataloaders, criterion, optimizer, scheduler, writer, last_epoch + 1)
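# `BalancedBatchSampler` is not shown. A minimal sketch of a class-balanced sampler
# built on torch's WeightedRandomSampler, assuming the dataset exposes integer class
# labels via a `labels` attribute (that attribute name is an assumption):
import numpy as np
from torch.utils.data import WeightedRandomSampler

def BalancedBatchSampler(dataset):
    labels = np.asarray(dataset.labels)
    class_counts = np.bincount(labels)
    # each sample is weighted inversely to its class frequency
    weights = 1.0 / class_counts[labels]
    return WeightedRandomSampler(weights.tolist(), num_samples=len(labels), replacement=True)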
def main():
    args = parse_args()

    # set random seed
    utils.seed_torch(args.seed)

    # Setup CUDA, GPU
    if not torch.cuda.is_available():
        print("cuda is not available")
        exit(0)

    train_loader, valid_loader = datasets.get_dataloader(
        fold=args.fold,
        batch_size=args.batch_size,
        num_workers=args.num_workers)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()

    model = PandaNet(arch=args.arch)
    model.to("cuda")

    metric = ArcMarginProduct(in_features=512, out_features=6).to("cuda")

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[10, 30, 60, 90],
                                                     gamma=0.5)

    # Train the model
    from datetime import datetime

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    log_prefix = f'{current_time}_{args.arch}_fold_{args.fold}_{args.tile_size}_{args.num_tiles}'
    log_dir = os.path.join(configure.TRAINING_LOG_PATH, log_prefix)

    tb_writer = None
    if args.log:
        tb_writer = SummaryWriter(log_dir=log_dir)

    best_score = 0.0
    model_path = os.path.join(
        configure.MODEL_PATH,
        f'{args.arch}_fold_{args.fold}_{args.tile_size}_{args.num_tiles}.pth')

    print(f'training started: {current_time}')
    for epoch in range(args.epochs):
        train_loss = train(
            dataloader=train_loader,
            model=model,
            criterion=criterion,
            metric=metric,
            optimizer=optimizer)

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    print(f'training finished: {current_time}')
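# `utils.seed_torch` is not shown. A common implementation of such a seeding helper;
# a sketch, the project's version may set more or fewer sources of randomness:
import os
import random
import numpy as np
import torch

def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # trade speed for reproducibility in cuDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False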
def inference_single_tta(config, task, preprocess_opt, split, fold, flip, align, ret_dict):
    config.transform.params.align = align
    transform = 'test' if split == 'test' else 'all'
    config.data.params.landmark_ver = fold
    dataloader = get_dataloader(config, split,
                                get_transform(config, transform, flip=flip, **preprocess_opt))
    id_dict = inference(config, task, dataloader, ret_dict)
    return id_dict
def main():
    args = arguments()

    num_templates = 25  # aka the number of clusters

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    img_transforms = transforms.Compose([transforms.ToTensor(), normalize])

    train_loader, _ = get_dataloader(args.traindata, args, num_templates,
                                     img_transforms=img_transforms)

    model = DetectionModel(num_objects=1, num_templates=num_templates)
    loss_fn = DetectionCriterion(num_templates)

    # directory where we'll store model weights
    weights_dir = "weights"
    if not osp.exists(weights_dir):
        os.mkdir(weights_dir)

    # check for CUDA
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    optimizer = optim.SGD(model.learnable_parameters(args.lr), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)
    # optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    if args.resume:
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # Set the start epoch if it has not been set
        if not args.start_epoch:
            args.start_epoch = checkpoint['epoch']

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20,
                                          last_epoch=args.start_epoch - 1)

    # train and evaluate for `epochs`
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step()
        trainer.train(model, loss_fn, optimizer, train_loader, epoch,
                      save_path=weights_dir, device=device)
def dataloader(args):
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    val_transforms = transforms.Compose([transforms.ToTensor(), normalize])
    val_loader, templates = get_dataloader(args.dataset, args, train=False,
                                           split=args.split, img_transforms=val_transforms)
    return val_loader, templates
def search_once(config, policy):
    model = get_model(config).cuda()
    criterion = get_loss(config)
    optimizer = get_optimizer(config, model.parameters())
    scheduler = get_scheduler(config, optimizer, -1)

    transforms = {'train': get_transform(config, 'train', params={'policies': policy}),
                  'val': get_transform(config, 'val')}
    dataloaders = {split: get_dataloader(config, split, transforms[split])
                   for split in ['train', 'val']}

    score_dict = train(config, model, dataloaders, criterion, optimizer, scheduler, None, 0)
    return score_dict['f1_mavg']
def __init__(self, args, **kwargs):
    super(DecTrainer, self).__init__(args, **kwargs)

    # dataloader
    self.trainloader = get_dataloader(args, cfg, 'train')
    self.trainloader_val = get_dataloader(args, cfg, 'train_voc')
    self.valloader = get_dataloader(args, cfg, 'val')
    self.denorm = self.trainloader.dataset.denorm

    self.nclass = get_num_classes(args)
    self.classNames = get_class_names(args)[:-1]
    assert self.nclass == len(self.classNames)

    self.classIndex = {}
    for i, cname in enumerate(self.classNames):
        self.classIndex[cname] = i

    # model
    self.enc = get_model(cfg.GENERATOR, num_classes=self.nclass)
    self.criterion_cls = get_criterion(cfg.GENERATOR.LOSS)
    print(self.enc)

    # optimizer using different LR
    enc_params = self.enc.parameter_groups(cfg.GENERATOR.LR, cfg.GENERATOR.WEIGHT_DECAY)
    self.optim_enc = self.get_optim(enc_params, cfg.GENERATOR)

    # checkpoint management
    self._define_checkpoint('enc', self.enc, self.optim_enc)
    self._load_checkpoint(args.resume)

    self.fixed_batch = None
    self.fixed_batch_path = args.fixed_batch_path
    if os.path.isfile(self.fixed_batch_path):
        print("Loading fixed batch from {}".format(self.fixed_batch_path))
        self.fixed_batch = torch.load(self.fixed_batch_path)

    # using cuda
    self.enc = nn.DataParallel(self.enc).cuda()
    self.criterion_cls = nn.DataParallel(self.criterion_cls).cuda()
def run(config, split, checkpoint_name, output_path):
    train_dir = config.train.dir
    task = get_task(config)

    checkpoint = utils.checkpoint.get_checkpoint(config, checkpoint_name)
    last_epoch, step = utils.checkpoint.load_checkpoint(task.get_model(), None, checkpoint)
    print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))

    preprocess_opt = task.get_preprocess_opt()
    dataloader = get_dataloader(config, split,
                                get_transform(config, split, **preprocess_opt))

    df = inference(config, task, dataloader)
    df.to_csv(output_path, index=False)
def train(self, train_dataset, test_data, test_dataset, output_dir):
    tracker = LossTracker(output_dir)
    while self.res_idx < len(self.cfg['resolutions']):
        res = self.cfg['resolutions'][self.res_idx]
        self.set_optimizers_lr(self.cfg['learning_rates'][self.res_idx])
        batch_size = self.cfg['batch_sizes'][self.res_idx]
        batchs_in_phase = self.cfg['phase_lengths'][self.res_idx] // batch_size
        dataloader = EndlessDataloader(
            get_dataloader(train_dataset, batch_size, resize=res, device=self.device))
        progress_bar = tqdm(range(batchs_in_phase * 2))
        for i in progress_bar:
            # the first half of the batches is the fade-in phase, where alpha < 1;
            # in the second half, alpha = 1
            alpha = min(1.0, i / batchs_in_phase)
            batch_real_data = dataloader.next()
            self.perform_train_step(batch_real_data, tracker,
                                    log=(i % 10 == 0),
                                    calc_scores=(i % 100 == 0),
                                    valid_ds=test_dataset,
                                    final_resolution_idx=self.res_idx,
                                    alpha=alpha)
            self.train_step += 1

            progress_tag = f"gs-{self.train_step}_res-{self.res_idx}={res}x{res}_alpha-{alpha:.2f}"
            progress_bar.set_description(progress_tag)

            if self.train_step % self.cfg['dump_imgs_freq'] == 0:
                tracker.plot()
                dump_path = os.path.join(output_dir, 'images', f"{progress_tag}.jpg")
                self.save_sample(dump_path, test_data[0], test_data[1],
                                 final_resolution_idx=self.res_idx, alpha=alpha)

            if self.train_step % self.cfg['checkpoint_freq'] == 0:
                self.save_train_state(
                    os.path.join(output_dir, 'checkpoints', f"ckpt_{progress_tag}.pt"))

        self.res_idx += 1

    self.save_train_state(os.path.join(output_dir, 'checkpoints', "ckpt_final.pt"))
def __init__(self, output_dir):
    # make dirs for all kinds of output
    self.model_dir = os.path.join(output_dir, 'Model')
    os.makedirs(self.model_dir)
    self.image_dir = os.path.join(output_dir, 'Image')
    os.makedirs(self.image_dir)

    # make dataloader
    self.dataloader = get_dataloader()

    # other variables
    self.batch_size = cfg.TRAIN.BATCH_SIZE

    # get fixed images used for comparison for each epoch
    self.fixed_image = self.prepare_data(next(iter(self.dataloader)))[0]
    save_img_results(self.fixed_image.cpu(), None, -1, self.image_dir)
def train(self, train_dataset, test_data, output_dir):
    train_dataloader = get_dataloader(train_dataset, self.cfg['batch_size'],
                                      resize=None, device=self.device)
    tracker = LossTracker(output_dir)
    self.set_optimizers_lr(self.cfg['lr'])
    for epoch in range(self.cfg['epochs']):
        for batch_real_data in tqdm(train_dataloader):
            self.perform_train_step(batch_real_data, tracker)

        tracker.plot()
        dump_path = os.path.join(output_dir, 'images', f"epoch-{epoch}.jpg")
        self.save_sample(dump_path, test_data[0], test_data[1])
        self.save_train_state(os.path.join(output_dir, "last_ckp.pth"))
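# `LossTracker`, used by the GAN training loops above, is not defined in these
# snippets. A minimal sketch, assuming it accumulates named scalar losses and plots
# them; only the `update`/`plot` method names are taken from the call sites, the rest
# is assumed:
import os
from collections import defaultdict
import matplotlib
matplotlib.use('Agg')  # headless plotting
import matplotlib.pyplot as plt

class LossTracker:
    def __init__(self, output_dir):
        self.output_dir = output_dir
        self.history = defaultdict(list)

    def update(self, losses):
        # accept tensors or plain floats
        for name, value in losses.items():
            self.history[name].append(float(value))

    def plot(self):
        for name, values in self.history.items():
            plt.plot(values, label=name)
        plt.legend()
        plt.savefig(os.path.join(self.output_dir, 'losses.png'))
        plt.close()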
def train_shadow_model(dataset, model_type, args, epochs, lr, hidden_attribute,
                       class_distribution, device, size=2000, filename='test'):
    dataloader = datasets.get_dataloader(dataset, hidden_attribute, size, class_distribution)
    net = utils.get_model(model_type).to(device)
    tic = time.time()

    criterion = nn.MSELoss(reduction='sum')
    optimizer = optim.Adam(net.parameters(), lr=lr)

    losses = []
    for epoch in range(epochs):
        running_loss = 0
        for i, data in enumerate(dataloader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = net(inputs).to(device)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        losses.append(running_loss / (i + 1))
        tac = time.time()
        print(f'[{epoch+1}] loss: {running_loss/(i+1):.3f} - {int(tac-tic)} sec')

    path = os.path.join(args.models_dir, f'{filename}.pth')
    torch.save(net.state_dict(), path)
def run(args, log):
    df = pd.read_csv(args.df_path)
    df_train = df[df['Fold'] != args.fold]
    df_valid = df[df['Fold'] == args.fold]
    dfs = {}
    dfs['train'] = df_train
    dfs['val'] = df_valid

    model = get_model(args).cuda()
    # NOTE: the original tested `args.mode != 'segmentation'` here, which made the
    # classification branch below unreachable; the per-mode freezing clearly intends
    # an equality test
    if args.mode == 'segmentation':
        for param in model.model.encoder.parameters():
            param.requires_grad = True
        for param in model.model.decoder.parameters():
            param.requires_grad = True
        for param in model.model.classification_head.parameters():
            param.requires_grad = False
    elif args.mode == 'classification':
        for param in model.model.encoder.parameters():
            param.requires_grad = False
        for param in model.model.decoder.parameters():
            param.requires_grad = False
        for param in model.classification_head.parameters():
            param.requires_grad = True

    criterion = get_loss(args)
    optimizer = get_optimizer(args, model)

    if args.initial_ckpt is not None:
        last_epoch, step = checkpoint.load_checkpoint(args, model, checkpoint=args.initial_ckpt)
        log.write(f'Resume training from {args.initial_ckpt} @ {last_epoch}\n')
    else:
        last_epoch, step = -1, -1

    dataloaders = {mode: get_dataloader(args.data_dir, dfs[mode], mode,
                                        args.pretrain, args.batch_size)
                   for mode in ['train', 'val']}

    seed_everything(seed=123)
    clr = CLR(optimizer, len(dataloaders['train']))
    train(args, model, dataloaders['train'], criterion, optimizer, clr)
def run(args):
    df = pd.read_csv(args.df_path)
    df_train = df[df['fold'] != args.fold]

    model = get_model(args).cuda()
    dataloader = get_dataloader(args.data_dir, df_train, 'train', args.pretrain, args.batch_size)

    checkpoints = get_checkpoints(args)
    # signature: args, model, ckpt_name, checkpoint=None, optimizer=None
    checkpoint.load_checkpoint(args, model, None, checkpoint=checkpoints[0])

    for i, ckpt in enumerate(checkpoints[1:]):
        print(i, ckpt)
        model2 = get_model(args).cuda()
        last_epoch, _ = checkpoint.load_checkpoint(args, model2, None, checkpoint=ckpt)
        if args.ema is None:
            swa.moving_average(model, model2, 1. / (i + 2))
        else:
            swa.moving_average(model, model2, args.ema)

    with torch.no_grad():
        swa.bn_update(dataloader, model)

    if args.ema is not None:
        output_name = f'model_ema_{len(checkpoints)}'
    else:
        output_name = f'model_swa_{len(checkpoints)}'
    print('save {}'.format(output_name))
    checkpoint.save_checkpoint(args, model, None, 0, 0, name=output_name,
                               weights_dict={'state_dict': model.state_dict()})
def run(config):
    train_group_csv_dir = './data/group_csv/'
    writer = SummaryWriter(config.train.dir)
    train_filenames = list(glob.glob(os.path.join(train_group_csv_dir, 'data_train_group_*')))[1:]
    for ti, train_file in tqdm.tqdm(enumerate(train_filenames)):
        gi_tr = train_file.replace('data_train_group_', '')
        gi_tr = gi_tr.split('/')[-1]
        gi_tr = gi_tr.replace('.csv', '')
        group_idx = int(gi_tr)

        utils.prepare_train_directories(config, group_idx)

        model = get_model(config, group_idx)
        if torch.cuda.is_available():
            model = model.cuda()
        criterion = get_loss(config)
        optimizer = get_optimizer(config, model.parameters())

        checkpoint = utils.checkpoint.get_initial_checkpoint(config, group_idx)
        if checkpoint is not None:
            last_epoch, step = utils.checkpoint.load_checkpoint(model, optimizer, checkpoint)
        else:
            last_epoch, step = -1, -1

        if last_epoch > config.train.num_epochs:
            print('group --', str(group_idx), '-- index-', ti, '-- already trained, skipping')
            continue
        print('from checkpoint: {} last epoch:{}'.format(checkpoint, last_epoch))
        print('group --', str(group_idx), '-- index-', ti)

        scheduler = get_scheduler(config, optimizer, last_epoch)

        dataloaders = {split: get_dataloader(config, group_idx, split, get_transform(config, split))
                       for split in ['train', 'val']}

        train(config, group_idx, model, dataloaders, criterion, optimizer, scheduler,
              writer, last_epoch + 1)
def inference(config, model, split, src_file, tta, output_filename=None):
    if split == 'test':
        data_path = '../data/dicom-images-test'
    else:
        data_path = '../data/dicom-images-train'
    dataset = CustomTestDataset(data_path, src_file, split, config[INPUT_SIZE])
    dataloader = get_dataloader(dataset, 1)

    model = model.cuda()
    model.eval()

    key_list = []
    probability_list = []
    with torch.no_grad():
        total_step = len(dataloader)
        for i, (images, id) in tqdm.tqdm(enumerate(dataloader), total=total_step):
            images = torch.cat(images, dim=0)
            images = images.cuda()
            logits = model(images)
            mean_logits = torch.mean(logits, dim=0, keepdim=True)
            probabilities = torch.softmax(mean_logits, dim=-1)
            probability_list.append(probabilities.cpu().numpy())
            key_list.extend(id)

    probabilities = np.concatenate(probability_list, axis=0)
    assert probabilities.shape[-1] == NUM_CLASSES

    records = []
    for id, probability in zip(key_list, probabilities):
        records.append(tuple([id] + ['{:.04f}'.format(p) for p in probability]))
    columns = ['id'] + ['P{:04d}'.format(l) for l in range(NUM_CLASSES)]
    df = pd.DataFrame.from_records(records, columns=columns)
    print('save {}'.format(output_filename))
    df.to_csv(output_filename, index=False)
def train():
    logging.info("==========loading data==========")
    train_data, valid_data, test_data = get_dataloader(Config)
    logging.info("==========end==========")

    logging.info("==========loading model==========")
    model = getattr(models, Config.model.name)(Config.model.num_class)
    logging.info("==========end==========")

    optimizer = getattr(optim, Config.train.optimizer)(model.parameters(),
                                                       lr=Config.train.lr,
                                                       weight_decay=Config.train.wd,
                                                       momentum=Config.train.momentum)
    ce_loss = nn.CrossEntropyLoss()
    # NOTE: the original called ReduceLROnPlateau() with no arguments, which raises a
    # TypeError; the scheduler requires the optimizer it wraps
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    model = model.cuda()

    model_solver = Solver(Config, model)
    model_solver.fit(train_data=train_data,
                     valid_data=valid_data,
                     optimizer=optimizer,
                     criterion=ce_loss,
                     lr_schduler=lr_scheduler)  # keyword spelling kept to match the Solver API
def __init__(self, output_dir):
    # make dirs for all kinds of output
    self.model_dir = os.path.join(output_dir, 'Model')
    os.makedirs(self.model_dir)
    self.image_dir = os.path.join(output_dir, 'Image')
    os.makedirs(self.image_dir)
    self.opt_dir = os.path.join(output_dir, 'Opt')
    os.makedirs(self.opt_dir)
    self.output_csv_file = os.path.join(output_dir, 'losses.csv')

    # make dataloader and code buffer
    self.dataloader = get_dataloader()

    # other variables
    self.batch_size = cfg.TRAIN.BATCH_SIZE
    self.patch_stride = 4.0
    self.n_out = 24
    self.recp_field = 34

    # get fixed images used for comparison for each epoch
    self.fixed_image = self.prepare_data(next(iter(self.dataloader)))[1]
    save_img_results(self.fixed_image.cpu(), None, -1, self.image_dir)
def submit(args, log):
    df = pd.read_csv(args.df_path)
    df['Image'] = df.Image_Label.map(lambda v: v[:v.find('_')])
    print(df.head())

    model = get_model(args).cuda()
    last_epoch, step = checkpoint.load_checkpoint(args, model, checkpoint=args.initial_ckpt)
    log.write(f'Loaded checkpoint from {args.initial_ckpt} @ {last_epoch}\n')

    dataloader = get_dataloader(args.data_dir, df, 'test', args.pretrain, args.batch_size)
    seed_everything()

    # inference
    test_ids, mask_predictions = inference_submit(model, dataloader, args.tta_augment)
    assert len(test_ids) == mask_predictions.shape[0]

    ids = []
    rles = []
    for i, image_id in tqdm.tqdm(enumerate(test_ids), total=len(test_ids)):
        predictions = mask_predictions[i]
        for cls_idx in range(4):
            prediction = predictions[cls_idx, :, :]
            H, W = prediction.shape
            assert H == 350 and W == 525
            rle_encoded = mask2rle(prediction)
            assert np.all(rle2mask(H, W, rle_encoded) == prediction)
            ids.append(f'{image_id}_{LABEL_LIST[cls_idx]}')
            rles.append(rle_encoded)

    df_submission = pd.DataFrame({'Image_Label': ids, 'EncodedPixels': rles})
    df_submission.to_csv(args.sub_name, index=False)
    print(df_submission.head())
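# `mask2rle` / `rle2mask` above follow the Kaggle run-length encoding convention
# (column-major pixel order, 1-indexed starts). A sketch consistent with the
# round-trip assert in submit(); the project's own versions may differ in detail.
# It assumes binary uint8 masks of shape (H, W):
import numpy as np

def mask2rle(mask):
    # flatten column-major, then record (start, length) pairs for runs of 1s
    pixels = mask.T.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

def rle2mask(height, width, rle):
    mask = np.zeros(height * width, dtype=np.uint8)
    if isinstance(rle, str) and rle.strip():
        vals = np.asarray(rle.split(), dtype=int)
        starts, lengths = vals[0::2] - 1, vals[1::2]
        for start, length in zip(starts, lengths):
            mask[start:start + length] = 1
    # undo the column-major flattening
    return mask.reshape(width, height).T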