def main(cfg: DictConfig):
    """Entry point: train a Cassava Leaf Disease classifier with PyTorch Lightning.

    Args:
        cfg: Hydra config; reads ``cfg.data.*``, ``cfg.train.*`` and
            ``cfg.comet_ml.*`` (Comet API key / project for tracking).
    """
    print('Cassava Leaf Disease Classification')
    # Hydra changes the CWD per run; chdir back to the launch directory so the
    # relative './input' paths below resolve.
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Config  -------------------------------------------------------------------
    data_dir = './input'
    seed_everything(cfg.data.seed)

    # Comet_ml: manual logging only (auto param/metric logging disabled).
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False)

    # Log Parameters
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))

    # Data Module ---------------------------------------------------------------
    transform = get_transforms(transform_name=cfg.data.transform,
                               img_size=cfg.data.img_size)
    cv = StratifiedKFold(n_splits=cfg.data.n_splits,
                         shuffle=True,
                         random_state=cfg.data.seed)
    dm = CassavaDataModule(data_dir,
                           cfg,
                           transform,
                           cv,
                           use_merge=True,
                           sample=DEBUG)

    # Model ----------------------------------------------------------------------
    net = Timm_model(cfg.train.model_type, pretrained=True)

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Loss fn ---------------------------------------------------------------------
    # Per-class weight = class frequency (count / total).
    # NOTE(review): this up-weights MAJORITY classes, not minority ones —
    # confirm inverse frequency was not intended.
    df = pd.read_csv('./input/merged.csv')
    weight = df['label'].value_counts().sort_index().tolist()
    weight = [w / len(df) for w in weight]
    weight = torch.tensor(weight).cuda()
    del df

    criterion = get_loss_fn(cfg.train.loss_fn, weight=weight, smoothing=0.05)

    # Optimizer, Scheduler --------------------------------------------------------
    if cfg.train.use_sam:
        # SAM wraps a base optimizer; Lightning must then run manual optimization
        # (see automatic_optimization below).
        base_optimizer = RAdam
        optimizer = SAM(net.parameters(),
                        base_optimizer,
                        lr=cfg.train.lr,
                        weight_decay=cfg.train.weight_decay)
    else:
        optimizer = RAdam(net.parameters(),
                          lr=cfg.train.lr,
                          weight_decay=cfg.train.weight_decay)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=cfg.train.epoch,
                                               eta_min=0)

    # Lightning Module -------------------------------------------------------------
    model = CassavaLightningSystem(net,
                                   cfg,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   experiment=experiment)

    # Trainer -------------------------------------------------------------------------
    trainer = Trainer(
        logger=False,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        amp_backend='apex',
        amp_level='O2',
        num_sanity_val_steps=0,  # Skip Sanity Check
        automatic_optimization=False if cfg.train.use_sam else True,
        # resume_from_checkpoint='./checkpoints/epoch=3-step=14047.ckpt'
    )

    # Train
    trainer.fit(model, datamodule=dm)
def train_runner(model: nn.Module,
                 model_name: str,
                 results_dir: str,
                 experiment: str = '',
                 debug: bool = False,
                 img_size: int = IMG_SIZE,
                 learning_rate: float = 1e-2,
                 fold: int = 0,
                 checkpoint: str = '',
                 epochs: int = 15,
                 batch_size: int = 8,
                 num_workers: int = 4,
                 from_epoch: int = 0,
                 save_oof: bool = False,
                 save_train_oof: bool = False,
                 gpu_number: int = 1):
    """
    Model training runner

    Args:
        model        : PyTorch model
        model_name   : string name for model for checkpoints saving
        results_dir  : directory to save results
        experiment   : string name for naming experiments
        debug        : if True, runs the debugging on few images
        img_size     : size of images for training
        learning_rate: initial learning rate (default = 1e-2)
        fold         : training fold (default = 0)
        checkpoint   : path to a checkpoint to resume from ('' = fresh run)
        epochs       : number of the last epochs to train
        batch_size   : number of images in batch
        num_workers  : number of workers available
        from_epoch   : epoch to start from when no checkpoint is given
        save_oof     : saves oof validation predictions. Default = False
        gpu_number   : CUDA device index to train on

    Fixes vs. previous version:
      * start_epoch / best_val_loss / best_val_metric are initialised up front,
        so a fresh run (checkpoint == '') no longer raises NameError.
      * best metrics loaded from a checkpoint are no longer clobbered by a
        re-initialisation just before the training loop.
      * the parameter log line now reports the checkpoint path (it previously
        printed start_epoch under the 'checkpoint:' label).
      * early stopping compares with <= instead of float ==.
    """
    device = torch.device(
        f'cuda:{gpu_number}' if torch.cuda.is_available() else 'cpu')
    print(device)

    # Resume state defaults — valid for a fresh run; overridden below when a
    # checkpoint is supplied.
    start_epoch = from_epoch
    best_val_loss = 1e+5
    best_val_metric = 0

    # load model weights to continue training
    if checkpoint != '':
        model, ckpt = load_model(model, checkpoint)
        best_val_metric = ckpt['valid_miou']
        best_val_loss = ckpt['valid_loss']
        start_epoch = ckpt['epoch'] + 1
        print('Loaded model from {}, epoch {}'.format(checkpoint, start_epoch - 1))
    model.to(device)

    # creates directories for checkpoints, tensorboard and predicitons
    checkpoints_dir = f'{results_dir}/checkpoints/{model_name}'
    predictions_dir = f'{results_dir}/oof/{model_name}'
    tensorboard_dir = f'{results_dir}/tensorboard/{model_name}'
    validations_dir = f'{results_dir}/oof_val/{model_name}'
    os.makedirs(checkpoints_dir, exist_ok=True)
    os.makedirs(predictions_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(validations_dir, exist_ok=True)
    logger = Logger(tensorboard_dir)
    print('\n', model_name, '\n')
    model = model.to(device)

    # datasets for train and validation (fold split comes from folds.csv)
    df = pd.read_csv(f'{TRAIN_DIR}folds.csv')
    df_train = df[df.fold != fold]
    df_val = df[df.fold == fold]
    print(len(df_train.ImageId.values), len(df_val.ImageId.values))

    train_dataset = SARDataset(
        sars_dir=TRAIN_SAR,
        masks_dir=TRAIN_MASKS,
        labels_df=df_train,
        img_size=img_size,
        transforms=TRANSFORMS["medium"],
        preprocess=True,
        normalise=True,
        debug=debug,
    )
    valid_dataset = SARDataset(
        sars_dir=TRAIN_SAR,
        masks_dir=TRAIN_MASKS,
        labels_df=df_val,
        img_size=img_size,
        transforms=TRANSFORMS["d4"],
        preprocess=True,
        normalise=True,
        debug=debug,
    )

    # dataloaders for train and validation
    dataloader_train = DataLoader(train_dataset,
                                  num_workers=num_workers,
                                  batch_size=batch_size,
                                  shuffle=True)
    dataloader_valid = DataLoader(valid_dataset,
                                  num_workers=num_workers,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  drop_last=True)
    print('{} training images, {} validation images'.format(
        len(train_dataset), len(valid_dataset)))

    # optimizers and schedulers
    # optimizer = AdamW(model.parameters(), lr=learning_rate)
    optimizer = RAdam(model.parameters(), lr=learning_rate)
    # plateau scheduler drives early stopping (min_lr=1e-7 is the stop signal)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='max',
                                               patience=3,
                                               verbose=True,
                                               factor=0.2,
                                               min_lr=1e-7)
    num_batches = len(train_dataset) // batch_size + 1
    # per-batch cosine schedule within an epoch
    scheduler_cos = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=num_batches,
                                                   eta_min=1e-6,
                                                   last_epoch=-1)
    # load optimizer state continue training
    #if checkpoint != '':
    #    optimizer = load_optim(optimizer, checkpoint, device)

    # criteria: BCE for the plain validation loss, BCE+Jaccard for training
    criterion1 = nn.BCEWithLogitsLoss()
    criterion = BCEJaccardLoss(bce_weight=2,
                               jaccard_weight=0.5,
                               log_loss=False,
                               log_sigmoid=True)
    #criterion = JaccardLoss(log_sigmoid=True, log_loss=False)

    # logging
    #if make_log:
    report_batch = 20
    report_epoch = 20
    log_file = os.path.join(checkpoints_dir, f'{experiment}fold_{fold}.log')
    logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG)
    logging.info(
        f'Parameters:\n model_name: {model_name}\n, results_dir: {results_dir}\n, experiment: {experiment}\n, img_size: {img_size}\n, \
 learning_rate: {learning_rate}\n, fold: {fold}\n, epochs: {epochs}\n, batch_size: {batch_size}\n, num_workers: {num_workers}\n, \
 start_epoch: {start_epoch}\n, save_oof: {save_oof}\n, optimizer: {optimizer}\n, scheduler: {scheduler} \n, checkpoint: {checkpoint} \n'
    )

    train_losses, val_losses = [], []

    # training cycle
    print("Start training")
    for epoch in range(start_epoch, start_epoch + epochs + 1):
        print("Epoch", epoch)
        epoch_losses = []
        progress_bar = tqdm(dataloader_train, total=len(dataloader_train))
        progress_bar.set_description('Epoch {}'.format(epoch))
        with torch.set_grad_enabled(True):  # --> sometimes people write it, idk
            for batch_num, (img, target, _) in enumerate(progress_bar):
                img = img.to(device)
                target = target.float().to(device)
                prediction = model(img).to(device)
                loss = criterion(prediction, target)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
                optimizer.step()
                epoch_losses.append(loss.detach().cpu().numpy())
                scheduler_cos.step()
                if debug:
                    # get current learning rate
                    for param_group in optimizer.param_groups:
                        learning_rate = param_group['lr']
                    print(f'current learning_rate: {learning_rate}')
                if batch_num and batch_num % report_batch == 0:
                    for param_group in optimizer.param_groups:
                        learning_rate = param_group['lr']
                    logging.info(
                        f'epoch: {epoch}; step: {batch_num}; learning_rate: {learning_rate}; loss: {np.mean(epoch_losses)} \n'
                    )

        # log loss history
        print("Epoch {}, Train Loss: {}".format(epoch, np.mean(epoch_losses)))
        train_losses.append(np.mean(epoch_losses))
        logger.scalar_summary('loss_train', np.mean(epoch_losses), epoch)
        logging.info(
            f'epoch: {epoch}; step: {batch_num}; loss: {np.mean(epoch_losses)} \n'
        )

        # validate model
        val_loss = validate_loss(model, dataloader_valid, criterion1, epoch,
                                 validations_dir, device)
        valid_metrics = validate(model, dataloader_valid, criterion, epoch,
                                 validations_dir, save_oof, debug, device)

        # logging metrics
        logger.scalar_summary('loss_valid', valid_metrics['val_loss'], epoch)
        logger.scalar_summary('miou_valid', valid_metrics['miou'], epoch)
        valid_loss, val_metric = valid_metrics['val_loss'], valid_metrics['miou']
        logging.info(
            f'epoch: {epoch}; val_bce: {val_loss}; val_loss: {valid_loss}; val_miou: {val_metric}\n'
        )
        val_losses.append(valid_metrics['val_loss'])

        # get current learning rate
        for param_group in optimizer.param_groups:
            learning_rate = param_group['lr']
        print(f'learning_rate: {learning_rate}')
        logging.info(f'learning_rate: {learning_rate}\n')
        scheduler.step(val_metric)

        # the checkpoint payload is identical for all three save paths below
        checkpoint_state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
            'loss': np.mean(epoch_losses),
            'valid_loss': valid_metrics['val_loss'],
            'valid_miou': valid_metrics['miou'],
        }

        # save the best metric
        if valid_metrics['miou'] > best_val_metric:
            best_val_metric = valid_metrics['miou']
            print(
                f"Saving model with the best val metric {valid_metrics['miou']}, epoch {epoch}"
            )
            checkpoint_filename = "{}_best_val_miou.pth".format(model_name)
            checkpoint_filepath = os.path.join(checkpoints_dir,
                                               checkpoint_filename)
            torch.save(checkpoint_state, checkpoint_filepath)

        # save the best loss
        if valid_metrics['val_loss'] < best_val_loss:
            best_val_loss = valid_metrics['val_loss']
            print(
                f"Saving model with the best val loss {valid_metrics['val_loss']}, epoch {epoch}"
            )
            checkpoint_filename = "{}_best_val_loss.pth".format(model_name)
            checkpoint_filepath = os.path.join(checkpoints_dir,
                                               checkpoint_filename)
            torch.save(checkpoint_state, checkpoint_filepath)
        # save model, optimizer and losses after every n epoch
        elif epoch % report_epoch == 0:
            print(
                f"Saving model at epoch {epoch}, val loss {valid_metrics['val_loss']}"
            )
            checkpoint_filename = "{}_epoch_{}.pth".format(model_name, epoch)
            checkpoint_filepath = os.path.join(checkpoints_dir,
                                               checkpoint_filename)
            torch.save(checkpoint_state, checkpoint_filepath)

        # Early stopping once ReduceLROnPlateau has decayed to its min_lr
        # (<= instead of == to avoid float-equality fragility)
        if learning_rate <= 1e-7:
            print(
                f"Stop trainig, reached minimal LR: {learning_rate} at epoch {epoch}"
            )
            break
def __init__(self, T_max, eta_min=0):
    """Register a deferred CosineAnnealingLR factory with the base class.

    The base class receives a callable that, given an optimizer, builds the
    actual torch scheduler with the captured hyper-parameters.
    """
    def build(optimizer):
        # Invoked later by the base class with the concrete optimizer.
        return _scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=eta_min)

    super().__init__(build)
def train():
    """Train a DeepMAR ResNet-50 attribute classifier on the PETA dataset.

    Reads module-level globals: ``args``, ``BATCH_SIZE``, ``gpu_ids``,
    ``writer`` (TensorBoard), ``save_dir`` and ``modelName``.
    Iteration-driven loop: runs ``args.max_iter`` steps with a linear LR
    warm-up for the first ``args.warm_up`` epochs, then cosine annealing.
    """
    ### Load Dataset
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),  # 3*H*W, [0, 1]
        normalize,
    ])
    dataset = PETADataLoader(
        listfile=
        '/home/duycuong/PycharmProjects/research_py3/tensorflow_slim/data/PETA/train_list_v2.txt',
        transform=transform)
    data_loader = data.DataLoader(dataset,
                                  BATCH_SIZE,
                                  num_workers=args.num_worker,
                                  shuffle=True)
    #print (dataset.get_classNum())

    ### Build Model
    net_ = dm.DeepMAR_res50(dataset.get_classNum())
    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        net_.load_weights(args.resume)
    if args.cuda:
        if len(gpu_ids) > 1:
            net_ = torch.nn.DataParallel(net_, device_ids=gpu_ids).cuda()
        else:
            #device = torch.device('cuda:1')
            #torch.cuda.set_device(gpu_ids[0])
            net_ = net_.cuda()
        cudnn.benchmark = True
    optimizer = optim.SGD(net_.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    # multi-label attribute loss
    criterion = F.binary_cross_entropy_with_logits  #https://discuss.pytorch.org/t/bceloss-vs-bcewithlogitsloss/33586/8
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.max_epoch)
    net_.train()

    # loss counters
    total_loss = 0
    epoch = 0
    print('Loading the dataset...', len(dataset))
    epoch_size = len(dataset) // BATCH_SIZE
    print('Using the specified args:')
    print(args)
    step_index = 0

    # create batch iterator; iter_counter tracks position so the iterator can
    # be re-created when exhausted
    batch_iterator = iter(data_loader)
    iter_counter = 0
    for iteration in range(0, args.max_iter):
        iter_counter += 1
        if iteration != 0 and (iteration % epoch_size == 0):
            # reset epoch loss counters
            writer.add_scalar('data/total_loss_epoch',
                              total_loss / len(dataset), epoch)
            total_loss = 0
            epoch += 1
            if epoch > args.max_epoch:
                break
        if iteration % epoch_size == 0:
            if epoch > args.warm_up:
                # NOTE(review): passing an epoch to CosineAnnealingLR.step()
                # is the deprecated pre-1.1 scheduler API — confirm the pinned
                # torch version supports it.
                scheduler.step(epoch - args.warm_up)
            elif epoch == args.warm_up:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = args.lr
            else:
                # linear warm-up from 1e-4 to args.lr over warm_up epochs
                lrr = 1e-4 + (args.lr - 1e-4) * iteration / (epoch_size *
                                                             args.warm_up)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lrr
        if iter_counter >= len(batch_iterator):
            batch_iterator = iter(data_loader)
            iter_counter = 0
        # load train data
        images, targets = next(batch_iterator)
        #print (images.shape)
        if args.cuda:
            images = Variable(images.cuda())
            with torch.no_grad():
                targets = Variable(targets.cuda())
            #targets = [Variable(ann.cuda()) for ann in targets]
        else:
            images = Variable(images)
            with torch.no_grad():
                targets = Variable(targets)
            #targets = [Variable(ann) for ann in targets]
        # forward
        t0 = time.time()
        out = net_(images)
        # backprop
        optimizer.zero_grad()
        loss_ = criterion(out, targets)
        loss_.backward()
        optimizer.step()
        t1 = time.time()
        # add log
        total_loss += loss_.item()
        #map_loss += 0#loss_map.item()
        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + '||epoch:' + repr(epoch) +
                  ' || Loss: %.4f ||' % (loss_.item()),
                  end=' ')
            writer.add_scalar('data/total_loss_iter', loss_.item(), iteration)
        ### save
        if iteration != 0 and iteration % 1000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(
                net_.state_dict(),
                os.path.join(save_dir, 'models',
                             modelName + '_iter-' + repr(iteration) + '.pth'))
    # final save after the loop; reuses the last value of `iteration`
    # NOTE(review): this filename can collide with the last in-loop save —
    # confirm a distinct "final" name was not intended.
    torch.save(
        net_.state_dict(),
        os.path.join(save_dir, 'models',
                     modelName + '_iter-' + repr(iteration) + '.pth'))
def train(args):
    """Train an SST sentence-classification model.

    Builds vocab over train/valid/test (open-vocabulary), loads pretrained
    word vectors, trains with Adam/Adadelta under either a cosine schedule
    (optionally with warm restarts) or ReduceLROnPlateau on valid accuracy,
    and checkpoints the best model by validation accuracy.

    Fix vs. previous version: the model config is read with
    ``yaml.safe_load`` — the old bare ``yaml.load(f)`` call has no ``Loader``
    argument, which is rejected by PyYAML >= 6 and unsafe on older versions.
    """
    device = torch.device(args.device)
    text_field = TextField()
    label_field = LabelField()
    train_dataset, valid_dataset, test_dataset = load_data(
        root='data', text_field=text_field, label_field=label_field)
    # Our model will be run in 'open-vocabulary' mode.
    text_field.build_vocab(train_dataset, valid_dataset, test_dataset)
    label_field.build_vocab(train_dataset)
    text_field.vocab.load_vectors(args.word_vector)
    train_loader, valid_loader, test_loader = data.Iterator.splits(
        datasets=(train_dataset, valid_dataset, test_dataset),
        batch_size=args.batch_size,
        device=device)
    config_path = os.path.join(args.save_dir, 'config.yml')
    with open(config_path, 'r') as f:
        # safe_load: plain-data config, no arbitrary Python object construction
        config = yaml.safe_load(f)
    model = SSTModel(num_words=len(text_field.vocab),
                     num_classes=len(label_field.vocab),
                     **config['model'])
    model.word_embedding.weight.data.set_(text_field.vocab.vectors)
    # optionally freeze the embedding table
    model.word_embedding.weight.requires_grad = not args.fix_word_embeddings
    print(model)
    model.to(device)
    num_params = sum(p.numel() for p in model.parameters())
    num_intrinsic_params = num_params - model.word_embedding.weight.numel()
    logger.info(f'* # of params: {num_params}')
    logger.info(f' - Intrinsic: {num_intrinsic_params}')
    logger.info(f' - Word embedding: {num_params - num_intrinsic_params}')
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    if args.optimizer == 'adam':
        optimizer = optim.Adam(trainable_params)
    elif args.optimizer == 'adadelta':
        optimizer = optim.Adadelta(trainable_params)
    else:
        raise ValueError('Unknown optimizer')
    # warm restarts only make sense on top of the cosine schedule
    assert not args.warm_restart or args.cosine_lr
    if args.cosine_lr:
        if not args.warm_restart:
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer=optimizer, T_max=len(train_loader) * args.max_epoch)
        else:
            # first restart period = 2 epochs; doubled at every restart below
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer=optimizer, T_max=len(train_loader) * 2)
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                   mode='max',
                                                   factor=0.5,
                                                   patience=4,
                                                   verbose=True)
    criterion = nn.CrossEntropyLoss()

    def run_iter(batch):
        # One forward pass; also backprops + steps the optimizer when the
        # model is in training mode.
        text, length = batch.text
        label = batch.label
        logit = model(inputs=text, length=length)
        clf_loss = criterion(input=logit, target=label)
        pred = logit.max(1)[1]
        accuracy = torch.eq(pred, label).float().mean()
        if model.training:
            if args.l2_weight > 0:
                l2_norm = sum(p.pow(2).sum() for p in trainable_params).sqrt()
            else:
                l2_norm = 0
            loss = clf_loss + args.l2_weight * l2_norm
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(trainable_params, max_norm=5)
            optimizer.step()
        return clf_loss.item(), accuracy.item()

    def validate(loader):
        # Size-weighted average of loss/accuracy over a loader (eval mode).
        model.eval()
        clf_loss_sum = accuracy_sum = 0
        num_valid_data = len(loader.dataset)
        with torch.no_grad():
            for valid_batch in loader:
                clf_loss, accuracy = run_iter(valid_batch)
                clf_loss_sum += clf_loss * valid_batch.batch_size
                accuracy_sum += accuracy * valid_batch.batch_size
        clf_loss = clf_loss_sum / num_valid_data
        accuracy = accuracy_sum / num_valid_data
        return clf_loss, accuracy

    train_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'train'))
    valid_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'valid'))
    validate_every = len(train_loader) // args.verbosity
    best_valid_accuracy = 0
    global_step = 0
    logger.info('Training starts!')
    for train_batch in train_loader:
        # validate() switches to eval mode; switch back for training
        if not model.training:
            model.train()
        train_clf_loss, train_accuracy = run_iter(train_batch)
        global_step += 1
        if args.cosine_lr:
            if not args.warm_restart:
                scheduler.step()
            else:
                if scheduler.last_epoch == scheduler.T_max:
                    # restart: double the period and reset the cosine phase
                    scheduler.T_max = scheduler.T_max * 2
                    scheduler.step(0)
                    logger.info('Warm-restarted the learning rate!')
                else:
                    scheduler.step()
        train_summary_writer.add_scalar(tag='clf_loss',
                                        scalar_value=train_clf_loss,
                                        global_step=global_step)
        train_summary_writer.add_scalar(tag='accuracy',
                                        scalar_value=train_accuracy,
                                        global_step=global_step)
        if global_step % validate_every == 0:
            progress = train_loader.iterations / len(train_loader)
            logger.info(f'* Epoch {progress:.2f}')
            logger.info(f' - lr = {optimizer.param_groups[0]["lr"]:.6f}')
            logger.info(f' - Validation starts')
            valid_clf_loss, valid_accuracy = validate(valid_loader)
            _, test_accuracy = validate(test_loader)
            if not args.cosine_lr:
                scheduler.step(valid_accuracy)
            valid_summary_writer.add_scalar(tag='clf_loss',
                                            scalar_value=valid_clf_loss,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(tag='accuracy',
                                            scalar_value=valid_accuracy,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(
                tag='lr',
                scalar_value=optimizer.param_groups[0]['lr'],
                global_step=global_step)
            logger.info(f' - Valid clf loss: {valid_clf_loss:.5f}')
            logger.info(f' - Valid accuracy: {valid_accuracy:.5f}')
            logger.info(f' - Test accuracy: {test_accuracy:.5f}')
            if valid_accuracy > best_valid_accuracy:
                best_valid_accuracy = valid_accuracy
                model_filename = (f'best-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f' - Saved the new best model to: {model_path}')
            elif args.save_every_epoch and global_step % (validate_every *
                                                          10) == 0:
                model_filename = (f'model-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f' - Saved the new model to: {model_path}')
        if train_loader.epoch > args.max_epoch:
            break
def train_prune_32bit(model,
                      dataloader,
                      test_loader,
                      best_model_wts_init,
                      args,
                      prune_rate=50.):
    """Fine-tune a weight-pruned model in fp32.

    At epoch 0 a prune mask is computed from the current weights, the model is
    reset to ``best_model_wts_init`` and the mask applied; the mask is
    re-applied after each epoch. Hyper-parameters (lr, momentum, milestones,
    ...) are read off attributes of ``model`` itself.

    Returns:
        (losses, accuracies, test_losses, test_accuracies, best_model_wts)
        — note ``accuracies`` stays empty (train-eval is commented out).
    """
    device = model.device
    momentum = model.momentum
    learning_rate = model.lr
    num_epochs = model.num_epochs
    milestones = model.milestones
    gamma = model.gamma
    weight_decay = model.weight_decay
    nesterov = model.nesterov
    if args.label_regularize == 'labelsmooth':
        criterion = LabelSmoothingLoss(model.device, model.num_classes, 0.1, 1)
    else:
        criterion = model.criterion
    batch_number = len(dataloader.dataset) // dataloader.batch_size
    if args.batch_wd:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=learning_rate,
                                    momentum=momentum,
                                    nesterov=nesterov,
                                    weight_decay=weight_decay)
    else:
        # exclude 1-D params (batch-norm scales/biases) from weight decay
        batch_params = [
            module for module in model.parameters()
            if module.ndimension() == 1
        ]
        other_params = [
            module for module in model.parameters() if module.ndimension() > 1
        ]
        optimizer = torch.optim.SGD([{
            'params': batch_params,
            'weight_decay': 0
        }, {
            'params': other_params,
            'weight_decay': weight_decay
        }],
                                    lr=learning_rate,
                                    momentum=momentum,
                                    nesterov=nesterov)
    # NOTE(review): for any args.lr_type other than 'step'/'cos', `scheduler`
    # is unbound and the loop below raises NameError — confirm inputs.
    if args.lr_type == 'step':
        scheduler = lr_scheduler.MultiStepLR(gamma=gamma,
                                             milestones=milestones,
                                             optimizer=optimizer)
    elif args.lr_type == 'cos':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                   T_max=num_epochs,
                                                   eta_min=0.0005,
                                                   last_epoch=-1)
    losses = []
    test_losses = []
    accuracies = []
    test_accuracies = []
    best_acc = 0
    best_model_wts = copy.deepcopy(model.state_dict())
    ortho_miles = []
    for epoch in range(num_epochs):
        #get mask
        if epoch == 0:
            masks = weight_prune(model, prune_rate)
            model.load_state_dict(best_model_wts_init, strict=False)
            model.set_masks(masks)
        model.train()
        if args.label_regularize == 'labelsimilar':
            # rebuild the similarity-smoothed criterion from the current fc layer
            similarity = fc_similarity(model, device)
            criterion = LabelSimilarLoss(model.device, model.num_classes,
                                         similarity, 0.1, 1)
        # NOTE(review): scheduler.step() before the epoch's optimizer steps is
        # the pre-1.1 torch convention — confirm the pinned torch version.
        scheduler.step()
        for i, (images, labels) in enumerate(tqdm(dataloader)):
            images = images.type(torch.FloatTensor).to(device)
            labels = labels.type(torch.LongTensor).to(device)
            if args.input_regularize:
                # NOTE(review): a truthy value other than 'cutmix'/'mixup'
                # leaves lam/labels_a/labels_b unbound -> NameError.
                if args.input_regularize == 'cutmix':
                    lam, images, labels_a, labels_b = cutmix_32bit(
                        images, labels, device)
                elif args.input_regularize == 'mixup':
                    lam, images, labels_a, labels_b = mixup_32bit(
                        images, labels, device)
                optimizer.zero_grad()
                outputs = model(images)
                # convex combination of losses on the two mixed label sets
                loss = lam * criterion(outputs, labels_a) + (
                    1 - lam) * criterion(outputs, labels_b)
            else:
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
            if args.ortho:
                # orthogonality regularisation on weight matrices
                loss += args.ortho_lr * l2_reg_ortho_32bit(model, device)
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
            # progress report four times per epoch
            if (i + 1) % (batch_number // 4) == 0:
                tqdm.write(
                    'Epoch[{}/{}] , Step[{}/{}], Loss: {:.4f}, lr = {}'.format(
                        epoch + 1, num_epochs, i + 1, len(dataloader),
                        loss.item(), optimizer.param_groups[0]['lr']))
        #print('|| Train || === ', end = '')
        # re-apply the prune mask after the epoch's weight updates
        model.set_masks(masks)
        #tr_accuracy, tr_loss = eval_16bit(model, dataloader)
        print('|| Test || === ', end='')
        test_accuracy, test_loss = eval_32bit(model, test_loader)
        if test_accuracy > best_acc:
            best_acc = test_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        # accuracies.append(tr_accuracy)
        #losses.append(tr_loss)
        test_accuracies.append(test_accuracy)
        test_losses.append(test_loss)
    return losses, accuracies, test_losses, test_accuracies, best_model_wts
def main():
    """Train a segmentation model (e.g. UNet / NestedUNet) driven by CLI config.

    Builds criterion/model/optimizer/scheduler from ``parse_args()``, splits
    the image ids 80/20, trains with per-epoch validation, logs to
    ``models/<name>/log.csv`` and keeps the best-IoU checkpoint, with optional
    early stopping.
    """
    config = vars(parse_args())

    if config['name'] is None:
        if config['deep_supervision']:
            config['name'] = '%s_%s_wDS' % (config['dataset'], config['arch'])
        else:
            config['name'] = '%s_%s_woDS' % (config['dataset'], config['arch'])
    os.makedirs('models/%s' % config['name'], exist_ok=True)

    print('-' * 20)
    for key in config:
        print('%s: %s' % (key, config[key]))
    print('-' * 20)

    with open('models/%s/config.yml' % config['name'], 'w') as f:
        yaml.dump(config, f)

    # define loss function (criterion)
    if config['loss'] == 'BCEWithLogitsLoss':
        # "WithLogits" = sigmoid is applied before the cross-entropy
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[config['loss']]().cuda()

    cudnn.benchmark = True

    # create model
    print("=> creating model %s" % config['arch'])
    model = archs.__dict__[config['arch']](config['num_classes'],
                                           config['input_channels'],
                                           config['deep_supervision'])
    model = model.cuda()

    params = filter(lambda p: p.requires_grad, model.parameters())
    if config['optimizer'] == 'Adam':
        optimizer = optim.Adam(params,
                               lr=config['lr'],
                               weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'SGD':
        optimizer = optim.SGD(params,
                              lr=config['lr'],
                              momentum=config['momentum'],
                              nesterov=config['nesterov'],
                              weight_decay=config['weight_decay'])
    else:
        raise NotImplementedError

    if config['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=config['epochs'],
                                                   eta_min=config['min_lr'])
    elif config['scheduler'] == 'ReduceLROnPlateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   factor=config['factor'],
                                                   patience=config['patience'],
                                                   verbose=1,
                                                   min_lr=config['min_lr'])
    elif config['scheduler'] == 'MultiStepLR':
        scheduler = lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[int(e) for e in config['milestones'].split(',')],
            gamma=config['gamma'])
    elif config['scheduler'] == 'ConstantLR':
        scheduler = None
    else:
        raise NotImplementedError

    # Data loading code
    img_ids = glob(
        os.path.join('inputs', config['dataset'], 'images',
                     '*' + config['img_ext']))
    img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    train_img_ids, val_img_ids = train_test_split(img_ids,
                                                  test_size=0.2,
                                                  random_state=41)
    # data augmentation:
    train_transform = Compose([
        transforms.RandomRotate90(),
        transforms.Flip(),
        OneOf([
            transforms.HueSaturationValue(),
            transforms.RandomBrightness(),
            transforms.RandomContrast(),
        ], p=1),  # pick exactly one of these, by their normalised probabilities
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(),
    ])

    val_transform = Compose([
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(),
    ])

    train_dataset = Dataset(img_ids=train_img_ids,
                            img_dir=os.path.join('inputs', config['dataset'],
                                                 'images'),
                            mask_dir=os.path.join('inputs', config['dataset'],
                                                  'masks'),
                            img_ext=config['img_ext'],
                            mask_ext=config['mask_ext'],
                            num_classes=config['num_classes'],
                            transform=train_transform)
    val_dataset = Dataset(img_ids=val_img_ids,
                          img_dir=os.path.join('inputs', config['dataset'],
                                               'images'),
                          mask_dir=os.path.join('inputs', config['dataset'],
                                                'masks'),
                          img_ext=config['img_ext'],
                          mask_ext=config['mask_ext'],
                          num_classes=config['num_classes'],
                          transform=val_transform)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_workers'],
        drop_last=True)  # drop the final incomplete batch
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=config['batch_size'],
                                             shuffle=False,
                                             num_workers=config['num_workers'],
                                             drop_last=False)

    log = OrderedDict([
        ('epoch', []),
        ('lr', []),
        ('loss', []),
        ('iou', []),
        ('val_loss', []),
        ('val_iou', []),
    ])

    best_iou = 0
    trigger = 0  # epochs since last improvement (early-stopping counter)
    for epoch in range(config['epochs']):
        print('Epoch [%d/%d]' % (epoch, config['epochs']))

        # train for one epoch
        train_log = train(config, train_loader, model, criterion, optimizer)
        # evaluate on validation set
        val_log = validate(config, val_loader, model, criterion)

        if config['scheduler'] == 'CosineAnnealingLR':
            scheduler.step()
        elif config['scheduler'] == 'ReduceLROnPlateau':
            scheduler.step(val_log['loss'])

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
              (train_log['loss'], train_log['iou'], val_log['loss'],
               val_log['iou']))

        log['epoch'].append(epoch)
        # NOTE(review): this records the INITIAL lr each epoch, not the
        # scheduler-adjusted current lr — confirm this is intended.
        log['lr'].append(config['lr'])
        log['loss'].append(train_log['loss'])
        log['iou'].append(train_log['iou'])
        log['val_loss'].append(val_log['loss'])
        log['val_iou'].append(val_log['iou'])

        pd.DataFrame(log).to_csv('models/%s/log.csv' % config['name'],
                                 index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(),
                       'models/%s/model.pth' % config['name'])
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if config['early_stopping'] >= 0 and trigger >= config[
                'early_stopping']:
            print("=> early stopping")
            break

        torch.cuda.empty_cache()
def train():
    """Train an SSD300 detector on VOC or COCO (module-level ``args`` config).

    NOTE(review): uses legacy PyTorch <= 0.4 idioms (``Variable``,
    ``volatile=True``, ``loss.data[0]``) — this function only runs on an old
    pinned torch version; confirm before porting.
    """
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCO_ROOT):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because " +
                  "--dataset_root was not specified.")
            args.dataset_root = COCO_ROOT
        cfg = coco
        dataset = COCODetection(root=args.dataset_root,
                                transform=SSDAugmentation(
                                    cfg['min_dim'], MEANS))
    elif args.dataset == 'VOC':
        # if args.dataset_root == COCO_ROOT:
        #     parser.error('Must specify dataset if specifying dataset_root')
        cfg = voc
        dataset = VOCDetection(root=args.dataset_root,
                               transform=SSDAugmentation(
                                   cfg['min_dim'], MEANS))

    if args.visdom:
        import visdom
        viz = visdom.Visdom()

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net

    if args.cuda:
        net = torch.nn.DataParallel(ssd_net)
        cudnn.benchmark = True

    if args.resume:
        print('[DEBUG] Resuming training, loading {}...'.format(args.resume))
        ssd_net.load_weights(args.resume)
    else:
        vgg_weights = torch.load(args.save_folder + args.basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if args.cuda:
        net = net.cuda()

    if not args.resume:
        print('[DEBUG] Initializing weights...')
        # initialize newly added layers' weights with xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda, lm=True)
    # cosine annealing over the 250 training epochs below
    sche = lr_scheduler.CosineAnnealingLR(optimizer, T_max=250)
    # initSummaty()
    log_folder = './results/' + net.__class__.__name__ + '/' + optimizer.__class__.__name__ + '/' + 'lm/' + str(
        1001) + '/'
    print("log_folder: ", log_folder)
    writer = SummaryWriter(log_folder)

    net.train()
    # loss counters (accumulated per epoch, reset after logging)
    loc_loss = 0
    conf_loss = 0
    print('Loading the dataset...')
    epoch_size = len(dataset) // args.batch_size
    dataset_len = epoch_size
    print("[DEBUG] dataset len: {}".format(len(dataset)))
    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)
    step_index = 0

    if args.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    data_loader = data.DataLoader(dataset,
                                  args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)
    for epoch in range(args.start_iter, 250):
        batch_iterator = iter(data_loader)
        t0 = time.time()
        # NOTE(review): step(epoch) is the deprecated pre-1.1 scheduler API
        sche.step(epoch)
        for iteration in range(epoch_size):
            # load train data
            images, targets = next(batch_iterator)
            if args.cuda:
                images = Variable(images.cuda())
                targets = [
                    Variable(anno.cuda(), volatile=True) for anno in targets
                ]
            else:
                images = Variable(images)
                targets = np.array(
                    [Variable(ann, volatile=True) for ann in targets])
            # forward
            out = net(images)
            # backprop
            optimizer.zero_grad()
            loss_l, loss_c = criterion(out, targets)
            # loss_l, loss_c = loss_l.sum(), loss_c.sum()
            loss = loss_l + loss_c
            loss.backward()
            optimizer.step()
            loc_loss += loss_l.data[0]
            conf_loss += loss_c.data[0]
            lr_now = optimizer.param_groups[0]['lr']
            print('Epoch [{}/{}] '.format(epoch, 400) + 'iter ' +
                  repr(iteration) +
                  '||Loss: %.4f|loss_l:%.4f|loss_c:%.4f||' %
                  (loss.data[0], loss_l.data[0], loss_c.data[0]) +
                  'lr={}'.format(lr_now),
                  end='\r')
        t1 = time.time()
        print('Epoch [{}/{}] '.format(epoch, 400) + 'timer: %.4f sec.' %
              (t1 - t0),
              end='\n')
        writer.add_scalar('loc_loss', loc_loss, epoch)
        writer.add_scalar('conf_loss', conf_loss, epoch)
        lr_now = optimizer.param_groups[0]['lr']
        writer.add_scalar('learning rate', lr_now, epoch)
        # reset epoch loss counters
        loc_loss = 0
        conf_loss = 0
        if epoch % 10 == 0:
            print('Saving state, epoch:', epoch)
            torch.save(
                ssd_net.state_dict(),
                'weights/lm/ssd300_' + args.dataset + '_' + repr(epoch) +
                '.pth')
    # final save after the full training run
    torch.save(ssd_net.state_dict(),
               args.save_folder + '' + args.dataset + '.pth')
def train_model_snapshot(model, criterion, lr, dataloaders, dataset_sizes,
                         device, num_cycles, num_epochs_per_cycle):
    """Train `model` with cosine-annealed snapshot ensembling.

    Runs `num_cycles` cosine-annealing cycles of `num_epochs_per_cycle`
    epochs each; a weight snapshot is stored at the end of every cycle.
    During the last epoch of each cycle the validation softmax
    probabilities are accumulated so an ensemble NLL can be reported.

    Args:
        model: torch.nn.Module to train (modified in place).
        criterion: classification loss, e.g. nn.CrossEntropyLoss().
        lr: initial learning rate for each cycle's SGD optimizer.
        dataloaders: dict with 'train' and 'val' DataLoaders.
        dataset_sizes: dict with 'train' and 'val' dataset lengths.
        device: torch.device to run on.
        num_cycles: number of snapshot cycles.
        num_epochs_per_cycle: epochs per cycle.

    Returns:
        Tuple (model_arr, ensemble_loss, best_loss, prob):
        model_arr — one *independent* model copy per snapshot;
        ensemble_loss — NLL of the cycle-averaged validation probabilities;
        best_loss — best single-epoch validation loss seen;
        prob — averaged (n_val, 3) validation probability matrix.
    """
    since = time.time()
    # Best-epoch weights are tracked but not returned; retained for parity
    # with the original implementation.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 1000000.0
    model_w_arr = []
    # NOTE: the number of classes (3) is hard-coded to match the caller.
    prob = torch.zeros((dataset_sizes['val'], 3),
                       dtype=torch.float32).to(device)
    lbl = torch.zeros((dataset_sizes['val'], ), dtype=torch.long).to(device)
    for cycle in range(num_cycles):
        # A fresh optimizer/scheduler per cycle restarts the cosine schedule
        # (the "warm restart" of snapshot ensembling).
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer, num_epochs_per_cycle * len(dataloaders['train']))
        for epoch in range(num_epochs_per_cycle):
            # Each epoch has a training and validation phase.
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()
                running_loss = 0.0
                running_corrects = 0
                idx = 0
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    optimizer.zero_grad()
                    # Track gradient history only while training.
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        # Accumulate validation probabilities on the last
                        # epoch of the cycle (the snapshot that is kept).
                        if (epoch == num_epochs_per_cycle - 1) and (phase == 'val'):
                            prob[idx:idx + inputs.shape[0]] += F.softmax(
                                outputs, dim=1)
                            lbl[idx:idx + inputs.shape[0]] = labels
                            idx += inputs.shape[0]
                        loss = criterion(outputs, labels)
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                            scheduler.step()  # cosine step per batch
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]
                if phase == 'val' and epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
        model_w_arr.append(copy.deepcopy(model.state_dict()))
    prob /= num_cycles
    ensemble_loss = F.nll_loss(torch.log(prob), lbl)
    ensemble_loss = ensemble_loss.item()
    time_elapsed = time.time() - since
    # BUGFIX: the original appended the *same* `model` object for every
    # snapshot (each load_state_dict overwrote the previous one), so all
    # ensemble members ended up identical.  Deep-copy per snapshot instead.
    model_arr = []
    for weights in model_w_arr:
        snapshot = copy.deepcopy(model)
        snapshot.load_state_dict(weights)
        model_arr.append(snapshot)
    return model_arr, ensemble_loss, best_loss, prob
def main_worker(gpu, ngpus_per_node, argss): global args, best_acc1 args, best_acc1 = argss, 0 if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: args.rank = args.rank * ngpus_per_node + gpu dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) model = san(args.sa_type, args.layers, args.kernels, args.classes) criterion = nn.CrossEntropyLoss(ignore_index=args.ignore_label) optimizer = torch.optim.SGD(model.parameters(), lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay) print(args.base_lr, args.momentum, args.weight_decay) if args.scheduler == 'step': scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=args.step_epochs, gamma=0.1) elif args.scheduler == 'cosine': scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs) if main_process(): global logger, writer logger = get_logger() writer = SummaryWriter(args.save_path) logger.info(args) logger.info("=> creating model ...") logger.info("Classes: {}".format(args.classes)) logger.info(model) if args.distributed: torch.cuda.set_device(gpu) args.batch_size = int(args.batch_size / ngpus_per_node) args.batch_size_val = int(args.batch_size_val / ngpus_per_node) args.workers = int( (args.workers + ngpus_per_node - 1) / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel(model.cuda(), device_ids=[gpu]) else: model = torch.nn.DataParallel(model.cuda()) if args.weight: if os.path.isfile(args.weight): if main_process(): logger.info("=> loading weight '{}'".format(args.weight)) checkpoint = torch.load(args.weight) model.load_state_dict(checkpoint['state_dict']) if main_process(): logger.info("=> loaded weight '{}'".format(args.weight)) else: if main_process(): logger.info("=> no weight found at '{}'".format(args.weight)) if args.resume: if os.path.isfile(args.resume): if main_process(): logger.info("=> loading 
checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(gpu)) args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['top1_val'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) if main_process(): logger.info("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: if main_process(): logger.info("=> no checkpoint found at '{}'".format( args.resume)) mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] train_transform = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(mean, std) ]) train_set = torchvision.datasets.ImageFolder( os.path.join(args.data_root, 'train'), train_transform) val_transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean, std) ]) val_set = torchvision.datasets.ImageFolder( os.path.join(args.data_root, 'val'), val_transform) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_set) val_sampler = torch.utils.data.distributed.DistributedSampler(val_set) else: train_sampler = None val_sampler = None train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_loader = torch.utils.data.DataLoader(val_set, batch_size=args.batch_size_val, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler) loss_val, mIoU_val, mAcc_val, allAcc_val, top1_val, top5_val = validate( val_loader, model, criterion) exit(0) for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) loss_train, mIoU_train, mAcc_train, allAcc_train, top1_train, top5_train = train( train_loader, model, criterion, 
optimizer, epoch)
] lrs = [ 0.000001, 0.000001, 0.000001, 0.000001, 0.00001, 0.00001, 0.00001, 0.0001, 0.001, 0.01 ] criterion = nn.CrossEntropyLoss() optimizer = optim.SGD([{ 'params': p, 'lr': l } for p, l in zip(layers, lrs)], momentum=0.9) # Decay LR by a factor of 0.1 every 7 epochs scheduler = lr_scheduler.CosineAnnealingLR(optimizer, 5, eta_min=0, last_epoch=-1) def train_model(model, criterion, optimizer, scheduler, num_epochs=25): #since = time.time() loss_train = [] loss_test = [] acc_test = [] acc_train = [] best_model_wts = copy.deepcopy(model.state_dict()) best_acc = 0.0 for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10)
# NOTE(review): the two statements below look like the tail of a
# submission-writing helper defined above this chunk — confirm their
# indentation/ownership in the full file.
sample_sub.update(sub)
sample_sub.to_csv('submission.csv')


if __name__ == '__main__':
    # Script entry point: train an EfficientNet classifier end to end,
    # then run inference and write a submission file.
    global_start_time = time.time()
    train_loader, test_loader, label_encoder, num_classes = load_data(
        train, test, train_dir, test_dir)
    model = EfficientNetEncoderHead(depth=0, num_classes=num_classes)
    model.cuda()  # assumes a CUDA device is available
    criterion = nn.CrossEntropyLoss()
    # Project-local RAdam implementation; eps=1e-3 is the author's setting.
    optimizer = radam(model.parameters(),
                      lr=1e-3,
                      betas=(0.9, 0.999),
                      eps=1e-3,
                      weight_decay=1e-4)
    # T_max spans all training iterations, which suggests the scheduler is
    # stepped once per batch inside train_step — confirm against train_step.
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=len(train_loader) * NUM_EPOCHS,
                                               eta_min=1e-6)
    for epoch in range(1, NUM_EPOCHS + 1):
        print('-' * 50)
        train_step(train_loader, model, criterion, optimizer, epoch,
                   scheduler)
    print('inference mode')
    generate_submission(test_loader, model, label_encoder)
reduction='mean', beta=1.0) # L2 criterion_L2 = nn.MSELoss(reduction='mean') if True: # Hyperparameter(超參數) learning_rate = 1e-1 num_epochs = 200 weight_decay = 0 # 定義優化器 optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=weight_decay) # scheduler = lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.9) scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=200, eta_min=1e-2, last_epoch=-1) model = model.float() switch = True # 訓練模型 train_ls, val_ls, lr = [], [], [] for epoch in range(num_epochs): # 切割資料 train_x = train_x.clone().detach() val_x = val_x.clone().detach() test_x = test_x.clone().detach() train_y = train_y.clone().detach() val_y = val_y.clone().detach() # 切割訓練集 batch_size = 12967
def estimate(X_train, y_train):
    """Train a residual-attention CNN on COVID/NonCOVID CT images, then
    replace its final FC layer with an SVM fitted on extracted features.

    Args:
        X_train: array-like of images (grayscale or multi-channel arrays).
        y_train: array-like of string labels, 'COVID' or 'NonCOVID'
            (re-encoded in place to 1 / 0).

    Returns:
        The fine-tuned model with `fc` replaced by nn.Identity (a feature
        extractor); the fitted SVM is persisted to 'classifier_model.sav'.
    """
    i = 0
    ii = 0
    nrows = 256
    ncolumns = 256
    channels = 1
    ntrain = 0.8 * len(X_train)
    nval = 0.2 * len(X_train)
    batch_size = 16
    epochs = 2
    num_cpu = multiprocessing.cpu_count()
    num_classes = 2
    # Fix every RNG for reproducibility and force deterministic cuDNN.
    torch.manual_seed(8)
    torch.cuda.manual_seed(8)
    np.random.seed(8)
    random.seed(8)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    X = []
    # Flatten to a 1-D object array so each element is one image.
    X_train = np.reshape(np.array(X_train), [len(X_train), ])
    for img in list(range(0, len(X_train))):
        if X_train[img].ndim >= 3:
            # Already multi-channel: keep the first 3 channels, resize,
            # and move channels first (HWC -> CHW).
            X.append(np.moveaxis(cv2.resize(X_train[img][:, :, :3],
                                            (nrows, ncolumns),
                                            interpolation=cv2.INTER_CUBIC),
                                 -1, 0))
        else:
            # Grayscale: replicate to 3 channels before resizing.
            smimg = cv2.cvtColor(X_train[img], cv2.COLOR_GRAY2RGB)
            X.append(np.moveaxis(cv2.resize(smimg, (nrows, ncolumns),
                                            interpolation=cv2.INTER_CUBIC),
                                 -1, 0))
        # Encode string labels in place: COVID -> 1, NonCOVID -> 0.
        if y_train[img] == 'COVID':
            y_train[img] = 1
        elif y_train[img] == 'NonCOVID':
            y_train[img] = 0
        else:
            continue
    x = np.array(X)
    y_train = np.array(y_train)
    outputs_all = []
    labels_all = []
    X_train, X_val, y_train, y_val = train_test_split(x, y_train,
                                                      test_size=0.2,
                                                      random_state=2)
    image_transforms = {
        # Training: scale to [0,1], light augmentation, then normalize.
        'train': transforms.Compose([
            transforms.Lambda(lambda x: x / 255),
            transforms.ToPILImage(),
            transforms.Resize((230, 230)),
            transforms.RandomResizedCrop((224), scale=(0.75, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            #transforms.Affine(10,shear =(0.1,0.1)),
            # random brightness and random contrast
            #transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.ToTensor(),
            transforms.Normalize([0.45271412, 0.45271412, 0.45271412],
                                 [0.33165374, 0.33165374, 0.33165374])
        ]),
        # Validation: deterministic resize + center crop, same normalization.
        'valid': transforms.Compose([
            transforms.Lambda(lambda x: x / 255),
            transforms.ToPILImage(),
            transforms.Resize((230, 230)),
            transforms.CenterCrop(size=224),
            transforms.ToTensor(),
            transforms.Normalize([0.45271412, 0.45271412, 0.45271412],
                                 [0.33165374, 0.33165374, 0.33165374])
        ])
    }
    train_data = MyDataset(X_train, y_train, image_transforms['train'])
    valid_data = MyDataset(X_val, y_val, image_transforms['valid'])
    dataset_sizes = {
        'train': len(train_data),
        'valid': len(valid_data)
    }
    # NOTE(review): worker_init_fn=np.random.seed(7) executes seed() once at
    # construction time and passes None as the init fn — probably not the
    # intended per-worker seeding; confirm before relying on determinism.
    dataloaders = {
        'train': data.DataLoader(train_data, batch_size=batch_size,
                                 shuffle=True, num_workers=num_cpu,
                                 pin_memory=True,
                                 worker_init_fn=np.random.seed(7),
                                 drop_last=False),
        'valid': data.DataLoader(valid_data, batch_size=batch_size,
                                 shuffle=True, num_workers=num_cpu,
                                 pin_memory=True,
                                 worker_init_fn=np.random.seed(7),
                                 drop_last=False)
    }
    # Warm-start from a 10-class checkpoint, then swap in a binary head.
    model = ResidualAttentionModel(10)
    checkpoint0 = torch.load('model_resAttention.pth')
    model.load_state_dict(checkpoint0)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_classes)
    model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]).cuda()
    criterion = nn.CrossEntropyLoss()
    #optimizer = optim.SGD(model.parameters(), lr=0.06775, momentum=0.5518,weight_decay=0.000578)
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.05)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    #scheduler = lr_scheduler.StepLR(optimizer, step_size=35, gamma=0.1)
    best_acc = 0.0
    best_f1 = 0.0
    best_epoch = 0
    best_loss = 100000
    since = time.time()
    writer = SummaryWriter()
    model.train()
    for epoch in range(epochs):
        print('epoch', epoch)
        jj = 0
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Epoch-level accumulators for accuracy / weighted F1.
            predictions = FloatTensor()
            all_labels = FloatTensor()
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)
                predictions = predictions.to(device, non_blocking=True)
                all_labels = all_labels.to(device, non_blocking=True)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward; track history only if in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    predictions = torch.cat([predictions, preds.float()])
                    all_labels = torch.cat([all_labels, labels.float()])
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                if phase == 'train':
                    jj += 1
                    # Log a prediction-vs-actual figure for full batches.
                    if len(inputs) >= 16:
                        writer.add_figure(
                            'predictions vs. actuals epoch ' + str(epoch) + ' ' + str(jj),
                            plot_classes_preds(model, inputs, labels))
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()
            epoch_f1 = f1_score(all_labels.tolist(), predictions.tolist(),
                                average='weighted')
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = accuracy_score(all_labels.tolist(),
                                       predictions.tolist())
            if phase == 'train':
                writer.add_scalar('Train/Loss', epoch_loss, epoch)
                writer.add_scalar('Train/Accuracy', epoch_acc, epoch)
                writer.flush()
            elif phase == 'valid':
                writer.add_scalar('Valid/Loss', epoch_loss, epoch)
                writer.add_scalar('Valid/Accuracy', epoch_acc, epoch)
                writer.flush()
            # deep copy the model on validation-accuracy improvement
            if phase == 'valid' and epoch_acc > best_acc:
                best_f1 = epoch_f1
                best_acc = epoch_acc
                best_loss = epoch_loss
                best_epoch = epoch
                best_model_wts = copy.deepcopy(model.module.state_dict())
                best_model_wts_module = copy.deepcopy(model.state_dict())
    # NOTE(review): placement of the reload/save below (after the epoch
    # loop) is inferred from the collapsed source — confirm in the original.
    model.load_state_dict(best_model_wts_module)
    torch.save(model, "Model_res.pth")
    torch.save(best_model_wts, "Model_res_state.pth")
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best validation Acc: {:4f}'.format(best_acc))
    print('Best validation f1: {:4f}'.format(best_f1))
    print('best epoch: ', best_epoch)
    ## Replacing the last fully connected layer with SVM or ExtraTrees Classifiers
    model.module.fc = nn.Identity()
    # Freeze the backbone; it is now a fixed feature extractor.
    for param in model.parameters():
        param.requires_grad_(False)
    clf = svm.SVC(kernel='rbf', probability=True)
    all_best_accs = {}
    all_best_f1s = {}
    #clf = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=30, random_state=0)
    for phase in ['train', 'valid']:
        outputs_all = []
        labels_all = []
        model.eval()  # Set model to evaluate mode
        # Extract penultimate-layer features for every batch.
        for inputs, labels in dataloaders[phase]:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            outputs = model(inputs)
            outputs_all.append(outputs)
            labels_all.append(labels)
        outputs = torch.cat(outputs_all)
        labels = torch.cat(labels_all)
        # fit the classifier on training set and then predict on test
        if phase == 'train':
            clf.fit(outputs.cpu(), labels.cpu())
            filename = 'classifier_model.sav'
            joblib.dump(clf, filename)
            all_best_accs[phase] = accuracy_score(labels.cpu(),
                                                  clf.predict(outputs.cpu()))
            all_best_f1s[phase] = f1_score(labels.cpu(),
                                           clf.predict(outputs.cpu()))
            print(phase, ' ', accuracy_score(labels.cpu(),
                                             clf.predict(outputs.cpu())))
        if phase != 'train':
            predict = clf.predict(outputs.cpu())
            all_best_accs[phase] = accuracy_score(labels.cpu(),
                                                  clf.predict(outputs.cpu()))
            all_best_f1s[phase] = f1_score(labels.cpu(),
                                           clf.predict(outputs.cpu()))
            print(phase, ' ', accuracy_score(labels.cpu(),
                                             clf.predict(outputs.cpu())))
    print('Best Acc: ', all_best_accs)
    print('Best f1: ', all_best_f1s)
    return model
# ---- Optimizer selection, driven by the hyper-parameter namespace P ----
if P.optimizer == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=P.lr_init, momentum=0.9,
                          weight_decay=P.weight_decay)
    lr_decay_gamma = 0.1
elif P.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=P.lr_init,
                           betas=(.9, .999), weight_decay=P.weight_decay)
    lr_decay_gamma = 0.3
elif P.optimizer == 'lars':
    # LARS wraps a base SGD optimizer; imported lazily so the torchlars
    # dependency is only needed when this option is selected.
    from torchlars import LARS
    base_optimizer = optim.SGD(model.parameters(), lr=P.lr_init,
                               momentum=0.9, weight_decay=P.weight_decay)
    optimizer = LARS(base_optimizer, eps=1e-8, trust_coef=0.001)
    lr_decay_gamma = 0.1
else:
    raise NotImplementedError()

# ---- LR schedule: cosine annealing or two-milestone step decay ----
if P.lr_scheduler == 'cosine':
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, P.epochs)
elif P.lr_scheduler == 'step_decay':
    # Decay at 50% and 75% of training, by the optimizer-specific gamma.
    milestones = [int(0.5 * P.epochs), int(0.75 * P.epochs)]
    scheduler = lr_scheduler.MultiStepLR(optimizer, gamma=lr_decay_gamma,
                                         milestones=milestones)
else:
    raise NotImplementedError()

# Linear warmup (10x multiplier over P.warmup epochs) chained into the
# main scheduler above.
from training.scheduler import GradualWarmupScheduler
scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=10.0,
                                          total_epoch=P.warmup,
                                          after_scheduler=scheduler)

# Separate Adam optimizer for the linear evaluation head; under multi-GPU
# the head lives behind model.module.
if P.multi_gpu:
    linear = model.module.linear
else:
    linear = model.linear
linear_optim = torch.optim.Adam(linear.parameters(), lr=1e-3,
                                betas=(.9, .999),
                                weight_decay=P.weight_decay)
def train(cfg):
    """Train a ResNet-18 patch classifier on whole-slide-image patches
    with MONAI, using Ignite-based SupervisedTrainer/Evaluator engines.

    Args:
        cfg: dict-like configuration providing the keys read below
            (dataset paths, patch geometry, batch size, epochs, lr,
            amp/novograd/pretrain flags, logdir, ...).
    """
    log_dir = create_log_dir(cfg)
    device = set_device(cfg)
    # --------------------------------------------------------------------------
    # Data Loading and Preprocessing
    # --------------------------------------------------------------------------
    # __________________________________________________________________________
    # Build MONAI preprocessing
    train_preprocess = Compose([
        # ColorJitter is a torchvision transform, hence the tensor/numpy
        # round-trip around it.
        ToTensorD(keys="image"),
        TorchVisionD(keys="image",
                     name="ColorJitter",
                     brightness=64.0 / 255.0,
                     contrast=0.75,
                     saturation=0.25,
                     hue=0.04),
        ToNumpyD(keys="image"),
        RandFlipD(keys="image", prob=0.5),
        RandRotate90D(keys="image", prob=0.5),
        CastToTypeD(keys="image", dtype=np.float32),
        RandZoomD(keys="image", prob=0.5, min_zoom=0.9, max_zoom=1.1),
        # Map [0, 255] intensities to [-1, 1].
        ScaleIntensityRangeD(keys="image",
                             a_min=0.0,
                             a_max=255.0,
                             b_min=-1.0,
                             b_max=1.0),
        ToTensorD(keys=("image", "label")),
    ])
    valid_preprocess = Compose([
        CastToTypeD(keys="image", dtype=np.float32),
        ScaleIntensityRangeD(keys="image",
                             a_min=0.0,
                             a_max=255.0,
                             b_min=-1.0,
                             b_max=1.0),
        ToTensorD(keys=("image", "label")),
    ])
    # __________________________________________________________________________
    # Create MONAI dataset
    train_json_info_list = load_decathlon_datalist(
        data_list_file_path=cfg["dataset_json"],
        data_list_key="training",
        base_dir=cfg["data_root"],
    )
    valid_json_info_list = load_decathlon_datalist(
        data_list_file_path=cfg["dataset_json"],
        data_list_key="validation",
        base_dir=cfg["data_root"],
    )
    train_dataset = PatchWSIDataset(
        train_json_info_list,
        cfg["region_size"],
        cfg["grid_shape"],
        cfg["patch_size"],
        train_preprocess,
        image_reader_name="openslide" if cfg["use_openslide"] else "cuCIM",
    )
    valid_dataset = PatchWSIDataset(
        valid_json_info_list,
        cfg["region_size"],
        cfg["grid_shape"],
        cfg["patch_size"],
        valid_preprocess,
        image_reader_name="openslide" if cfg["use_openslide"] else "cuCIM",
    )
    # __________________________________________________________________________
    # DataLoaders
    train_dataloader = DataLoader(train_dataset,
                                  num_workers=cfg["num_workers"],
                                  batch_size=cfg["batch_size"],
                                  pin_memory=True)
    valid_dataloader = DataLoader(valid_dataset,
                                  num_workers=cfg["num_workers"],
                                  batch_size=cfg["batch_size"],
                                  pin_memory=True)
    # __________________________________________________________________________
    # Get sample batch and some info
    first_sample = first(train_dataloader)
    if first_sample is None:
        raise ValueError("Fist sample is None!")
    print("image: ")
    print(" shape", first_sample["image"].shape)
    print(" type: ", type(first_sample["image"]))
    print(" dtype: ", first_sample["image"].dtype)
    print("labels: ")
    print(" shape", first_sample["label"].shape)
    print(" type: ", type(first_sample["label"]))
    print(" dtype: ", first_sample["label"].dtype)
    print(f"batch size: {cfg['batch_size']}")
    print(f"train number of batches: {len(train_dataloader)}")
    print(f"valid number of batches: {len(valid_dataloader)}")
    # --------------------------------------------------------------------------
    # Deep Learning Classification Model
    # --------------------------------------------------------------------------
    # __________________________________________________________________________
    # initialize model
    model = TorchVisionFCModel("resnet18",
                               num_classes=1,
                               use_conv=True,
                               pretrained=cfg["pretrain"])
    model = model.to(device)
    # loss function (single-logit binary classification)
    loss_func = torch.nn.BCEWithLogitsLoss()
    loss_func = loss_func.to(device)
    # optimizer
    if cfg["novograd"]:
        optimizer = Novograd(model.parameters(), cfg["lr"])
    else:
        optimizer = SGD(model.parameters(), lr=cfg["lr"], momentum=0.9)
    # AMP is only kept enabled on PyTorch >= 1.6.
    if cfg["amp"]:
        cfg["amp"] = True if monai.utils.get_torch_version_tuple() >= (
            1, 6) else False
    else:
        cfg["amp"] = False
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=cfg["n_epochs"])
    # --------------------------------------------
    # Ignite Trainer/Evaluator
    # --------------------------------------------
    # Evaluator
    val_handlers = [
        CheckpointSaver(save_dir=log_dir,
                        save_dict={"net": model},
                        save_key_metric=True),
        StatsHandler(output_transform=lambda x: None),
        TensorBoardStatsHandler(log_dir=log_dir,
                                output_transform=lambda x: None),
    ]
    val_postprocessing = Compose([
        ActivationsD(keys="pred", sigmoid=True),
        AsDiscreteD(keys="pred", threshold=0.5)
    ])
    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=valid_dataloader,
        network=model,
        postprocessing=val_postprocessing,
        key_val_metric={
            "val_acc": Accuracy(output_transform=from_engine(["pred", "label"]))
        },
        val_handlers=val_handlers,
        amp=cfg["amp"],
    )
    # Trainer
    # NOTE(review): trainer checkpoints go to cfg["logdir"] while the
    # evaluator saves to log_dir from create_log_dir — confirm the two
    # destinations are intentionally different.
    train_handlers = [
        LrScheduleHandler(lr_scheduler=scheduler, print_lr=True),
        CheckpointSaver(save_dir=cfg["logdir"],
                        save_dict={
                            "net": model,
                            "opt": optimizer
                        },
                        save_interval=1,
                        epoch_level=True),
        StatsHandler(tag_name="train_loss",
                     output_transform=from_engine(["loss"], first=True)),
        ValidationHandler(validator=evaluator, interval=1, epoch_level=True),
        TensorBoardStatsHandler(log_dir=cfg["logdir"],
                                tag_name="train_loss",
                                output_transform=from_engine(["loss"],
                                                             first=True)),
    ]
    train_postprocessing = Compose([
        ActivationsD(keys="pred", sigmoid=True),
        AsDiscreteD(keys="pred", threshold=0.5)
    ])
    trainer = SupervisedTrainer(
        device=device,
        max_epochs=cfg["n_epochs"],
        train_data_loader=train_dataloader,
        network=model,
        optimizer=optimizer,
        loss_function=loss_func,
        postprocessing=train_postprocessing,
        key_train_metric={
            "train_acc": Accuracy(output_transform=from_engine(["pred", "label"]))
        },
        train_handlers=train_handlers,
        amp=cfg["amp"],
    )
    trainer.run()
# optimizer.load_state_dict(checkpoint['optimizer_state_dict']) # epoch = checkpoint['epoch'] # loss = checkpoint['loss'] device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model_ft = model_ft.to(device) #Loss Function criterion = nn.CrossEntropyLoss() # Observe that all parameters are being optimized optimizer_ft = optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9, weight_decay=0.0001) # optimizer_ft = optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9) schedule = scheduler.CosineAnnealingLR(optimizer_ft, num_epoch, eta_min=1e-7) # checkpoint = torch.load('./models/model32/model_14_epoch.pth') # model_ft.load_state_dict(checkpoint['model_state_dict']) # optimizer_ft.load_state_dict(checkpoint['optimizer_state_dict']) # schedule = None train_model("test-refine", model_ft, dataloaders, dataset_sizes, criterion, optimizer_ft, num_epochs=num_epoch, lr=lr,
def exp(subject_id):
    """Run a single-subject experiment: train a ShallowFBCSPNet with
    cropped decoding on session_T and evaluate on session_E each epoch.

    Relies on module-level globals: `splitted` (dataset dict), `n_chans`,
    `n_classes`, `input_window_samples`, `cuda`, `device`, `np`, and the
    `train_crop` / `eval_crop` helpers.

    Args:
        subject_id: id (or sequence of ids) of the test subject.

    Returns:
        pandas.DataFrame with one row per epoch:
        columns ['test_loss', 'test_accuracy'].
    """
    import torch
    test_subj = np.r_[subject_id]
    print('test subj:' + str(test_subj))
    # 20% validation split sizes (the random_split below is disabled).
    train_size = int(0.9 * len(splitted['session_T']))
    test_size = len(splitted['session_T']) - train_size
    # train_set, valid_set = torch.utils.data.random_split(splitted['session_T'], [train_size, test_size])
    train_set = splitted['session_T']
    test_set = splitted['session_E']
    # model = Deep4Net(
    #     n_chans,
    #     n_classes,
    #     input_window_samples=input_window_samples,
    #     final_conv_length="auto",
    # )
    from torch.utils.data import Dataset, ConcatDataset
    crop_size = 1000
    # embedding_net = Deep4Net_origin(n_classes, n_chans, crop_size)
    # model = FcClfNet(embedding_net)
    model = ShallowFBCSPNet(
        n_chans,
        n_classes,
        input_window_samples=input_window_samples,
        final_conv_length='auto',
    )
    from braindecode.models.util import to_dense_prediction_model, get_output_shape
    # Convert to a dense-prediction model for cropped decoding.
    to_dense_prediction_model(model)
    n_preds_per_input = get_output_shape(model, 22, input_window_samples)[2]
    print("n_preds_per_input : ", n_preds_per_input)
    print(model)
    batch_size = 8
    epochs = 200
    lr = 0.0625 * 0.01
    weight_decay = 0
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    # valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=batch_size,
                                              shuffle=False)
    # Send model to GPU
    if cuda:
        model.cuda()
    from torch.optim import lr_scheduler
    import torch.optim as optim
    import argparse
    # NOTE(review): argparse here parses sys.argv inside the function; the
    # lr/batch-size/epochs arguments are then shadowed by the locals above
    # and only args.gpuidx (set manually) is used — confirm intent.
    parser = argparse.ArgumentParser(description='cross subject domain adaptation')
    parser.add_argument('--batch-size', type=int, default=50, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=50, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()
    args.gpuidx = 0
    args.seed = 0
    args.use_tensorboard = False
    args.save_model = False
    optimizer = optim.AdamW(model.parameters(), lr=0.01,
                            weight_decay=0.5 * 0.001)
    # scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs - 1)
    import pandas as pd
    results_columns = ['test_loss', 'test_accuracy']
    df = pd.DataFrame(columns=results_columns)
    # NOTE(review): range(1, epochs) runs epochs-1 iterations, matching
    # T_max=epochs-1 above — confirm the off-by-one is intentional.
    for epochidx in range(1, epochs):
        print(epochidx)
        train_crop(10, model, device, train_loader, optimizer, scheduler,
                   cuda, args.gpuidx)
        test_loss, test_score = eval_crop(model, device, test_loader)
        results = {'test_loss': test_loss, 'test_accuracy': test_score}
        df = df.append(results, ignore_index=True)
        print(results)
    return df
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker: build model and data pipeline, then train/evaluate.

    Args:
        gpu: GPU index assigned to this process (None → CPU/DataParallel path).
        ngpus_per_node: GPUs on this node; used to derive the global rank and
            to split workers in distributed mode.
        args: parsed CLI namespace. Mutated in place: gpu, rank, num_classes,
            input_channels, batch_size, workers, start_epoch.

    Side effects: initializes the process group (distributed), writes log files
    under args.logdir/<arch_name>, logs to tensorboard, saves checkpoints.
    """
    cudnn.benchmark = args.cudnn_benchmark
    args.gpu = gpu

    # Dataset metadata: class count, split list files, filename template, etc.
    # (test_list_name and label_file are unpacked but not used below.)
    num_classes, train_list_name, val_list_name, test_list_name, filename_seperator, image_tmpl, filter_video, label_file = get_dataset_config(
        args.dataset)
    args.num_classes = num_classes

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # Input channels per frame: 3 for RGB, 2 x 5 stacked optical-flow fields.
    if args.modality == 'rgb':
        args.input_channels = 3
    elif args.modality == 'flow':
        args.input_channels = 2 * 5

    model, arch_name = build_model(args)
    mean = model.mean(args.modality)
    std = model.std(args.modality)

    # overwrite mean and std if they are presented in command
    if args.mean is not None:
        if args.modality == 'rgb':
            if len(args.mean) != 3:
                raise ValueError(
                    "When training with rgb, dim of mean must be three.")
        elif args.modality == 'flow':
            # NOTE(review): checks len == 1 but the message says "three";
            # message text left untouched here.
            if len(args.mean) != 1:
                raise ValueError(
                    "When training with flow, dim of mean must be three.")
        mean = args.mean

    if args.std is not None:
        if args.modality == 'rgb':
            if len(args.std) != 3:
                raise ValueError(
                    "When training with rgb, dim of std must be three.")
        elif args.modality == 'flow':
            if len(args.std) != 1:
                raise ValueError(
                    "When training with flow, dim of std must be three.")
        std = args.std

    model = model.cuda(args.gpu)
    model.eval()

    # --show-model: print the architecture on rank 0 and exit immediately.
    if args.show_model:
        if args.rank == 0:
            print(model)
        return 0

    if args.pretrained is not None:
        if args.rank == 0:
            print("=> using pre-trained model '{}'".format(arch_name))
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        # strict=False: tolerate head/shape mismatches in the pretrained dict.
        model.load_state_dict(checkpoint['state_dict'], strict=False)
        del checkpoint  # dereference seems crucial
        torch.cuda.empty_cache()
    else:
        if args.rank == 0:
            print("=> creating model '{}'".format(arch_name))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # the batch size should be divided by number of nodes as well
            args.batch_size = int(args.batch_size / args.world_size)
            args.workers = int(args.workers / ngpus_per_node)
            if args.sync_bn:
                process_group = torch.distributed.new_group(
                    list(range(args.world_size)))
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                    model, process_group)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        # assign rank to 0
        model = torch.nn.DataParallel(model).cuda()
        args.rank = 0

    # define loss function (criterion) and optimizer
    train_criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    val_criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # Data loading code
    val_list = os.path.join(args.datadir, val_list_name)
    val_augmentor = get_augmentor(
        False, args.input_size, scale_range=args.scale_range, mean=mean,
        std=std, disable_scaleup=args.disable_scaleup,
        threed_data=args.threed_data,
        is_flow=True if args.modality == 'flow' else False,
        version=args.augmentor_ver)
    val_dataset = VideoDataSet(args.datadir, val_list, args.groups,
                               args.frames_per_group,
                               num_clips=args.num_clips,
                               modality=args.modality,
                               image_tmpl=image_tmpl,
                               dense_sampling=args.dense_sampling,
                               transform=val_augmentor, is_train=False,
                               test_mode=False,
                               seperator=filename_seperator,
                               filter_video=filter_video)
    val_loader = build_dataflow(val_dataset, is_train=False,
                                batch_size=args.batch_size,
                                workers=args.workers,
                                is_distributed=args.distributed)

    log_folder = os.path.join(args.logdir, arch_name)
    if args.rank == 0:
        if not os.path.exists(log_folder):
            os.makedirs(log_folder)

    # Evaluation-only mode: one validation pass, log it, and return early.
    if args.evaluate:
        val_top1, val_top5, val_losses, val_speed = validate(
            val_loader, model, val_criterion, gpu_id=args.gpu)
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'evaluate_log.log'), 'a')
            print(
                'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
                .format(args.input_size, val_losses, val_top1, val_top5,
                        val_speed * 1000.0), flush=True)
            print(
                'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
                .format(args.input_size, val_losses, val_top1, val_top5,
                        val_speed * 1000.0), flush=True, file=logfile)
        return

    train_list = os.path.join(args.datadir, train_list_name)
    train_augmentor = get_augmentor(
        True, args.input_size, scale_range=args.scale_range, mean=mean,
        std=std, disable_scaleup=args.disable_scaleup,
        threed_data=args.threed_data,
        is_flow=True if args.modality == 'flow' else False,
        version=args.augmentor_ver)
    train_dataset = VideoDataSet(args.datadir, train_list, args.groups,
                                 args.frames_per_group,
                                 num_clips=args.num_clips,
                                 modality=args.modality,
                                 image_tmpl=image_tmpl,
                                 dense_sampling=args.dense_sampling,
                                 transform=train_augmentor, is_train=True,
                                 test_mode=False,
                                 seperator=filename_seperator,
                                 filter_video=filter_video)
    train_loader = build_dataflow(train_dataset, is_train=True,
                                  batch_size=args.batch_size,
                                  workers=args.workers,
                                  is_distributed=args.distributed)

    sgd_polices = model.parameters()
    optimizer = torch.optim.SGD(sgd_polices, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=args.nesterov)

    if args.lr_scheduler == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, args.lr_steps[0], gamma=0.1)
    elif args.lr_scheduler == 'multisteps':
        scheduler = lr_scheduler.MultiStepLR(optimizer, args.lr_steps, gamma=0.1)
    elif args.lr_scheduler == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0)
    elif args.lr_scheduler == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)

    best_top1 = 0.0
    # optionally resume from a checkpoint
    if args.resume:
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'log.log'), 'a')
        if os.path.isfile(args.resume):
            if args.rank == 0:
                print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume, map_location='cpu')
            else:
                checkpoint = torch.load(args.resume,
                                        map_location='cuda:{}'.format(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_top1 = checkpoint['best_top1']
            if args.gpu is not None:
                # best_top1 may have been saved as a CUDA tensor; move it to
                # this process's device before comparing.
                if not isinstance(best_top1, float):
                    best_top1 = best_top1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            try:
                scheduler.load_state_dict(checkpoint['scheduler'])
            except:
                # Older checkpoints may lack scheduler state; best-effort load.
                pass
            if args.rank == 0:
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            del checkpoint  # dereference seems crucial
            torch.cuda.empty_cache()
        else:
            raise ValueError("Checkpoint is not found: {}".format(args.resume))
    else:
        # Fresh run: rotate a stale log aside, then open a new one on rank 0.
        if os.path.exists(os.path.join(log_folder, 'log.log')) and args.rank == 0:
            shutil.copyfile(
                os.path.join(log_folder, 'log.log'),
                os.path.join(log_folder,
                             'log.log.{}'.format(int(time.time()))))
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'log.log'), 'w')

    if args.rank == 0:
        command = " ".join(sys.argv)
        tensorboard_logger.configure(os.path.join(log_folder))
        print(command, flush=True)
        print(args, flush=True)
        print(model, flush=True)
        print(command, file=logfile, flush=True)
        print(args, file=logfile, flush=True)

    if args.resume == '' and args.rank == 0:
        print(model, file=logfile, flush=True)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_top1, train_top5, train_losses, train_speed, speed_data_loader, train_steps = \
            train(train_loader, model, train_criterion, optimizer, epoch + 1,
                  display=args.print_freq, clip_gradient=args.clip_gradient,
                  gpu_id=args.gpu, rank=args.rank)
        if args.distributed:
            dist.barrier()

        # evaluate on validation set
        val_top1, val_top5, val_losses, val_speed = validate(
            val_loader, model, val_criterion, gpu_id=args.gpu)

        # update current learning rate
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_losses)
        else:
            scheduler.step()

        if args.distributed:
            dist.barrier()

        # only logging at rank 0
        if args.rank == 0:
            print(
                'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'.format(
                    epoch + 1, args.epochs, train_losses, train_top1,
                    train_top5, train_speed * 1000.0,
                    speed_data_loader * 1000.0), file=logfile, flush=True)
            print(
                'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'.format(
                    epoch + 1, args.epochs, train_losses, train_top1,
                    train_top5, train_speed * 1000.0,
                    speed_data_loader * 1000.0), flush=True)
            print(
                'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch'.format(epoch + 1, args.epochs,
                                                val_losses, val_top1,
                                                val_top5,
                                                val_speed * 1000.0),
                file=logfile, flush=True)
            print(
                'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\t'
                'Speed: {:.2f} ms/batch'.format(epoch + 1, args.epochs,
                                                val_losses, val_top1,
                                                val_top5,
                                                val_speed * 1000.0),
                flush=True)

            # remember best prec@1 and save checkpoint
            is_best = val_top1 > best_top1
            best_top1 = max(val_top1, best_top1)
            save_dict = {
                'epoch': epoch + 1,
                'arch': arch_name,
                'state_dict': model.state_dict(),
                'best_top1': best_top1,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }
            save_checkpoint(save_dict, is_best, filepath=log_folder)
            try:
                # get_lr get all lrs for every layer of current epoch, assume the lr for all layers are identical
                lr = scheduler.optimizer.param_groups[0]['lr']
            except Exception as e:
                lr = None
            if lr is not None:
                tensorboard_logger.log_value('learning-rate', lr, epoch + 1)
            tensorboard_logger.log_value('val-top1', val_top1, epoch + 1)
            tensorboard_logger.log_value('val-loss', val_losses, epoch + 1)
            tensorboard_logger.log_value('train-top1', train_top1, epoch + 1)
            tensorboard_logger.log_value('train-loss', train_losses, epoch + 1)
            tensorboard_logger.log_value('best-val-top1', best_top1, epoch + 1)

        if args.distributed:
            dist.barrier()

    if args.rank == 0:
        logfile.close()
print('Test Acc: {:.4f}'.format(epoch_acc)) #합성곱 신경망 미세조정 #model_ft = models.alexnet(pretrained=True) #num_ftrs = model_ft.classifier[6].in_features model_ft = models.vgg16(pretrained=False) num_ftrs = model_ft.classifier[6].in_features # features = list(model_ft.classifier.children())[:-1] # Remove last layer # features.extend([nn.Linear(num_ftrs, len(class_names))]) # Add our layer with 4 outputs # model_ft = nn.Sequential(*features) # Replace the model classifier model_ft.classifier[6] = nn.Linear(num_ftrs, 196) #Linear(입력, 출력) 폴더갯수 200개씩임! model_ft = model_ft.to(device) criterion = nn.CrossEntropyLoss() optimizer_ft = optim.Adam(model_ft.parameters(), lr=1e-4) exp_lr_scheduler = lr_scheduler.CosineAnnealingLR(optimizer_ft, 100) #100이 뭐지 #학습 및 평가하기 model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=100) #visualize_model(model_ft) #test_model(model_ft, criterion, optimizer_ft) sys.exit(0)
def get_scheduler(cfg, optimizer): scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.general.epochs, eta_min=cfg.optimizer.lr) return scheduler
def exp(subject_id):
    """Cross-subject domain-adaptation experiment on the BCIC dataset.

    Trains an EEGNet classifier plus an adversarial subject discriminator on
    all subjects except ``subject_id`` (1-based), carving 10% of each training
    subject's trials off as validation, and returns per-epoch metrics.

    Args:
        subject_id: 1-based index of the held-out test subject (1..9).

    Returns:
        pandas.DataFrame with columns
        ['val_loss', 'test_loss', 'val_accuracy', 'test_accuracy'],
        one row per epoch.

    NOTE(review): relies on module-level globals — ``pickle``, ``np``,
    ``torch``, ``n_classes``, ``cuda``, ``device``, ``EEGNet_v2``,
    ``FcClfNet_mult``, ``Discriminator``, ``train_mult`` and ``eval_mult``.
    """
    PATH = '../datasets/'
    with open(PATH + 'bcic_datasets_prep.pkl', 'rb') as f:
        data = pickle.load(f)

    test_subj = np.r_[subject_id - 1]
    print('test subj:' + str(test_subj))
    train_subj = np.setdiff1d(np.r_[0:9], test_subj)

    # Re-label training subjects with contiguous ids 0..len(train_subj)-1
    # (the discriminator classifies over training subjects only).
    new_subj_id = 0
    for ids in train_subj:
        data[ids].subj_id = new_subj_id
        new_subj_id += 1
        print(data[ids].subj_id)

    # Carve 10% of each training subject's trials off as validation.
    tr = []
    val = []
    for ids in train_subj:
        train_size = int(0.9 * len(data[ids]))
        # BUG FIX: train_indices / val_indices were undefined (the
        # random_split call that produced them had been commented out),
        # so this loop raised NameError. Use a deterministic head/tail
        # split with the same 90/10 sizes.
        train_indices = list(range(train_size))
        val_indices = list(range(train_size, len(data[ids])))
        tr_i = torch.utils.data.Subset(data[ids], indices=train_indices)
        val_i = torch.utils.data.Subset(data[ids], indices=val_indices)
        tr.append(tr_i)
        val.append(val_i)

    train_set = torch.utils.data.ConcatDataset(tr)
    valid_set = torch.utils.data.ConcatDataset(val)
    test_set = torch.utils.data.ConcatDataset([data[ids] for ids in test_subj])

    # 22 EEG channels, 1125 samples per trial.
    embedding_net = EEGNet_v2(n_classes, 22, 1125)
    model = FcClfNet_mult(embedding_net)
    # Gradient-reversal discriminator over the 8 training subjects.
    discriminator = Discriminator([model.embedding_net.num_hidden, 8],
                                  grl=True, reverse=True)
    print(model)

    batch_size = 64
    epochs = 100

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_set,
                                               batch_size=batch_size,
                                               shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    # Send model to GPU
    if cuda:
        model.cuda(device=device)
        discriminator.cuda(device=device)

    from torch.optim import lr_scheduler
    import torch.optim as optim
    import argparse

    # NOTE(review): parsing sys.argv inside a library function is fragile;
    # kept for backward compatibility. Most values are overridden below.
    parser = argparse.ArgumentParser(
        description='cross subject domain adaptation')
    parser.add_argument('--batch-size', type=int, default=50, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=50, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()
    args.gpuidx = 1
    args.seed = 0
    args.use_tensorboard = False
    args.save_model = False

    # Separate optimizers/schedulers for the classifier (C) and the
    # adversarial discriminator (D).
    optimizer_C = optim.AdamW(model.parameters(), lr=0.01,
                              weight_decay=0.5 * 0.001)
    optimizer_D = optim.AdamW(discriminator.parameters(), lr=0.01,
                              weight_decay=0.5 * 0.001)
    scheduler_C = lr_scheduler.CosineAnnealingLR(optimizer_C, T_max=100)
    scheduler_D = lr_scheduler.CosineAnnealingLR(optimizer_D, T_max=100)
    scheduler = [scheduler_C, scheduler_D]

    import pandas as pd
    results_columns = [
        'val_loss', 'test_loss', 'val_accuracy', 'test_accuracy'
    ]
    df = pd.DataFrame(columns=results_columns)

    for epochidx in range(1, epochs):
        print(epochidx)
        train_mult(10, model, discriminator, device, train_loader,
                   optimizer_C, optimizer_D, scheduler, cuda, args.gpuidx,
                   epoch=epochidx)
        val_loss, val_score = eval_mult(model, device, valid_loader)
        test_loss, test_score = eval_mult(model, device, test_loader)
        results = {
            'val_loss': val_loss,
            'test_loss': test_loss,
            'val_accuracy': val_score,
            'test_accuracy': test_score
        }
        # BUG FIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
        df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)
        print(results)

    return df
embedding_net = EmbeddingNet() model = ClassificationNet(embedding_net, nclasses) if cuda: model = nn.DataParallel(model).cuda() loss_fn = loss_fn.cuda() optimizer = optim.SGD([{ 'params': model.parameters() }, { 'params': loss_fn.parameters() }], lr=lr, nesterov=True, momentum=0.9, weight_decay=1e-4) scheduler = lr_scheduler.CosineAnnealingLR(optimizer, 20, eta_min=1e-5, last_epoch=-1) fit(dataset_tr, model, loss_fn, optimizer, scheduler, niterations, cuda, log_interval, metrics=[AccumulatedAccuracyMetric()], mining_tech='Doppleganger')
def train_model_snapshot(model, criterion, eval_criterion, lr, dataloaders,
                         dataset_sizes, device, num_cycles,
                         num_epochs_per_cycle):
    """Snapshot-ensemble training with cosine-annealed learning-rate cycles.

    Runs ``num_cycles`` training cycles; each cycle re-creates the optimizer
    and anneals the LR to ~0 over the whole cycle, and the weights at the end
    of each cycle are kept as one ensemble member. Finally the snapshots are
    averaged (in prediction space) over the validation set.

    Args:
        model: network with ``model_ft`` (backbone) and ``fc`` (head) submodules.
        criterion: training loss (drives backprop).
        eval_criterion: evaluation loss; RMSE of it is what gets reported.
        lr: base learning rate for the head; backbone uses lr * 3 / time_steps.
        dataloaders: dict with 'train' and 'val' DataLoaders yielding
            (inputs, labels) batches.
        dataset_sizes: dict with 'train' / 'val' sample counts.
        device: torch device for inputs/labels.
        num_cycles: number of snapshot cycles (= ensemble size).
        num_epochs_per_cycle: epochs per cycle.

    Returns:
        (model_w_arr, ensemble_loss, best_loss): list of snapshot state_dicts,
        RMSE of the averaged-snapshot predictions on 'val', and the best
        single-epoch validation RMSE seen.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0  # NOTE(review): never updated or returned below.
    best_loss = 1000000.0
    model_w_arr = []
    for cycle in range(num_cycles):
        # initialize optimizer and scheduler each cycle
        #optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        # Backbone gets a scaled-down LR; `time_steps` is a module-level
        # global — presumably the sequence length; confirm at definition site.
        optimizer = optim.Adam(
            [
                {
                    "params": model.model_ft.parameters(),
                    "lr": lr * 3 / time_steps
                },
                #{"params": model.fc1.parameters(), "lr": lr},
                {
                    "params": model.fc.parameters(),
                    "lr": lr
                }
            ],
            lr=lr)
        # T_max spans the whole cycle in *batches* because scheduler.step()
        # is called per training batch below.
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer, num_epochs_per_cycle * len(dataloaders['train']))
        for epoch in range(num_epochs_per_cycle):
            print('Cycle {}: Epoch {}/{}'.format(cycle, epoch,
                                                 num_epochs_per_cycle - 1))
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()  # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        loss = criterion(outputs, labels.reshape(-1, 1))
                        eval_loss = eval_criterion(outputs,
                                                   labels.reshape(-1, 1))

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                            scheduler.step()

                    # statistics (accumulated on the eval loss, not the
                    # training loss)
                    running_loss += eval_loss.item() * inputs.size(0)

                # RMSE over the epoch.
                epoch_loss = np.sqrt(running_loss / dataset_sizes[phase])

                print('{} Loss: {:.4f}'.format(phase, epoch_loss))

                # deep copy the model
                if phase == 'val' and epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())

            print()

        # deep copy snapshot
        model_w_arr.append(copy.deepcopy(model.state_dict()))

    ensemble_loss = 0.0
    #predict on validation using snapshots
    for inputs, labels in dataloaders['val']:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # forward
        # track history if only in train
        # NOTE(review): every snapshot is re-loaded per batch — correct but
        # O(num_cycles) state_dict loads per batch.
        pred = torch.zeros((inputs.shape[0], 1),
                           dtype=torch.float32).to(device)
        for weights in model_w_arr:
            model.load_state_dict(weights)
            model.eval()
            outputs = model(inputs)
            pred += outputs
        pred /= num_cycles
        eval_loss = eval_criterion(pred, labels.reshape(-1, 1))
        ensemble_loss += eval_loss.item() * inputs.size(0)

    ensemble_loss /= dataset_sizes['val']
    ensemble_loss = np.sqrt(ensemble_loss)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Ensemble Loss : {:4f}, Best val Loss: {:4f}'.format(
        ensemble_loss, best_loss))

    return model_w_arr, ensemble_loss, best_loss
from __future__ import print_function
def main_worker(gpu, ngpus_per_node, config):
    """Per-process worker for training the SAN classifier.

    Args:
        gpu: GPU index assigned to this process.
        ngpus_per_node: GPU count on this node (used to derive the global
            rank and to split batch sizes / workers in distributed mode).
        config: run configuration; stored in the module-level CONFIG.

    Side effects: may initialize the process group, writes tensorboard logs
    and checkpoints under CONFIG.save_path on the main process.
    """
    # BUG FIX: `global logger` / `global writer` were re-declared several
    # times further down, after the names had already been used — that is a
    # SyntaxError in Python ("name used prior to global declaration").
    # Declare all module-level names once, up front.
    global CONFIG, best_acc1, logger, writer
    CONFIG, best_acc1 = config, 0

    train_set = config.dataset_type(CONFIG.dataset_path, SplitEnum.training)
    val_set = config.dataset_type(CONFIG.dataset_path, SplitEnum.validation)

    if CONFIG.distributed:
        if CONFIG.dist_url == "env://" and CONFIG.rank == -1:
            CONFIG.rank = int(os.environ["RANK"])
        if CONFIG.multiprocessing_distributed:
            # Global rank = node rank * gpus-per-node + local gpu index.
            CONFIG.rank = CONFIG.rank * ngpus_per_node + gpu
        distributed.init_process_group(
            backend=CONFIG.dist_backend,
            init_method=CONFIG.dist_url,
            world_size=CONFIG.world_size,
            rank=CONFIG.rank,
        )

    model = make_san(
        self_attention_type=SelfAttentionTypeEnum(CONFIG.self_attention_type),
        layers=CONFIG.layers,
        kernels=CONFIG.kernels,
        num_classes=train_set.response_shape[0],
    )
    criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.ignore_label)
    optimiser = torch.optim.SGD(
        model.parameters(),
        lr=CONFIG.base_lr,
        momentum=CONFIG.momentum,
        weight_decay=CONFIG.weight_decay,
    )

    # NOTE(review): `scheduler` stays undefined for any other CONFIG.scheduler
    # value and scheduler.step() below would then raise NameError — kept as-is
    # since valid configs only use these two values.
    if CONFIG.scheduler == "step":
        scheduler = lr_scheduler.MultiStepLR(
            optimiser, milestones=CONFIG.step_epochs, gamma=0.1
        )
    elif CONFIG.scheduler == "cosine":
        scheduler = lr_scheduler.CosineAnnealingLR(optimiser, T_max=CONFIG.epochs)

    if is_main_process():
        logger = get_logger()
        writer = TensorBoardPytorchWriter(str(CONFIG.save_path))
        logger.info(CONFIG)
        logger.info("=> creating model ...")
        logger.info(f"Classes: {train_set.response_shape[0]}")
        logger.info(model)

    if CONFIG.distributed:
        torch.cuda.set_device(gpu)
        # Split per-node budgets across the processes on this node.
        CONFIG.batch_size = int(CONFIG.batch_size / ngpus_per_node)
        CONFIG.batch_size_val = int(CONFIG.batch_size_val / ngpus_per_node)
        CONFIG.workers = int((CONFIG.workers + ngpus_per_node - 1) / ngpus_per_node)
        model = torch.nn.parallel.DistributedDataParallel(
            model.cuda(), device_ids=[gpu]
        )
    else:
        model = torch.nn.DataParallel(model.cuda())

    # Optional warm-start from weights only (no optimiser/scheduler state).
    if CONFIG.weight:
        if Path(CONFIG.weight).is_file():
            if is_main_process():
                logger.info(f"=> loading weight '{CONFIG.weight}'")
            checkpoint = torch.load(CONFIG.weight)
            model.load_state_dict(checkpoint["state_dict"])
            if is_main_process():
                logger.info(f"=> loaded weight '{CONFIG.weight}'")
        else:
            if is_main_process():
                logger.info(f"=> no weight found at '{CONFIG.weight}'")

    # Optional full resume (weights + optimiser + scheduler + epoch counter).
    if CONFIG.resume:
        if Path(CONFIG.resume).is_file():
            if is_main_process():
                logger.info(f"=> loading checkpoint '{CONFIG.resume}'")
            checkpoint = torch.load(
                CONFIG.resume, map_location=lambda storage, loc: storage.cuda(gpu)
            )
            CONFIG.start_epoch = checkpoint["epoch"]
            best_acc1 = checkpoint["top1_val"]
            model.load_state_dict(checkpoint["state_dict"])
            optimiser.load_state_dict(checkpoint["optimiser"])
            scheduler.load_state_dict(checkpoint["scheduler"])
            if is_main_process():
                logger.info(
                    f"=> loaded checkpoint '{CONFIG.resume}' (epoch {checkpoint['epoch']})"
                )
        else:
            if is_main_process():
                logger.info(f"=> no checkpoint found at '{CONFIG.resume}'")

    if CONFIG.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_set)
    else:
        train_sampler = None
        val_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=CONFIG.batch_size,
        shuffle=(train_sampler is None),
        num_workers=CONFIG.workers,
        pin_memory=True,
        sampler=train_sampler,
    )
    val_loader = torch.utils.data.DataLoader(
        val_set,
        batch_size=CONFIG.batch_size_val,
        shuffle=False,
        num_workers=CONFIG.workers,
        pin_memory=True,
        sampler=val_sampler,
    )

    for epoch in range(CONFIG.start_epoch, CONFIG.epochs):
        if CONFIG.distributed:
            # Reshuffle shards differently every epoch.
            train_sampler.set_epoch(epoch)
        (
            loss_train,
            mIoU_train,
            mAcc_train,
            allAcc_train,
            top1_train,
            top5_train,
        ) = train(train_loader, model, criterion, optimiser, epoch)
        loss_val, mIoU_val, mAcc_val, allAcc_val, top1_val, top5_val = validate(
            val_loader, model, criterion
        )
        scheduler.step()
        epoch_log = epoch + 1
        if is_main_process():
            writer.scalar("loss_train", loss_train, epoch_log)
            writer.scalar("mIoU_train", mIoU_train, epoch_log)
            writer.scalar("mAcc_train", mAcc_train, epoch_log)
            writer.scalar("allAcc_train", allAcc_train, epoch_log)
            writer.scalar("top1_train", top1_train, epoch_log)
            writer.scalar("top5_train", top5_train, epoch_log)
            writer.scalar("loss_val", loss_val, epoch_log)
            writer.scalar("mIoU_val", mIoU_val, epoch_log)
            writer.scalar("mAcc_val", mAcc_val, epoch_log)
            writer.scalar("allAcc_val", allAcc_val, epoch_log)
            writer.scalar("top1_val", top1_val, epoch_log)
            writer.scalar("top5_val", top5_val, epoch_log)

        if (epoch_log % CONFIG.save_freq == 0) and is_main_process():
            # BUG FIX: was `CONFIG.save_path / "train_epoch_" + str(...)` —
            # Path + str raises TypeError. Build the name in one join.
            filename = CONFIG.save_path / f"train_epoch_{epoch_log}.pth"
            logger.info(f"Saving checkpoint to: {filename}")
            torch.save(
                {
                    "epoch": epoch_log,
                    "state_dict": model.state_dict(),
                    # BUG FIX: was torch.optim.Optimizer.state_dict() (an
                    # unbound call on the class); save this run's optimiser.
                    "optimiser": optimiser.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "top1_val": top1_val,
                    "top5_val": top5_val,
                },
                filename,
            )
            if top1_val > best_acc1:
                best_acc1 = top1_val
                shutil.copyfile(filename, CONFIG.save_path / "model_best.pth")
            # Keep only the two most recent periodic checkpoints.
            if epoch_log / CONFIG.save_freq > 2:
                deletename = (
                    CONFIG.save_path
                    / f"train_epoch_{str(epoch_log - CONFIG.save_freq * 2)}.pth"
                )
                os.remove(deletename)
[transforms.ToTensor(), normalize])) train_dataset.classes = cifar_load_meta(dataset_root, base_folder, 'cifar10') train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True) validate_loader = torch.utils.data.DataLoader(dataset=validate_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True) # schedulers config scheduler = lr_scheduler.CosineAnnealingLR(optimizer, 160) # criterisons config criterion = nn.CrossEntropyLoss() # plugins config plugins = [] plugins.append(LossMonitor()) plugins.append(TopKAccuracy(topk=(1, 5))) plugins.append(IterationSummaryMonitor()) plugins.append(DistributionOfBNMonitor()) plugins.append(ClassAccuracy()) def dataforward(self, data, target):
if torch.cuda.is_available(): model = model.cuda() if torch.cuda.device_count() > 1: model = nn.DataParallel(model) model.to(device) trainloader, testloader = get_cifar_loaders(args.data_loc) optimizer = optim.SGD( [w for name, w in model.named_parameters() if not "mask" in name], lr=args.lr, momentum=0.9, weight_decay=args.weight_decay, ) scheduler = lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1e-10) criterion = nn.CrossEntropyLoss() error_history = [] for epoch in tqdm(range(args.epochs)): train(model, trainloader, criterion, optimizer) validate( model, epoch, testloader, criterion, checkpoint=args.checkpoint if epoch != 2 else args.checkpoint + "_init", ) scheduler.step()
def main():
    """CLI entry point: train diabetic-retinopathy grading models with K-fold CV.

    Builds datasets from the APTOS 2019 and/or the resized diabetic-retinopathy
    competition data (optionally with pseudo-labels), then trains one model per
    fold, logging per-epoch metrics and keeping the best-val-loss weights.
    All configuration comes from parse_args(); artifacts land under
    models/<args.name>/.
    """
    args = parse_args()

    if args.name is None:
        args.name = '%s_%s' % (args.arch, datetime.now().strftime('%m%d%H'))

    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('- %s: %s' % (arg, getattr(args, arg)))
    print('------------')

    # Persist the run configuration next to the model weights.
    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('- %s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    if args.loss == 'CrossEntropyLoss':
        criterion = nn.CrossEntropyLoss().cuda()
    elif args.loss == 'FocalLoss':
        criterion = FocalLoss().cuda()
    elif args.loss == 'MSELoss':
        criterion = nn.MSELoss().cuda()
    elif args.loss == 'multitask':
        criterion = {
            'classification': nn.CrossEntropyLoss().cuda(),
            'regression': nn.MSELoss().cuda(),
        }
    else:
        raise NotImplementedError

    # Output head width: 5 grades (classification), 1 scalar (regression),
    # or 5+1 for the multitask head.
    if args.pred_type == 'classification':
        num_outputs = 5
    elif args.pred_type == 'regression':
        num_outputs = 1
    elif args.loss == 'multitask':
        num_outputs = 6
    else:
        raise NotImplementedError

    cudnn.benchmark = True

    # NOTE(review): this model instance is shadowed by the per-fold
    # get_model() call inside the loop below; presumably kept for warm-up.
    model = get_model(model_name=args.arch,
                      num_outputs=num_outputs,
                      freeze_bn=args.freeze_bn,
                      dropout_p=args.dropout_p)

    train_transform = []  # NOTE(review): dead assignment, overwritten below.
    train_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.img_size)),
        transforms.RandomAffine(
            degrees=(args.rotate_min, args.rotate_max) if args.rotate else 0,
            translate=(args.translate_min, args.translate_max) if args.translate else None,
            scale=(args.rescale_min, args.rescale_max) if args.rescale else None,
            shear=(args.shear_min, args.shear_max) if args.shear else None,
        ),
        transforms.CenterCrop(args.input_size),
        transforms.RandomHorizontalFlip(p=0.5 if args.flip else 0),
        transforms.RandomVerticalFlip(p=0.5 if args.flip else 0),
        transforms.ColorJitter(
            brightness=0,
            contrast=args.contrast,
            saturation=0,
            hue=0),
        RandomErase(
            prob=args.random_erase_prob if args.random_erase else 0,
            sl=args.random_erase_sl,
            sh=args.random_erase_sh,
            r=args.random_erase_r),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    # NOTE(review): Resize((img_size, input_size)) mixes the two size args —
    # looks asymmetric vs. the train transform; confirm intended.
    val_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.input_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    # data loading code
    if 'diabetic_retinopathy' in args.train_dataset:
        diabetic_retinopathy_dir = preprocess(
            'diabetic_retinopathy',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        diabetic_retinopathy_df = pd.read_csv('inputs/diabetic-retinopathy-resized/trainLabels.csv')
        diabetic_retinopathy_img_paths = \
            diabetic_retinopathy_dir + '/' + diabetic_retinopathy_df['image'].values + '.jpeg'
        diabetic_retinopathy_labels = diabetic_retinopathy_df['level'].values

    if 'aptos2019' in args.train_dataset:
        aptos2019_dir = preprocess(
            'aptos2019',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        aptos2019_df = pd.read_csv('inputs/train.csv')
        aptos2019_img_paths = aptos2019_dir + '/' + aptos2019_df['id_code'].values + '.png'
        aptos2019_labels = aptos2019_df['diagnosis'].values

    # Build (train, val) path/label pairs per fold, depending on dataset mix.
    if args.train_dataset == 'aptos2019':
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((aptos2019_img_paths[train_idx], aptos2019_img_paths[val_idx]))
            labels.append((aptos2019_labels[train_idx], aptos2019_labels[val_idx]))
    elif args.train_dataset == 'diabetic_retinopathy':
        # Train on the old competition data, validate on all of APTOS 2019.
        img_paths = [(diabetic_retinopathy_img_paths, aptos2019_img_paths)]
        labels = [(diabetic_retinopathy_labels, aptos2019_labels)]
    elif 'diabetic_retinopathy' in args.train_dataset and 'aptos2019' in args.train_dataset:
        # Combined mode: APTOS folds for validation, old data always in train.
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((np.hstack((aptos2019_img_paths[train_idx], diabetic_retinopathy_img_paths)),
                              aptos2019_img_paths[val_idx]))
            labels.append((np.hstack((aptos2019_labels[train_idx], diabetic_retinopathy_labels)),
                           aptos2019_labels[val_idx]))
    # else:
    #     raise NotImplementedError

    # Optionally augment every fold's train split with pseudo-labeled test data.
    if args.pseudo_labels:
        test_df = pd.read_csv('probs/%s.csv' % args.pseudo_labels)
        test_dir = preprocess(
            'test',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        test_img_paths = test_dir + '/' + test_df['id_code'].values + '.png'
        test_labels = test_df['diagnosis'].values
        for fold in range(len(img_paths)):
            img_paths[fold] = (np.hstack((img_paths[fold][0], test_img_paths)), img_paths[fold][1])
            labels[fold] = (np.hstack((labels[fold][0], test_labels)), labels[fold][1])

    # NOTE(review): messidor is only preprocessed here; test_dir is not used
    # afterwards in this function.
    if 'messidor' in args.train_dataset:
        test_dir = preprocess(
            'messidor',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)

    folds = []
    best_losses = []
    best_scores = []

    for fold, ((train_img_paths, val_img_paths), (train_labels, val_labels)) in enumerate(zip(img_paths, labels)):
        print('Fold [%d/%d]' % (fold+1, len(img_paths)))

        # Skip folds already trained; recover their best metrics from the log.
        if os.path.exists('models/%s/model_%d.pth' % (args.name, fold+1)):
            log = pd.read_csv('models/%s/log_%d.csv' % (args.name, fold+1))
            best_loss, best_score = log.loc[log['val_loss'].values.argmin(), ['val_loss', 'val_score']].values
            folds.append(str(fold + 1))
            best_losses.append(best_loss)
            best_scores.append(best_score)
            continue

        # Drop images whose MD5 appears more than once (duplicates that leak
        # across train/val splits).
        if args.remove_duplicate:
            md5_df = pd.read_csv('inputs/strMd5.csv')
            duplicate_img_paths = aptos2019_dir + '/' + md5_df[(md5_df.strMd5_count > 1) & (~md5_df.diagnosis.isnull())]['id_code'].values + '.png'
            print(duplicate_img_paths)
            for duplicate_img_path in duplicate_img_paths:
                train_labels = train_labels[train_img_paths != duplicate_img_path]
                train_img_paths = train_img_paths[train_img_paths != duplicate_img_path]
                val_labels = val_labels[val_img_paths != duplicate_img_path]
                val_img_paths = val_img_paths[val_img_paths != duplicate_img_path]

        # train
        train_set = Dataset(
            train_img_paths,
            train_labels,
            transform=train_transform)

        _, class_sample_counts = np.unique(train_labels, return_counts=True)
        # print(class_sample_counts)

        # weights = 1. / torch.tensor(class_sample_counts, dtype=torch.float)
        # weights = np.array([0.2, 0.1, 0.6, 0.1, 0.1])
        # samples_weights = weights[train_labels]
        # sampler = WeightedRandomSampler(
        #     weights=samples_weights,
        #     num_samples=11000,
        #     replacement=False)

        # NOTE(review): `sampler` is only defined in the commented block
        # above — running with --class_aware would raise NameError here.
        train_loader = torch.utils.data.DataLoader(
            train_set,
            batch_size=args.batch_size,
            shuffle=False if args.class_aware else True,
            num_workers=4,
            sampler=sampler if args.class_aware else None)

        val_set = Dataset(
            val_img_paths,
            val_labels,
            transform=val_transform)
        val_loader = torch.utils.data.DataLoader(
            val_set,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=4)

        # create model
        model = get_model(model_name=args.arch,
                          num_outputs=num_outputs,
                          freeze_bn=args.freeze_bn,
                          dropout_p=args.dropout_p)
        model = model.cuda()
        if args.pretrained_model is not None:
            model.load_state_dict(torch.load('models/%s/model_%d.pth' % (args.pretrained_model, fold+1)))

        # print(model)

        # Only parameters with requires_grad are optimized (frozen layers are
        # excluded by the filter).
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'AdamW':
            optimizer = optim.AdamW(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'RAdam':
            optimizer = RAdam(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'SGD':
            optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay,
                                  nesterov=args.nesterov)

        if args.scheduler == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=args.epochs, eta_min=args.min_lr)
        elif args.scheduler == 'ReduceLROnPlateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=args.factor,
                                                       patience=args.patience,
                                                       verbose=1,
                                                       min_lr=args.min_lr)

        # NOTE(review): the DataFrame form of `log` is immediately replaced
        # by the dict-of-lists form used for per-epoch CSV dumps.
        log = pd.DataFrame(index=[], columns=[
            'epoch', 'loss', 'score', 'val_loss', 'val_score'
        ])
        log = {
            'epoch': [],
            'loss': [],
            'score': [],
            'val_loss': [],
            'val_score': [],
        }

        best_loss = float('inf')
        best_score = 0
        for epoch in range(args.epochs):
            print('Epoch [%d/%d]' % (epoch + 1, args.epochs))

            # train for one epoch
            train_loss, train_score = train(
                args, train_loader, model, criterion, optimizer, epoch)
            # evaluate on validation set
            val_loss, val_score = validate(args, val_loader, model, criterion)

            if args.scheduler == 'CosineAnnealingLR':
                scheduler.step()
            elif args.scheduler == 'ReduceLROnPlateau':
                scheduler.step(val_loss)

            print('loss %.4f - score %.4f - val_loss %.4f - val_score %.4f'
                  % (train_loss, train_score, val_loss, val_score))

            log['epoch'].append(epoch)
            log['loss'].append(train_loss)
            log['score'].append(train_score)
            log['val_loss'].append(val_loss)
            log['val_score'].append(val_score)

            pd.DataFrame(log).to_csv('models/%s/log_%d.csv' % (args.name, fold+1), index=False)

            # Keep weights with the best (lowest) validation loss.
            if val_loss < best_loss:
                torch.save(model.state_dict(), 'models/%s/model_%d.pth' % (args.name, fold+1))
                best_loss = val_loss
                best_score = val_score
                print("=> saved best model")

        print('val_loss: %f' % best_loss)
        print('val_score: %f' % best_score)

        folds.append(str(fold + 1))
        best_losses.append(best_loss)
        best_scores.append(best_score)

        # Running summary across completed folds plus their mean.
        results = pd.DataFrame({
            'fold': folds + ['mean'],
            'best_loss': best_losses + [np.mean(best_losses)],
            'best_score': best_scores + [np.mean(best_scores)],
        })
        print(results)
        results.to_csv('models/%s/results.csv' % args.name, index=False)

        torch.cuda.empty_cache()

        # Without --cv, only train the first fold.
        if not args.cv:
            break
def main():
    """Entry point for segmentation training.

    Reads a JSON config (paths + hyper-parameters), builds the model,
    loss, optimizer/scheduler and data loaders, then runs the
    train/validate/test loop with CSV + TensorBoard logging,
    best-IoU checkpointing and early stopping.
    """
    args = vars(parse_args_func())
    config_file = args['config']  # e.g. "../configs/config_v1.json"
    # BUG FIX: open() without close leaked the file handle; use a context manager.
    with open(config_file, 'rt') as cf:
        config_dict = json.loads(cf.read())
    file_dict = config_dict['file_path']
    config = config_dict['opt_config']

    input_folder = file_dict['input_path']            # '../inputs'
    checkpoint_folder = file_dict['checkpoint_path']  # '../checkpoint'
    model_folder = file_dict['model_path']            # '../models'

    # The JSON stores these flags as strings ("True"/"False"/"None");
    # normalize them to real Python values.
    config['deep_supervision'] = 'False' not in config['deep_supervision']
    config['nesterov'] = 'False' not in config['nesterov']
    if 'None' in config['name']:
        config['name'] = None
    if config['name'] is None:
        config['name'] = '%s_%s_segmodel' % (config['dataset'], config['arch'])

    os.makedirs(os.path.join(model_folder, '%s' % config['name']), exist_ok=True)
    if not os.path.isdir(checkpoint_folder):
        os.mkdir(checkpoint_folder)
    log_dir = os.path.join(checkpoint_folder, config['name'])
    writer = SummaryWriter(logdir=log_dir)

    print('-' * 20)
    for key in config:
        print('%s: %s' % (key, config[key]))
    print('-' * 20)

    # Persist the resolved config next to the model artifacts.
    with open(os.path.join(model_folder, '%s/config.yml' % config['name']), 'w') as f:
        yaml.dump(config, f)

    # Define loss function (criterion).
    if config['loss'] == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[config['loss']]().cuda()

    cudnn.benchmark = True

    # Create model.
    print("=> creating model %s" % config['arch'])
    model = archs.__dict__[config['arch']](config['num_classes'],
                                           config['input_channels'],
                                           config['deep_supervision'])

    config['resume'] = 'False' not in config['resume']
    # BUG FIX: resume_flag was hard-coded to False, silently ignoring the
    # parsed 'resume' config option; honor the flag instead.
    resume_flag = config['resume']
    if resume_flag:
        save_path = os.path.join(model_folder, config['name'], 'model.pth')
        weights = torch.load(save_path)
        model.load_state_dict(weights)
        # Reload the config saved alongside the checkpoint.
        with open(os.path.join(model_folder, '%s/config.yml' % config['name']), 'r') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
    # NOTE(review): training always restarts at epoch 0, even when resuming
    # weights — original behavior kept; confirm this is intended.
    start_epoch = 0

    model = model.cuda()

    # EfficientNet encoders get their backbone fine-tuned with a separate
    # Adam optimizer (cnn_optimizer) at a fixed lr.
    eff_flag = 'effnet' in config['arch']
    if eff_flag:
        cnn_subs = list(model.encoder.eff_conv.children())[1:]

    summary(model, (config['input_channels'], config['input_w'], config['input_h']))

    params = filter(lambda p: p.requires_grad, model.parameters())
    if eff_flag:
        params = list(params) + list(model.encoder.conv_a.parameters())
    model = torch.nn.DataParallel(model)

    if config['optimizer'] == 'Adam':
        optimizer = optim.Adam(params, lr=config['lr'],
                               weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'SGD':
        optimizer = optim.SGD(params, lr=config['lr'],
                              momentum=config['momentum'],
                              nesterov=config['nesterov'],
                              weight_decay=config['weight_decay'])
    else:
        raise NotImplementedError

    if eff_flag:
        # Flatten the per-submodule parameter lists into one list.
        cnn_params = [p for sub_module in cnn_subs
                      for p in sub_module.parameters()]
        cnn_optimizer = torch.optim.Adam(cnn_params, lr=0.001,
                                         weight_decay=config['weight_decay'])
    else:
        cnn_optimizer = None

    # A scheduler is only attached when training with SGD.
    if config['optimizer'] == 'SGD':
        if config['scheduler'] == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=config['epochs'], eta_min=config['min_lr'])
        elif config['scheduler'] == 'ReduceLROnPlateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(
                optimizer, factor=config['factor'], patience=config['patience'],
                verbose=1, min_lr=config['min_lr'])
        elif config['scheduler'] == 'MultiStepLR':
            scheduler = lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[int(e) for e in config['milestones'].split(',')],
                gamma=config['gamma'])
        elif config['scheduler'] == 'ConstantLR':
            scheduler = None
        else:
            raise NotImplementedError
    else:
        scheduler = None

    # Data loading code: image ids are file basenames without extension.
    img_ids = glob(os.path.join(input_folder, config['dataset'], 'images',
                                'training', '*' + config['img_ext']))
    train_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]
    img_ids = glob(os.path.join(input_folder, config['val_dataset'], 'images',
                                'validation', '*' + config['img_ext']))
    val_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]
    img_ids = glob(os.path.join(input_folder, config['val_dataset'], 'images',
                                'test', '*' + config['img_ext']))
    test_img_ids = [os.path.splitext(os.path.basename(p))[0] for p in img_ids]

    # ImageNet normalization statistics.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    train_transform = Compose([
        transforms.Rotate([config['rotate_min'], config['rotate_max']],
                          value=mean, mask_value=0),
        transforms.Flip(),
        transforms.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=10,
                                      val_shift_limit=10),
        transforms.RandomBrightnessContrast(brightness_limit=0.10,
                                            contrast_limit=0.10,
                                            brightness_by_max=True),
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])
    val_transform = Compose([
        transforms.Resize(config['input_h'], config['input_w']),
        transforms.Normalize(mean=mean, std=std),
    ])

    train_dataset = Dataset(
        img_ids=train_img_ids,
        img_dir=os.path.join(input_folder, config['dataset'], 'images', 'training'),
        mask_dir=os.path.join(input_folder, config['dataset'], 'annotations', 'training'),
        img_ext=config['img_ext'],
        mask_ext=config['mask_ext'],
        num_classes=config['num_classes'],
        input_channels=config['input_channels'],
        transform=train_transform,
        from_file=None)
    val_dataset = Dataset(
        img_ids=val_img_ids,
        img_dir=os.path.join(input_folder, config['val_dataset'], 'images', 'validation'),
        mask_dir=os.path.join(input_folder, config['val_dataset'], 'annotations', 'validation'),
        img_ext=config['img_ext'],
        mask_ext=config['mask_ext'],
        num_classes=config['num_classes'],
        input_channels=config['input_channels'],
        transform=val_transform,
        from_file=None)
    test_dataset = Dataset(
        img_ids=test_img_ids,
        img_dir=os.path.join(input_folder, config['val_dataset'], 'images', 'test'),
        mask_dir=os.path.join(input_folder, config['val_dataset'], 'annotations', 'test'),
        img_ext=config['img_ext'],
        mask_ext=config['mask_ext'],
        num_classes=config['num_classes'],
        input_channels=config['input_channels'],
        transform=val_transform,
        from_file=None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_workers'],
        drop_last=True)
    # Evaluation loaders run one image at a time.
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=config['num_workers'],
        drop_last=False)

    log = OrderedDict([
        ('epoch', []),
        ('lr', []),
        ('loss', []),
        ('iou', []),
        ('dice', []),
        ('val_loss', []),
        ('val_iou', []),
        ('val_dice', []),
    ])

    best_iou = 0
    trigger = 0  # epochs since the last val-IoU improvement (early stopping)
    Best_dice = 0
    iou_AtBestDice = 0
    for epoch in range(start_epoch, config['epochs']):
        print('{:s} Epoch [{:d}/{:d}]'.format(config['arch'], epoch, config['epochs']))

        # Train for one epoch.
        train_log = train(epoch, config, train_loader, model, criterion,
                          optimizer, cnn_optimizer)

        # Evaluate on validation and test sets BEFORE stepping the scheduler.
        # BUG FIX: the original called scheduler.step(val_log['loss']) for
        # ReduceLROnPlateau before val_log was assigned — a NameError on the
        # first epoch and a stale (previous-epoch) metric afterwards. For the
        # other schedulers this reordering is behaviorally neutral: the
        # stepped LR is only consumed by the next epoch's train().
        val_log = validate(config, val_loader, model, criterion)
        test_log = validate(config, test_loader, model, criterion)

        if config['optimizer'] == 'SGD':
            if config['scheduler'] == 'CosineAnnealingLR':
                scheduler.step()
            elif config['scheduler'] == 'ReduceLROnPlateau':
                scheduler.step(val_log['loss'])
            elif config['scheduler'] == 'MultiStepLR':
                scheduler.step()

        # Track the best test dice (and its IoU) seen so far, for reporting.
        if Best_dice < test_log['dice']:
            Best_dice = test_log['dice']
            iou_AtBestDice = test_log['iou']
        print(
            'loss %.4f - iou %.4f - dice %.4f - val_loss %.4f - val_iou %.4f - val_dice %.4f - test_iou %.4f - test_dice %.4f - Best_dice %.4f - iou_AtBestDice %.4f'
            % (train_log['loss'], train_log['iou'], train_log['dice'],
               val_log['loss'], val_log['iou'], val_log['dice'],
               test_log['iou'], test_log['dice'], Best_dice, iou_AtBestDice))
        save_tensorboard(writer, train_log, val_log, test_log, epoch)

        log['epoch'].append(epoch)
        log['lr'].append(config['lr'])
        log['loss'].append(train_log['loss'])
        log['iou'].append(train_log['iou'])
        log['dice'].append(train_log['dice'])
        log['val_loss'].append(val_log['loss'])
        log['val_iou'].append(val_log['iou'])
        log['val_dice'].append(val_log['dice'])
        pd.DataFrame(log).to_csv(
            os.path.join(model_folder, '%s/log.csv' % config['name']), index=False)

        trigger += 1
        # Checkpoint on validation-IoU improvement.
        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(),
                       os.path.join(model_folder, '%s/model.pth' % config['name']))
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # Early stopping.
        if config['early_stopping'] >= 0 and trigger >= config['early_stopping']:
            print("=> early stopping")
            break

        torch.cuda.empty_cache()