import os
from collections import OrderedDict

import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter  # may be tensorboardX.SummaryWriter in the original repo

# NOTE: project-specific helpers (cfg, to_device, DataParallelWithCallback,
# train_epoch, test_epoch, log_results, ...) are assumed to be imported
# elsewhere in this repository; the imports above are added only in case they
# are missing from the top of the file.


def train(model, train_set, test_set, save, valid_set, n_epochs):
    """Main training function."""
    # Dataloaders
    train_loader = DataLoader(
        train_set, batch_size=cfg.batch_size, shuffle=True,
        pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)
    test_loader = DataLoader(
        test_set, batch_size=cfg.batch_size, shuffle=False,
        pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)
    if valid_set is None:
        valid_loader = None
    else:
        valid_loader = DataLoader(
            valid_set, batch_size=cfg.batch_size, shuffle=False,
            pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)

    # Model on cuda
    model = to_device(model)

    # Wrap model for multi-GPUs, if necessary
    model_wrapper = model
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        if cfg.use_syncbn:
            print("Using sync-bn")
            model_wrapper = DataParallelWithCallback(model).cuda()
        else:
            model_wrapper = torch.nn.DataParallel(model).cuda()

    # Optimizer and scheduler
    optimizer = torch.optim.Adam(model_wrapper.parameters(), lr=cfg.lr)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=cfg.milestones, gamma=cfg.gamma)

    # Start logging
    logs = ["loss", "iou", "dice", "iou0", "iou1", "dice0", "dice1", "dice_global"]
    train_logs = ["train_" + log for log in logs]
    test_logs = ["test_" + log for log in logs]
    log_dict = OrderedDict.fromkeys(train_logs + test_logs, 0)
    with open(os.path.join(save, "logs.csv"), "w") as f:
        f.write("epoch,")
        for key in log_dict.keys():
            f.write(key + ",")
        f.write("\n")
    with open(os.path.join(save, "loss_logs.csv"), "w") as f:
        f.write("iter,train_loss,\n")
    writer = SummaryWriter(log_dir=os.path.join(save, "Tensorboard_Results"))

    # Train and test the model
    best_dice_global = 0
    global iteration
    iteration = 0
    for epoch in range(n_epochs):
        os.makedirs(os.path.join(cfg.save, "epoch_{}".format(epoch)))
        print("learning rate: ", scheduler.get_lr())

        # Train epoch
        train_meters = train_epoch(
            model=model_wrapper, loader=train_loader, optimizer=optimizer,
            epoch=epoch, n_epochs=n_epochs, writer=writer)
        # Test epoch
        test_meters = test_epoch(
            model=model_wrapper, loader=test_loader, epoch=epoch,
            is_test=True, writer=writer)
        scheduler.step()

        # Log results
        for i, key in enumerate(train_logs):
            log_dict[key] = train_meters[i]
        for i, key in enumerate(test_logs):
            log_dict[key] = test_meters[i]
        log_results(save, epoch, log_dict, writer=writer)

        # Save model checkpoint
        if cfg.save_all:
            torch.save(model.state_dict(),
                       os.path.join(save, "epoch_{}".format(epoch), "model.dat"))
        if log_dict["test_dice_global"] > best_dice_global:
            torch.save(model.state_dict(), os.path.join(save, "model.dat"))
            best_dice_global = log_dict["test_dice_global"]
            print("New best global dice: %.4f" % log_dict["test_dice_global"])
        else:
            print("Current best global dice: %.4f" % best_dice_global)

    # End
    writer.close()
    with open(os.path.join(save, "logs.csv"), "a") as f:
        f.write(",,,,best global dice,%0.5f\n" % best_dice_global)
    print("best global dice: ", best_dice_global)
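# Example invocation (a hypothetical sketch, not part of the original script);
# dataset and model names follow those used in main() below, and cfg.n_epochs
# is an assumed config field:
#
#   train_set = LIDCSegDataset(crop_size=48, move=5, data_path=env.data, train=True)
#   test_set = LIDCSegDataset(crop_size=48, move=5, data_path=env.data, train=False)
#   model = FCNResNet(pretrained=cfg.pretrained, num_classes=2, backbone='resnet18')
#   train(model, train_set, test_set, save=cfg.save_path,
#         valid_set=None, n_epochs=cfg.n_epochs)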
def main(save_path=cfg.save_path):
    # Back up your code
    backup_code(save_path)
    # Set seed
    set_seed(cfg.seed)
    # Acceleration
    torch.backends.cudnn.benchmark = True

    # Datasets
    train_set = LIDCSegDataset(crop_size=48, move=5, data_path=env.data, train=True)
    test_set = LIDCSegDataset(crop_size=48, move=5, data_path=env.data, train=False)
    train_loader = DataLoader(train_set, batch_size=cfg.batch_size, shuffle=True,
                              pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)
    test_loader = DataLoader(test_set, batch_size=cfg.batch_size, shuffle=False,
                             pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)

    # Define model
    model_dict = {'resnet18': FCNResNet, 'vgg16': FCNVGG, 'densenet121': FCNDenseNet}
    model = model_dict[cfg.backbone](pretrained=cfg.pretrained, num_classes=2, backbone=cfg.backbone)
    print(model)
    torch.save(model.state_dict(), os.path.join(save_path, 'model.dat'))

    # Model on cuda and then wrap model for multi-GPUs, if necessary
    model = to_device(model)
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        if cfg.use_syncbn:
            print('Using sync-bn')
            model_wrapper = DataParallelWithCallback(model).cuda()
        else:
            model_wrapper = torch.nn.DataParallel(model).cuda()
    else:
        model_wrapper = model

    # Optimizer and scheduler
    optimizer = getattr(torch.optim, cfg.optimizer_choice)(model_wrapper.parameters(),
                                                           lr=cfg.optimizer_lr)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=cfg.scheduler_milestones,
                                                     gamma=cfg.scheduler_gamma)
    results_logger = ResultsLogger(save_path, train_log_items=[], test_log_items=[])

    # Train and test the model
    best_dice_global = 0
    global iteration
    iteration = 0
    for epoch in range(cfg.n_epochs):  # cfg.n_epochs assumed; the original referenced an undefined n_epochs
        # os.makedirs(os.path.join(cfg.save, 'epoch_{}'.format(epoch)))
        print('learning rate: ', scheduler.get_lr())
        train_results = train_epoch(model=model_wrapper, loader=train_loader, optimizer=optimizer,
                                    epoch=epoch, results_logger=results_logger)
        test_results = test_epoch(model=model_wrapper, loader=test_loader, epoch=epoch,
                                  results_logger=results_logger)
        scheduler.step()
        results_logger.log_epoch(train_results, test_results)

        # Save model checkpoint
        if cfg.save_all:
            torch.save(model.state_dict(),
                       os.path.join(save_path, 'epoch_{}'.format(epoch), 'model.dat'))
        # The comparison below was truncated in the original; the 'dice_global'
        # key name is assumed from the metric tracked by the other training
        # scripts in this repo.
        if test_results['dice_global'] > best_dice_global:
            torch.save(model.state_dict(), os.path.join(save_path, 'best_model.dat'))
            best_dice_global = test_results['dice_global']
            print('New best global dice: %.4f' % best_dice_global)
        else:
            print('Current best global dice: %.4f' % best_dice_global)

    results_logger.close(best_result=best_dice_global)
    print('best global dice: ', best_dice_global)
    print('Done!')
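# Entry point (a minimal sketch; the original file may wire this up
# differently, e.g. via argparse or a launcher script).
if __name__ == '__main__':
    main()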
def train(model, train_set, test_set, save, valid_set, n_epochs):
    '''Main training function.'''
    # Dataloaders
    train_loader = DataLoader(train_set, batch_size=cfg.batch_size, shuffle=True,
                              pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)
    test_loader = DataLoader(test_set, batch_size=cfg.batch_size, shuffle=False,
                             pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)
    # modified
    if valid_set is None:
        valid_loader = None
    else:
        valid_loader = DataLoader(valid_set, batch_size=cfg.batch_size, shuffle=False,
                                  pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)

    # Model on cuda
    model = to_device(model)

    # Wrap model for multi-GPUs, if necessary
    model_wrapper = model
    print('num_of_cuda:', torch.cuda.device_count())
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        print('multi-gpus')
        if cfg.use_syncbn:
            print('Using sync-bn')
            model_wrapper = DataParallelWithCallback(model).cuda()
        else:
            model_wrapper = torch.nn.DataParallel(model).cuda()

    # Optimizer and scheduler
    optimizer = torch.optim.Adam(model_wrapper.parameters(), lr=cfg.lr)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=cfg.milestones,
    #                                                  gamma=cfg.gamma)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=cfg.max_lr, epochs=n_epochs, steps_per_epoch=len(train_loader))

    # Start logging
    logs = ['loss', 'acc', 'acc0', 'acc1']
    train_logs = ['train_' + log for log in logs] + ['train_auc']
    valid_logs = ['valid_' + log for log in logs] + ['valid_auc', 'valid_auc_pat']
    test_logs = ['test_' + log for log in logs] + ['test_auc', 'test_auc_pat']
    log_dict = OrderedDict.fromkeys(train_logs + valid_logs + test_logs, 0)
    with open(os.path.join(save, 'logs.csv'), 'w') as f:
        f.write('epoch,')
        for key in log_dict.keys():
            f.write(key + ',')
        f.write('\n')
    with open(os.path.join(save, 'loss_logs.csv'), 'w') as f:
        f.write('iter,train_loss,\n')
    writer = SummaryWriter(log_dir=os.path.join(save, 'Tensorboard_Results'))

    # Train and test the model
    best_auc = 0
    global iteration
    iteration = 0
    for epoch in range(n_epochs):
        os.makedirs(os.path.join(cfg.save, 'epoch_{}'.format(epoch)))
        print('learning rate: ', scheduler.get_lr())

        # Train epoch
        train_meters = train_epoch(model=model_wrapper, loader=train_loader, optimizer=optimizer,
                                   scheduler=scheduler, epoch=epoch, n_epochs=n_epochs, writer=writer)
        # Valid epoch
        valid_meters = test_epoch(model=model_wrapper, loader=valid_loader, epoch=epoch,
                                  is_test=False, writer=writer)
        # Test epoch
        test_meters = test_epoch(model=model_wrapper, loader=test_loader, epoch=epoch,
                                 is_test=True, writer=writer)
        # scheduler.step()

        # Log results
        for i, key in enumerate(train_logs):
            log_dict[key] = train_meters[i]
        for i, key in enumerate(valid_logs):
            log_dict[key] = valid_meters[i]
        for i, key in enumerate(test_logs):
            log_dict[key] = test_meters[i]
        log_results(save, epoch, log_dict, writer=writer)

        # Save model checkpoint
        if cfg.save_all:
            torch.save(model.state_dict(),
                       os.path.join(save, 'epoch_{}'.format(epoch), 'model.dat'))
        if log_dict['valid_auc'] > best_auc:
            torch.save(model.state_dict(), os.path.join(save, 'model.dat'))
            best_auc = log_dict['valid_auc']
            print('New best auc: %.4f' % log_dict['valid_auc'])
        else:
            print('Current best auc: %.4f' % best_auc)

    # End
    writer.close()
    with open(os.path.join(save, 'logs.csv'), 'a') as f:
        f.write(',,,,best auc,%0.5f\n' % best_auc)
    print('best auc: ', best_auc)
def train(model, test_set, save, n_epochs):
    '''Evaluation-only variant of the training function: runs a single test pass.'''
    # Dataloaders
    test_loader = DataLoader(test_set, batch_size=cfg.batch_size, shuffle=False,
                             pin_memory=torch.cuda.is_available(), num_workers=cfg.num_workers)

    # Model on cuda
    model = to_device(model)

    # Wrap model for multi-GPUs, if necessary
    model_wrapper = model
    print('num_of_cuda:', torch.cuda.device_count())
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        print('multi-gpus')
        if cfg.use_syncbn:
            print('Using sync-bn')
            model_wrapper = DataParallelWithCallback(model).cuda()
        else:
            model_wrapper = torch.nn.DataParallel(model).cuda()

    # Optimizer and scheduler (kept from the training script; the scheduler is never stepped here)
    optimizer = torch.optim.Adam(model_wrapper.parameters(), lr=cfg.lr)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=cfg.milestones, gamma=cfg.gamma)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=cfg.factor,
                                                           patience=cfg.patience,
                                                           min_lr=cfg.min_lr, eps=cfg.eps)
    # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=cfg.max_lr, epochs=n_epochs,
    #                                                 steps_per_epoch=len(train_loader),
    #                                                 div_factor=cfg.div_factor,
    #                                                 final_div_factor=cfg.final_div_factor)

    # Start logging
    logs = ['loss', 'acc', 'acc0', 'acc1']
    test_logs = ['test_' + log for log in logs] + ['test_auc', 'test_auc_pat']
    log_dict = OrderedDict.fromkeys(test_logs, 0)
    with open(os.path.join(save, 'logs.csv'), 'w') as f:
        f.write('epoch,')
        for key in log_dict.keys():
            f.write(key + ',')
        f.write('\n')
    with open(os.path.join(save, 'loss_logs.csv'), 'w') as f:
        f.write('iter,train_loss,\n')
    writer = SummaryWriter(log_dir=os.path.join(save, 'Tensorboard_Results'))

    # Evaluate the model (single pass over the test set)
    best_auc = 0
    global iteration
    iteration = 0
    for epoch in range(1):
        print('learning rate: ', optimizer.state_dict()['param_groups'][0]['lr'])

        # Test epoch
        test_meters = test_epoch(model=model_wrapper, loader=test_loader, epoch=epoch,
                                 is_test=True, writer=writer)

        # Log results
        for i, key in enumerate(test_logs):
            log_dict[key] = test_meters[i]
        log_results(save, epoch, log_dict, writer=writer)

        # save model checkpoint
        # if cfg.save_all:

    # End
    writer.close()
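# A minimal evaluation driver (hypothetical sketch, not part of the original
# script): restore a checkpoint written by the training run and execute the
# single test pass above, with `model` and `test_set` built the same way as in
# the corresponding training script.
#
#   model.load_state_dict(torch.load(os.path.join(cfg.save_path, 'model.dat')))
#   train(model, test_set, save=cfg.save_path, n_epochs=1)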