def train_model(training_loader):
    global epoch_list
    global learning_rate_list  # despite its name, this stores the average loss per epoch

    if train_from_last_model:
        model = load_model()
        learning_rate_list, epoch_list = load_info()
        print("Loaded the existing model; continuing training")
    else:
        model = SegNet(input_channel, output_channel).cuda()
        print("Training a new model")

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # The criterion is stateless, so build it once instead of per batch.
    criterion = torch.nn.CrossEntropyLoss().cuda()
    # criterion = nn.BCEWithLogitsLoss().cuda()

    for epoch in range(NUM_EPOCHS):
        t_start = time.time()
        count_batch = 0  # batches seen this epoch
        loss_sum = 0.0   # running loss over the epoch
        epoch_list.append(epoch + 1)

        for i, batch in enumerate(tqdm(training_loader)):
            # Load a batch and move it to the GPU.
            input_tensor = batch['camera_5'].cuda()
            target_tensor = batch['fg_mask'].cuda()

            predicted_tensor, softmaxed_tensor = model(input_tensor)

            optimizer.zero_grad()
            loss = criterion(predicted_tensor, target_tensor)
            loss.backward()
            optimizer.step()

            # Accumulate with .item() so the running sum does not retain the graph.
            loss_sum += loss.item()
            count_batch += 1
            torch.cuda.empty_cache()

        average_loss = loss_sum / count_batch
        learning_rate_list.append(float(average_loss))
        tqdm.write('{} epoch: loss = {}'.format(epoch + 1, average_loss))
        plot_learning_curve(len(epoch_list))
        save_model(model)
        save_info()

    return model
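# save_model / load_model / save_info / load_info are called above but defined
# elsewhere in this project. A minimal sketch of matching helpers, assuming
# state_dict-based persistence; the file names below are illustrative, not
# from the source.
MODEL_PATH = 'segnet_model.pth'
INFO_PATH = 'training_info.pth'

def save_model(model):
    torch.save(model.state_dict(), MODEL_PATH)

def load_model():
    model = SegNet(input_channel, output_channel).cuda()
    model.load_state_dict(torch.load(MODEL_PATH))
    return model

def save_info():
    # epoch_list / learning_rate_list are the module-level globals used above.
    torch.save({'epochs': epoch_list, 'losses': learning_rate_list}, INFO_PATH)

def load_info():
    # Returns (losses, epochs) to match the unpacking order in train_model.
    info = torch.load(INFO_PATH)
    return info['losses'], info['epochs']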
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         num_workers, outdir_suffix, outdir, lr, wd, warmup, num_epochs,
         nsamples):
    cudnn.benchmark = True
    device = torch.device('cuda:0')  # the visible device is set by CUDA_VISIBLE_DEVICES
    torch.cuda.set_device(device)

    # Rank 0 creates the experiment observer.
    is_master = rank == 0

    # Every rank joins the process group.
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    # Actual training setup.
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size,
        device,
        world_size,
        rank,
        num_workers,
        # This parameter controls whether augmentation is applied to the data.
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples)

    # lr can be scaled linearly relative to the original batch size of 256:
    # world_batch_size = world_size * batch_size
    # k = world_batch_size / 256
    # lr = k * lr

    # Output directory.
    if outdir is None:
        outdir = pt.join(ROOT, '../exp/', outdir_suffix)

    model = Net(num_classes=500, batch_size=batch_size)
    print('\n network parameters ', len(list(model.parameters())))
    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    # model = Unpacker(model)

    optimizer, policy = make_policy(num_epochs, model, lr, 0.9, wd)

    # Reconstruction loss for the autoencoder.
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # Classification loss for the classifier head.
    classifier_loss = CrossEntropyLoss(output_key='probs',
                                       target_key='label').to(device)

    trainer = Trainer(model,
                      optimizer,
                      loss,
                      classifier_loss,
                      rank,
                      AccuracyMetric(output_key='softmax_output',
                                     target_key='label'),
                      policy,
                      None,
                      train,
                      None,
                      outdir,
                      snapshot_interval=4 if is_master else None,
                      quiet=not is_master)

    print('\n Number of epochs: ', num_epochs)
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)
    print("Training complete in: " + str(datetime.now() - start))
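# The commented-out linear scaling rule above, as a standalone sketch
# (Goyal et al., "Accurate, Large Minibatch SGD"): scale lr with the global
# batch size relative to a 256-image baseline. The function name and the
# warmup-free form are assumptions, not part of this repository.
def scale_lr(base_lr, world_size, batch_size, base_batch=256):
    # e.g. 8 GPUs x 64 images per GPU -> k = 512 / 256 = 2, so lr doubles
    world_batch_size = world_size * batch_size
    return base_lr * (world_batch_size / base_batch)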
def main():
    global args, best_prec1
    args = parser.parse_args()

    # Create the save directory if it does not exist.
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    model = SegNet(3, 3)
    # model.features = torch.nn.DataParallel(model.features)
    if use_gpu:
        model.cuda()

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # transforms.Scale is deprecated; Resize is its direct replacement.
    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]),
        'val': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]),
    }

    data_dir = '/media/salman/DATA/NUST/MS RIME/Thesis/MICCAI Dataset/miccai_all_images'

    image_datasets = {
        x: miccaiDataset(os.path.join(data_dir, x), data_transforms[x])
        for x in ['train', 'val']
    }

    dataloaders = {
        x: torch.utils.data.DataLoader(image_datasets[x],
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.workers)
        for x in ['train', 'val']
    }

    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

    # Define the loss function (criterion) and optimizer.
    criterion = nn.MSELoss().cuda()

    if args.half:
        model.half()
        criterion.half()

    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.evaluate:
        validate(dataloaders['val'], model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        # adjust_learning_rate(optimizer, epoch)

        # Train for one epoch.
        train(dataloaders['train'], model, criterion, optimizer, epoch)

        # Evaluate on the validation set.
        prec1 = validate(dataloaders['val'], model, criterion)
        prec1 = prec1.cpu().data.numpy()

        # Remember the best prec1 (lower is better, since this is MSE) and
        # save a checkpoint.
        print(prec1)
        print(best_prec1)
        is_best = prec1 < best_prec1
        best_prec1 = min(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                # 'optimizer': optimizer.state_dict(),
            },
            is_best,
            filename=os.path.join(args.save_dir,
                                  'checkpoint_{}.tar'.format(epoch)))
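# save_checkpoint is called above but defined elsewhere; a minimal sketch of
# the usual pattern (the 'model_best.tar' name is an assumption).
import shutil

def save_checkpoint(state, is_best, filename='checkpoint.tar'):
    # Persist the per-epoch checkpoint, and keep a copy of the best one so far.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename,
                        os.path.join(os.path.dirname(filename) or '.',
                                     'model_best.tar'))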
def handler(context):
    # Datasets
    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    val_dataset_id = dataset_alias['val']

    trainset = SegmentationDatasetFromAPI(train_dataset_id,
                                          transform=SegNetAugmentation(MEANS))
    valset = SegmentationDatasetFromAPI(val_dataset_id,
                                        transform=SegNetAugmentation(MEANS, False))

    # Class weights are computed on the un-augmented training set.
    class_weight = calc_weight(
        SegmentationDatasetFromAPI(train_dataset_id,
                                   transform=SegNetAugmentation(MEANS, False)))
    class_weight = class_weight.to(device)

    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=BATCHSIZE,
                                              shuffle=True,
                                              num_workers=0)
    valloader = torch.utils.data.DataLoader(valset,
                                            batch_size=BATCHSIZE,
                                            shuffle=False,
                                            num_workers=0)

    # Model
    net = SegNet(3, n_class=len(camvid_label_names))
    net = net.to(device)

    # Loss and optimizer
    # criterion = PixelwiseSoftmaxClassifier(weight=class_weight)
    criterion = torch.nn.CrossEntropyLoss(weight=class_weight, ignore_index=-1)
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[150, 250], gamma=0.1)

    statistics = Statistics(epochs)

    for epoch in range(epochs):
        train_loss, train_acc = train(net, optimizer, trainloader, criterion, epoch)
        test_loss, test_acc = test(net, valloader, criterion, epoch)
        # Since PyTorch 1.1, scheduler.step() belongs after the optimizer steps.
        scheduler.step()

        # Reporting
        print('[{:d}] main/loss: {:.3f} main/acc: {:.3f}, '
              'main/validation/loss: {:.3f}, main/validation/acc: {:.3f}'.format(
                  epoch + 1, train_loss, train_acc, test_loss, test_acc))

        statistics(epoch + 1, train_loss, train_acc, test_loss, test_acc)

        writer.add_scalar('main/loss', train_loss, epoch + 1)
        writer.add_scalar('main/acc', train_acc, epoch + 1)
        writer.add_scalar('main/validation/loss', test_loss, epoch + 1)
        writer.add_scalar('main/validation/acc', test_acc, epoch + 1)

    torch.save(net.state_dict(),
               os.path.join(ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
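# calc_weight is not defined in this handler; for SegNet on CamVid it is
# commonly median-frequency balancing (Eigen & Fergus). A minimal sketch
# under that assumption, assuming the dataset yields (image, label) pairs
# with integer class maps; the function name is illustrative.
import numpy as np

def calc_weight_sketch(dataset, n_class=len(camvid_label_names)):
    counts = np.zeros(n_class, dtype=np.float64)  # pixels per class
    totals = np.zeros(n_class, dtype=np.float64)  # pixels in images containing the class
    for _, label in dataset:
        label = np.asarray(label)
        for c in range(n_class):
            n = (label == c).sum()
            if n > 0:
                counts[c] += n
                totals[c] += label.size
    freq = counts / np.maximum(totals, 1)
    # weight[c] = median class frequency / frequency of class c
    weight = np.median(freq[freq > 0]) / np.maximum(freq, 1e-12)
    return torch.from_numpy(weight.astype(np.float32))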
    num_workers=hparams.num_workers,
    pin_memory=True,
)

model = SegNet(11, 3, drop_rate=0.2)
# Trace the graph on a dummy CamVid-sized input (1 x 3 x 360 x 480).
writer.add_graph(model, torch.zeros(1, 3, 360, 480))
model = model.to(device)

# Class weights; the last entry (the ignored class, index 11) is dropped.
cls_w = class_weights(train_dataset).astype(np.float32)
cls_w = torch.from_numpy(cls_w[:-1])
criterion = nn.CrossEntropyLoss(reduction="mean",
                                ignore_index=11,
                                weight=cls_w).to(device)

# optimizer = optim.Adam(
optimizer = optim.SGD(
    model.parameters(),
    lr=float(hparams.init_lr),
    weight_decay=float(hparams.weight_decay),
    momentum=0.9,
)
# Mixed precision via NVIDIA apex.
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level=hparams.opt_lv,
                                  verbosity=1)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                           milestones=hparams.step_size,
                                           gamma=hparams.step_down_rate)

best_acc = 0.0
pbar = tqdm(range(hparams.epochs))
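# The epoch loop body is not shown in this fragment; with apex amp
# initialized above, the backward pass is conventionally wrapped in
# amp.scale_loss. A sketch, assuming a train_loader built by the (elided)
# DataLoader call above and batches of (images, labels):
for epoch in pbar:
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()  # backward on the amp-scaled loss
        optimizer.step()
    scheduler.step()
    pbar.set_description("epoch {}".format(epoch + 1))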