def test(net, testloader, device):
    """Run one full evaluation pass over ``testloader``.

    Args:
        net: segmentation network; called on the batched ``'concat'`` input
            (RGB + extreme-point heatmap, per the 4-channel setup in train()).
        testloader: DataLoader yielding dicts with ``'concat'`` inputs and
            ``'crop_gt'`` ground-truth masks.
        device: torch.device the batches and model run on.

    Returns:
        float: total accumulated loss divided by the number of batches
        (per-batch average, since the loss uses ``size_average=False``).
    """
    running_loss = 0
    net.eval()
    with torch.no_grad():  # inference only -- no autograd bookkeeping
        for sample_batched in tqdm(testloader, leave=False):
            inputs = sample_batched['concat'].to(device)
            gts = sample_batched['crop_gt'].to(device)

            # Forward pass of the mini-batch (net(inputs) is the idiomatic
            # form of net.forward(inputs))
            output = net(inputs)
            # `upsample` is deprecated in torch.nn.functional; use
            # `interpolate`, which the training loop in this file already uses
            # with identical arguments.
            output = interpolate(output, size=(512, 512), mode='bilinear',
                                 align_corners=True)

            # Compute the losses, side outputs and fuse
            loss = class_balanced_cross_entropy_loss(output, gts,
                                                     size_average=False)
            running_loss += loss.item()

    return running_loss / len(testloader)
# NOTE(review): this loop looks like a stale, orphaned duplicate of the
# training loop inside train() below -- same loss/logging structure, but it
# still calls the deprecated `upsample` (train() uses `interpolate`) and it
# leaves an ipdb breakpoint enabled, which halts every iteration. It also
# reads names (net, trainloader, device, writer, epoch, p, running_loss_tr,
# num_img_tr, start_time) that are not defined in this scope -- presumably it
# once lived inside a training function. Candidate for deletion; confirm with
# the author before removing.
net.train()
for ii, sample_batched in enumerate(trainloader):
    # NOTE(review): leftover debugger breakpoint -- remove before running.
    import ipdb; ipdb.set_trace()  # breakpoint 5ab142bd //
    inputs, gts = sample_batched['concat'], sample_batched['crop_gt']
    # Forward-Backward of the mini-batch
    inputs.requires_grad_()
    inputs, gts = inputs.to(device), gts.to(device)
    output = net.forward(inputs)
    # NOTE(review): `upsample` is deprecated; the other loop in this file
    # uses `interpolate` with the same arguments -- align if this code stays.
    output = upsample(output, size=(512, 512), mode='bilinear', align_corners=True)
    # Compute the losses, side outputs and fuse
    loss = class_balanced_cross_entropy_loss(output, gts, size_average=False, batch_average=True)
    running_loss_tr += loss.item()
    # Print stuff: epoch-level logging, fired on the last iteration of the
    # loader (ii wraps at num_img_tr - 1)
    if ii % num_img_tr == num_img_tr - 1:
        running_loss_tr = running_loss_tr / num_img_tr
        writer.add_scalar('data/total_loss_epoch', running_loss_tr, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, ii*p['trainBatch']+inputs.data.shape[0]))
        print('Loss: %f' % running_loss_tr)
        running_loss_tr = 0
        stop_time = timeit.default_timer()
        print("Execution time: " + str(stop_time - start_time)+"\n")
    # Backward the averaged gradient (loss scaled by 1/nAveGrad so gradients
    # accumulated over nAveGrad steps average out)
    loss /= p['nAveGrad']
    loss.backward()
def train(model_name: str, gpu_id: int, learning_rate: float) -> None:
    """Train a segmentation network on PASCAL VOC (optionally + SBD).

    Builds the model via ``load_model``, trains with Adam, logs per-iteration
    and per-epoch losses to TensorBoard and ``log.csv``, snapshots weights
    every ``snapshot`` epochs, and evaluates on the validation split every
    ``nTestInterval`` epochs via ``test()``.

    Args:
        model_name: architecture identifier passed to ``load_model``; also
            used to name the run directory and weight files.
        gpu_id: CUDA device index; runs on CPU if CUDA is unavailable.
        learning_rate: Adam learning rate (stored in the param report).
    """
    # Set gpu_id to -1 to run in CPU mode, otherwise set the id of the corresponding gpu
    device = torch.device("cuda:%d" % gpu_id if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        tqdm.write('Using GPU: {} '.format(gpu_id))

    # Setting parameters
    use_sbd = False
    nEpochs = 100  # Number of epochs for training
    resume_epoch = 0  # Default is 0, change if want to resume
    p = OrderedDict()  # Parameters to include in report
    classifier = 'psp'  # Head classifier to use
    # NOTE(review): `classifier` is never used in this function -- confirm
    # whether load_model should receive it, or drop it.
    p['trainBatch'] = 5  # Training batch size
    testBatch = 5  # Testing batch size
    useTest = True  # See evolution of the test set when training?
    nTestInterval = 10  # Run on test set every nTestInterval epochs
    snapshot = 10  # Store a model every snapshot epochs
    relax_crop = 50  # Enlarge the bounding box by relax_crop pixels
    nInputChannels = 4  # Number of input channels (RGB + heatmap of extreme points)
    zero_pad_crop = True  # Insert zero padding when cropping the image
    p['nAveGrad'] = 1  # Average the gradient of several iterations
    p['lr'] = learning_rate  #1e-4  # Learning rate
    p['wd'] = 0.0005  # Weight decay
    p['momentum'] = 0.9  # Momentum (unused by the Adam optimizer below; kept in the report)

    # Results and model directories (a new directory is generated for every run)
    package_path = Path(__file__).resolve()
    folder_name = 'runs-{}-{:f}'.format(model_name, learning_rate)
    save_dir_root = package_path.parent / 'RUNS'
    exp_name = model_name
    save_dir = save_dir_root / folder_name
    (save_dir / 'models').mkdir(parents=True, exist_ok=True)
    # 'wt' truncates: log.csv is reset at every run, even when resuming.
    with (save_dir / 'log.csv').open('wt') as csv:
        csv.write('train_loss,test_loss\n')
    tqdm.write(str(save_dir))

    # Network definition
    modelName = model_name
    net = load_model(model_name, nInputChannels)
    #if resume_epoch == 0:
    #    print("Initializing from pretrained Deeplab-v2 model")
    #else:
    #    weights_path = save_dir / 'models'
    #    weights_path /= '%s_epoch-%d.pth' % (modelName, resume_epoch-1)
    #    print("Initializing weights from: ", weights_path)
    #    net.load_state_dict(torch.load(weights_path,
    #                                   map_location=lambda s, _: s))
    train_params = [{'params': net.parameters(), 'lr': p['lr']}]
    net.to(device)

    # Training the network
    if resume_epoch != nEpochs:
        # Logging into Tensorboard
        time_now = datetime.now().strftime('%b%d_%H-%M-%S')
        hostname = socket.gethostname()
        log_dir = save_dir / 'models' / '{}_{}'.format(time_now, hostname)
        writer = SummaryWriter(log_dir=str(log_dir))

        # Use the following optimizer
        #optimizer = optim.SGD(train_params, lr=p['lr'], momentum=p['momentum'], weight_decay=p['wd'])
        optimizer = optim.Adam(train_params, lr=p['lr'], weight_decay=p['wd'])
        p['optimizer'] = str(optimizer)

        # Preparation of the data loaders
        train_tf, test_tf = create_transforms(relax_crop, zero_pad_crop)
        voc_train = pascal.VOCSegmentation(split='train', download=True, transform=train_tf)
        voc_val = pascal.VOCSegmentation(split='val', download=True, transform=test_tf)
        if use_sbd:
            # NOTE(review): assigning to `sbd` makes it a local for the whole
            # function, so `sbd.SBDSegmentation` raises UnboundLocalError the
            # moment use_sbd is True (the module name is shadowed before it is
            # read). Rename the local (e.g. sbd_db) when enabling this path.
            sbd = sbd.SBDSegmentation(split=['train', 'val'], retname=True, transform=train_tf)
            db_train = combine_dbs([voc_train, sbd], excluded=[voc_val])
        else:
            db_train = voc_train
        p['dataset_train'] = str(db_train)
        p['transformations_train'] = [str(t) for t in train_tf.transforms]
        # NOTE(review): likely copy-paste -- dataset_test records the TRAIN
        # dataset; presumably this should be str(voc_val). Confirm before
        # trusting the generated report.
        p['dataset_test'] = str(db_train)
        p['transformations_test'] = [str(t) for t in test_tf.transforms]
        trainloader = DataLoader(db_train, batch_size=p['trainBatch'], shuffle=True, num_workers=2)
        testloader = DataLoader(voc_val, batch_size=testBatch, shuffle=False, num_workers=2)
        generate_param_report((save_dir / exp_name).with_suffix('.txt'), p)

        # Train variables
        num_img_tr = len(trainloader)   # number of training batches per epoch
        num_img_ts = len(testloader)    # NOTE(review): unused below
        running_loss_tr = 0.0
        running_loss_ts = 0.0           # NOTE(review): unused below
        aveGrad = 0                     # forward passes since last optimizer step
        #print("Training Network")

        # Main Training and Testing Loop
        for epoch in trange(resume_epoch, nEpochs):
            start_time = timeit.default_timer()
            net.train()
            for ii, sample_batched in enumerate(tqdm(trainloader, leave=False)):
                inputs = sample_batched['concat'].to(device)
                gts = sample_batched['crop_gt'].to(device)
                # Forward-Backward of the mini-batch
                inputs.requires_grad_()
                output = net.forward(inputs)  #.cpu()
                #print(output.shape)
                #exit()
                output = interpolate(output, size=(512, 512), mode='bilinear', align_corners=True).to(device)

                # Compute the losses, side outputs and fuse
                loss = class_balanced_cross_entropy_loss(output, gts, size_average=False, batch_average=True)
                running_loss_tr += loss.item()
                #print(loss.item())

                # Backward the averaged gradient (scaled so nAveGrad
                # accumulated backward passes average out)
                loss /= p['nAveGrad']
                loss.backward()
                aveGrad += 1

                # Update the weights once in p['nAveGrad'] forward passes
                if aveGrad % p['nAveGrad'] == 0:
                    writer.add_scalar('data/total_loss_iter', loss.item(), ii + num_img_tr * epoch)
                    optimizer.step()
                    optimizer.zero_grad()
                    aveGrad = 0

            # Save the model
            if (epoch + 1) % snapshot == 0:
                weights_path = save_dir / 'models'
                weights_path /= '{}_epoch-{:d}.pth'.format(modelName, epoch)
                torch.save(net.state_dict(), weights_path)

            # One testing epoch
            if useTest and (epoch + 1) % nTestInterval == 0:
                msg = 'Test Loss: {:.3f}'
                test_loss = test(net, testloader, device)
                tqdm.write(msg.format(test_loss))

            # End-of-epoch logging.
            # NOTE(review): `test_loss` is only assigned inside the
            # nTestInterval branch above; if this csv write really runs every
            # epoch it raises NameError until the first test epoch
            # (epoch 9 with nTestInterval=10). Either initialize test_loss
            # before the loop or move this write under the test branch --
            # confirm the intended behavior.
            running_loss_tr = running_loss_tr / num_img_tr
            with (save_dir / 'log.csv').open('at') as csv:
                csv.write(','.join([str(running_loss_tr), str(test_loss)]))
                csv.write('\n')
            writer.add_scalar('data/total_loss_epoch', running_loss_tr, epoch)
            num_images = ii * p['trainBatch'] + inputs.data.shape[0]
            msg = '[Epoch: {:d}, numImages: {:5d}]'
            tqdm.write(msg.format(epoch, num_images))
            tqdm.write('Loss: %f' % running_loss_tr)
            running_loss_tr = 0
            stop_time = timeit.default_timer()
            msg = "Execution time: {:.3f}"
            tqdm.write(msg.format(stop_time - start_time))

        writer.close()