def test(sess, dataset):
    """Evaluate the decision and mask outputs of a loaded graph on ``dataset``.

    Looks up the ``Image:0``, ``decision_out:0`` and ``mask_out:0`` tensors in
    ``sess.graph``, runs every batch through them, feeds the ground-truth and
    predicted masks to an ``iouEval`` accumulator and writes each predicted
    mask to ``visualization/<batch>.png``.

    Only the first element of each batch is scored (``decision[0][0]`` against
    ``label_batch[0][0]``), and ``np.squeeze(..., axis=(0, -1))`` requires the
    first and last mask axes to have length 1.

    Args:
        sess: session whose graph holds the three tensors named above.
        dataset: object exposing ``number_batch`` and a ``next_batch`` fetch
            that ``sess.run`` resolves to ``(image, mask, label, _)``.

    Returns:
        Fraction of batches whose decision, thresholded at 0.5, matched the
        label; ``0.0`` when the dataset yields no batches.
    """
    with sess.as_default():
        input_image = sess.graph.get_tensor_by_name('Image:0')
        decision_out = sess.graph.get_tensor_by_name('decision_out:0')
        mask_out = sess.graph.get_tensor_by_name('mask_out:0')

        num_step = 0.0
        correct = 0.0
        iou_gen = iouEval()

        for batch in range(dataset.number_batch):
            img_batch, mask_batch, label_batch, _ = sess.run(
                dataset.next_batch)
            predicted_mask, decision = sess.run(
                [mask_out, decision_out],
                feed_dict={input_image: img_batch})
            iou_gen.addBatch(mask_batch, predicted_mask)
            print(decision)

            score = decision[0][0]
            label = label_batch[0][0]
            # A hit is a positive score on label 1 or a negative score on
            # label 0; any other label value counts as a miss.
            if (score >= 0.5 and label == 1) or (score < 0.5 and label == 0):
                correct += 1
            num_step += 1

            cv2.imwrite('visualization/{}.png'.format(str(batch)),
                        np.squeeze(predicted_mask, axis=(0, -1)) * 255)

        iou = iou_gen.getIoU()
        # Guard against an empty dataset instead of dividing by zero.
        accuracy = correct / num_step if num_step else 0.0
        print("accuracy: {} iou:{}".format(accuracy, iou))
        return accuracy
def valid_segmentation(self):
    """Run one pass over the validation data and report loss and IoU.

    Every batch from ``self.DataManager_valid`` is fed through the model with
    both training flags set to ``TRAIN_MODE_IN_VALID``. The fetched mask
    tensor (which is also fed with the ground-truth batch) and the mask
    output are accumulated in an ``iouEval`` instance.

    Returns:
        Tuple ``(mean_loss, iou)``: the summed per-batch segmentation loss
        divided by the number of batches, and the value returned by
        ``iouEval.getIoU()``.
    """
    with self.sess.as_default():
        banner = 'start validing segmentation'
        self.logger.info(banner)
        print(banner)

        data_manager = self.DataManager_valid
        iou_meter = iouEval(self.__Param["batch_size"])
        fetches = [
            self.model.mask,
            self.model.mask_out,
            self.model.segmentation_loss,
        ]

        loss_sum = 0.0
        batches_seen = 0.0
        for _ in range(data_manager.number_batch):
            images, masks, labels, _ignored = self.sess.run(
                data_manager.next_batch)
            feed = {
                self.model.Image: images,
                self.model.mask: masks,
                self.model.label: labels,
                self.model.is_training_seg: TRAIN_MODE_IN_VALID,
                self.model.is_training_dec: TRAIN_MODE_IN_VALID,
            }
            fed_mask, predicted_mask, batch_loss = self.sess.run(
                fetches, feed_dict=feed)
            iou_meter.addBatch(fed_mask, predicted_mask)
            batches_seen += 1
            loss_sum += batch_loss

        mean_loss = loss_sum / batches_seen
        self.logger.info(" validation loss = {}".format(mean_loss))
        return mean_loss, iou_meter.getIoU()
def train(args, model, enc=False):
    """Train ``model`` on Cityscapes, validating and checkpointing each epoch.

    Args:
        args: parsed options. Reads ``datadir``, ``height``, ``num_workers``,
            ``batch_size``, ``cuda``, ``savedir``, ``resume``, ``num_epochs``,
            ``visualize``, ``steps_plot``, ``port``, ``iouTrain``, ``iouVal``,
            ``steps_loss`` and ``epochs_save``.
        model: network invoked as ``model(inputs, only_encode=enc)``.
        enc: when true, selects the encoder class weights and the
            ``*_encoder`` / ``*_enc`` output file names.

    Returns:
        ``model`` (convenience for encoder-decoder training).

    Raises:
        AssertionError: if ``args.datadir`` does not exist, or ``args.resume``
            is set and the checkpoint file is missing.
    """
    # Start at -inf rather than 0: when IoU is not evaluated the epoch score
    # is the negated validation loss, which is below 0 and could otherwise
    # never be recorded as the best.
    best_acc = float('-inf')

    # TODO: calculate weights by processing dataset histogram (now its being
    # set by hand from the torch values): create a loader to run all images
    # and calculate histogram of labels, then create weight array using class
    # balancing.
    if enc:
        class_weights = [
            2.3653597831726, 4.4237880706787, 2.9691488742828,
            5.3442072868347, 5.2983593940735, 5.2275490760803,
            5.4394111633301, 5.3659925460815, 3.4170460700989,
            5.2414722442627, 4.7376127243042, 5.2286224365234,
            5.455126285553, 4.3019247055054, 5.4264230728149,
            5.4331531524658, 5.433765411377, 5.4631009101868,
            5.3947434425354,
        ]
    else:
        class_weights = [
            2.8149201869965, 6.9850029945374, 3.7890393733978,
            9.9428062438965, 9.7702074050903, 9.5110931396484,
            10.311357498169, 10.026463508606, 4.6323022842407,
            9.5608062744141, 7.8698215484619, 9.5168733596802,
            10.373730659485, 6.6616044044495, 10.260489463806,
            10.287888526917, 10.289801597595, 10.405355453491,
            10.138095855713,
        ]
    class_weights.append(0)  # index 19 carries zero weight in the loss
    weight = torch.ones(NUM_CLASSES)
    weight[:len(class_weights)] = torch.tensor(class_weights)

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train', 50)
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val', 100)
    print(len(dataset_train))

    loader = DataLoader(dataset_train, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers,
                            batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()
    criterion = CrossEntropyLoss2d(weight)

    savedir = f'../save/{args.savedir}'

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the column header only when the log file does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # TODO: reduce memory in first gpu:
    # https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4
    # https://github.com/pytorch/pytorch/issues/1893
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    # Polynomial decay of the learning rate over args.num_epochs epochs.
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    plotting = args.visualize and args.steps_plot > 0
    if plotting:
        board = Dashboard(args.port)

    def plot_batch(tag, epoch, step, inputs, outputs, targets):
        """Send the first sample of a batch to the dashboard."""
        start_time_plot = time.time()
        board.image(inputs[0].cpu().data,
                    f'{tag}input (epoch: {epoch}, step: {step})')
        # A list of outputs holds one entry per GPU; plot the first one.
        first = outputs[0][0] if isinstance(outputs, list) else outputs[0]
        board.image(color_transform(first.cpu().max(0)[1].data.unsqueeze(0)),
                    f'{tag}output (epoch: {epoch}, step: {step})')
        board.image(color_transform(targets[0].cpu().data),
                    f'{tag}target (epoch: {epoch}, step: {step})')
        print("Time to paint images: ", time.time() - start_time_plot)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = images
            targets = labels
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if plotting and step % args.steps_plot == 0:
                plot_batch('', epoch, step, inputs, outputs, targets)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # no_grad replaces the defunct `volatile=True` flag, which no longer
        # stops autograd from recording the forward pass.
        with torch.no_grad():
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()

                inputs = images
                targets = labels
                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                # Add batch to calculate TP, FP and FN for iou estimation
                if doIouVal:
                    iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if plotting and step % args.steps_plot == 0:
                    plot_batch('VAL ', epoch, step, inputs, outputs, targets)
                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                          "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # Remember best valIoU and save checkpoint; fall back to the negated
        # validation loss when no IoU is available.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # SAVE MODEL AFTER EPOCH
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # Periodic snapshots are keyed on the epoch number; the leftover
        # validation `step` index says nothing about training progress.
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if not enc:
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        # SAVE TO FILE A ROW WITH THE EPOCH RESULT
        # Epoch  Train-loss  Test-loss  Train-IoU  Test-IoU  learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model
def main():
    """Train the two-image change-detection network and export predictions.

    Steps, in order:
      1. Build the train/val ``idd_lite`` datasets and loaders.
      2. Take up to seven validation batches, keep their two input images on
         ``device`` and dump the first sample of each (A, B, label) as TIFF
         under ``OUTPUT_DIR/<i>/``.
      3. Train for ``NUM_EPOCHS`` epochs with cross-entropy on labels
         binarised at 128, writing a thresholded prediction for each kept
         validation batch after every epoch.
      4. Save the model weights and write thresholded predictions for the
         test split.

    Reads the module-level settings ``ENCODER_ONLY``, ``IMAGE_HEIGHT``,
    ``DATA_ROOT``, ``NUM_WORKERS``, ``BATCH_SIZE``, ``OUTPUT_DIR``,
    ``NUM_CLASSES``, ``NUM_EPOCHS``, ``IOUTRAIN`` and ``device``.
    """
    co_transform = MyCoTransform(ENCODER_ONLY, augment=True, height=IMAGE_HEIGHT)
    co_transform_val = MyCoTransform(ENCODER_ONLY, augment=False, height=IMAGE_HEIGHT)

    # train data
    dataset_train = idd_lite(DATA_ROOT, co_transform, 'train')
    print("length of training set: ", len(dataset_train))
    # test data
    dataset_val = idd_lite(DATA_ROOT, co_transform_val, 'val')
    print("length of validation set: ", len(dataset_val))

    # NOTE: PLEASE DON'T CHANGE batch_size and num_workers here. We have limited resources.
    loader_train = DataLoader(dataset_train, num_workers=NUM_WORKERS,
                              batch_size=BATCH_SIZE, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=NUM_WORKERS,
                            batch_size=BATCH_SIZE, shuffle=True)

    seven_val_images = []
    # zip stops after seven batches, or earlier if the loader is shorter.
    for i, batch in zip(range(7), loader_val):
        # Only the first three entries are needed; slicing tolerates a
        # trailing filename entry like the one the training loop unpacks.
        val_image_A, val_image_B, val_image_labels = batch[:3]
        seven_val_images.append(
            (val_image_A.to(device), val_image_B.to(device)))

        # cv2.imwrite does not create missing directories.
        sample_dir = os.path.join(OUTPUT_DIR, str(i))
        os.makedirs(sample_dir, exist_ok=True)
        cv2.imwrite(
            os.path.join(sample_dir, 'A.tiff'),
            np.rollaxis((val_image_A[0, :, :, :].squeeze().cpu().numpy() * 255).astype('uint8'), 0, 3))
        cv2.imwrite(
            os.path.join(sample_dir, 'B.tiff'),
            np.rollaxis((val_image_B[0, :, :, :].squeeze().cpu().numpy() * 255).astype('uint8'), 0, 3))
        cv2.imwrite(
            os.path.join(sample_dir, 'label.tiff'),
            (val_image_labels[0, :, :, :].squeeze().cpu().numpy()).astype('uint8'))

    # Cross-entropy loss: the negative log-likelihood is low when the network
    # assigns high confidence to the correct class and grows without bound
    # as that confidence approaches zero.
    criterion = torch.nn.CrossEntropyLoss()

    print("length of training couples: ", len(loader_train))
    print(len(loader_val))

    # Model
    model_file = importlib.import_module('erfnet')
    model = model_file.Net(NUM_CLASSES).to(device)

    # We use adam optimizer. It can be replaced with SGD and other optimizers
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)
    start_epoch = 1
    print("device used: ", device)

    # Training procedure
    softmax = torch.nn.Softmax(dim=1)
    steps_loss = 50
    my_start_time = time.time()

    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        epoch_loss = []
        time_train = []

        doIouTrain = IOUTRAIN
        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        model.train()
        for step, (images, images1, labels, filename) in enumerate(loader_train):
            start_time = time.time()
            inputs = images.to(device)
            inputs1 = images1.to(device)
            targets = labels.to(device)
            # Binarise the labels: values >= 128 become class 1, the rest 0.
            targets_orig = targets.clone()
            targets[targets_orig >= 128] = 1
            targets[targets_orig < 128] = 0

            outputs = model([inputs, inputs1], only_encode=ENCODER_ONLY)

            # zero the parameter gradients, then backward + optimize
            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            # print statistics
            if steps_loss > 0 and step % steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    'loss: {average:', average, '} (epoch: {', epoch,
                    '}, step: {', step, '})', "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / BATCH_SIZE))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Save one prediction per kept validation batch each epoch. Run in
        # eval mode without gradients so these forward passes are inference
        # only; model.train() is restored at the top of the next epoch.
        model.eval()
        with torch.no_grad():
            for i, (val_a, val_b) in enumerate(seven_val_images):
                outputs_val = softmax(
                    model([val_a, val_b], only_encode=ENCODER_ONLY))
                cv2.imwrite(
                    os.path.join(OUTPUT_DIR, str(i),
                                 'epoch' + str(epoch) + '_output.tiff'),
                    (((outputs_val[0, 1, :, :] > 0.5) * 255).squeeze().cpu().numpy()).astype('uint8'))

    # NOTE(review): the collapsed source does not show whether the timing
    # report and the weight save sat inside the epoch loop; they are placed
    # after it here, matching the start timestamp taken before the loop.
    my_end_time = time.time()
    print(my_end_time - my_start_time)
    print(
        'loss: {average:', average, '} (epoch: {', epoch, '}, step: {', step,
        '})', "// Avg time/img: %.4f s" %
        (sum(time_train) / len(time_train) / BATCH_SIZE))

    # TODO: a per-epoch validation loop over loader_val (loss + IoU) used to
    # live here but was disabled; restore it if validation metrics are needed.

    torch.save(model.state_dict(), r'C:\Users\inbal.tlgip\modelsave.pt')

    # Qualitative analysis: write thresholded predictions for the test split.
    dataset_test = idd_lite(DATA_ROOT, co_transform_val, 'test')
    loader_test = DataLoader(dataset_test, num_workers=NUM_WORKERS,
                             batch_size=BATCH_SIZE, shuffle=True)

    model.eval()
    with torch.no_grad():
        for step, (images, images1, labels, filename) in enumerate(loader_test):
            outputs_val = softmax(
                model([images.to(device), images1.to(device)],
                      only_encode=ENCODER_ONLY))
            cv2.imwrite(
                r'D:\Users Data\inbal.tlgip\Project\output_images\test_output/'
                + str(step) + '.tiff',
                (((outputs_val[0, 1, :, :] > 0.5) * 255).squeeze().cpu().numpy()).astype('uint8'))
def train_segmentation(self):
    """Train the segmentation branch, validating after every epoch.

    Runs ``self.__Param["epochs_num"]`` epochs starting at the current
    ``self.model.step``. Each epoch feeds every training batch through
    ``self.model.optimize_segment``, accumulates the segmentation loss and
    IoU, then calls ``self.valid_segmentation()``. The model is saved when
    the epoch index is a multiple of ``self.__Param["save_frequency"]`` and
    after the final epoch.
    """
    with self.sess.as_default():
        self.logger.info('start training segmentation net')
        print('Start training for {} epoches, {} steps per epoch'.format(
            self.__Param["epochs_num"],
            self.DataManager_train.number_batch))

        # Freeze the epoch bounds before the loop: self.model.step is
        # incremented every epoch, so re-reading it later to detect the last
        # epoch would compare against a moving target and never match.
        first_epoch = self.model.step
        last_epoch = first_epoch + self.__Param["epochs_num"] - 1

        for i in range(first_epoch, last_epoch + 1):
            trainIoU = iouEval(self.__Param["batch_size"])
            print('Epoch {}:'.format(i))

            iter_loss = 0.0
            num_step = 0.0
            # Never updated in this method; kept so the log line below keeps
            # its existing format.
            accuracy = 0.0

            with tqdm(total=self.DataManager_train.number_batch) as pbar:
                for batch in range(self.DataManager_train.number_batch):
                    img_batch, mask_batch, label_batch, _ = self.sess.run(
                        self.DataManager_train.next_batch)

                    a, b, _, loss_value_batch = self.sess.run(
                        [
                            self.model.mask, self.model.mask_out,
                            self.model.optimize_segment,
                            self.model.segmentation_loss
                        ],
                        feed_dict={
                            self.model.Image: img_batch,
                            self.model.mask: mask_batch,
                            self.model.label: label_batch,
                            self.model.is_training_seg: TRAIN_MODE_IN_TRAIN,
                            self.model.is_training_dec: False
                        })
                    trainIoU.addBatch(a, b)
                    iter_loss += loss_value_batch
                    num_step = num_step + 1
                    pbar.update(1)

            iter_loss /= num_step
            train_iou = trainIoU.getIoU()
            self.logger.info(
                'epoch:[{}] ,train_mode, loss: {}, accuracy: {}'.format(
                    self.model.step, iter_loss, accuracy))

            # Validate
            self.model.step += 1
            val_loss, val_iou = self.valid_segmentation()
            print('train_loss:{}, train_iou:{}, val_loss:{}, val_iou:{}'.
                  format(iter_loss, train_iou, val_loss, val_iou))

            # Save the model
            if i % self.__Param["save_frequency"] == 0 or i == last_epoch:
                self.model.save()
def main(args):
    """Evaluate saved ERFNet weights on a Cityscapes subset and print IoU.

    Args:
        args: parsed options. Reads ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``,
            ``num_workers`` and ``batch_size``.

    Prints the name of each processed image, the elapsed time, the per-class
    IoU for the 19 named classes and the mean IoU.
    """
    class_names = (
        "Road", "sidewalk", "building", "wall", "fence", "pole",
        "traffic light", "traffic sign", "vegetation", "terrain", "sky",
        "person", "rider", "car", "truck", "bus", "train", "motorcycle",
        "bicycle",
    )

    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = ERFNet(NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if not args.cpu:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy matching entries of ``state_dict`` into ``model``.

        Entries whose name is missing from the model are reported and
        skipped rather than raising.
        """
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                print(name, " not loaded")
                continue
            own_state[name].copy_(param)
        return model

    # Remap storages to the CPU in --cpu mode so weights saved from a GPU
    # run can still be deserialised on a machine without CUDA.
    map_location = "cpu" if args.cpu else None
    model = load_my_state_dict(
        model, torch.load(weightspath, map_location=map_location))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    loader = DataLoader(cityscapes(args.datadir,
                                   input_transform_cityscapes,
                                   target_transform_cityscapes,
                                   subset=args.subset),
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=False)

    iouEvalVal = iouEval(NUM_CLASSES)

    start = time.time()

    # no_grad replaces the defunct `volatile=True` flag, which no longer
    # stops autograd from recording the forward pass.
    with torch.no_grad():
        for step, (images, labels, filename, filenameGt) in enumerate(loader):
            if not args.cpu:
                images = images.cuda()
                labels = labels.cuda()

            outputs = model(images)
            iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, labels)

            filenameSave = filename[0].split("leftImg8bit/")[1]
            print(step, filenameSave)

    iouVal, iou_classes = iouEvalVal.getIoU()

    iou_classes_str = []
    for i in range(iou_classes.size(0)):
        iouStr = getColorEntry(iou_classes[i]) + '{:0.2f}'.format(
            iou_classes[i] * 100) + '\033[0m'
        iou_classes_str.append(iouStr)

    print("---------------------------------------")
    print("Took ", time.time() - start, "seconds")
    print("=======================================")
    print("Per-Class IoU:")
    for index, class_name in enumerate(class_names):
        print(iou_classes_str[index], class_name)
    print("=======================================")
    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
    print("MEAN IoU: ", iouStr, "%")
def train(args, model, enc=False):
    """Train a two-class ``model``, validating and checkpointing each epoch.

    IoU is computed on the network output upsampled by a fixed factor (16
    when ``enc`` is true, otherwise 2) against the ``labels_orig`` entry of
    each batch.

    Args:
        args: parsed options. Reads ``datadir``, ``height``, ``num_workers``,
            ``batch_size``, ``cuda``, ``weighted``, ``savedir``, ``resume``,
            ``num_epochs``, ``visualize``, ``steps_plot``, ``port``,
            ``iouTrain``, ``iouVal``, ``ignoreindex``, ``steps_loss`` and
            ``epochs_save``.
        model: network invoked as ``model(inputs, only_encode=enc)``.
        enc: when true, selects the encoder class weights, the x16
            upsampling and the ``*_encoder`` / ``*_enc`` output file names.

    Returns:
        ``model`` (convenience for encoder-decoder training).

    Raises:
        AssertionError: if ``args.datadir`` does not exist, or ``args.resume``
            is set and the checkpoint file is missing.
    """
    # Start at -inf rather than 0: when IoU is not evaluated the epoch score
    # is the negated validation loss, which is below 0 and could otherwise
    # never be recorded as the best.
    best_acc = float('-inf')

    # TODO: calculate weights by processing dataset histogram (now its being
    # set by hand from the torch values): create a loader to run all images
    # and calculate histogram of labels, then create weight array using class
    # balancing.
    weight = torch.ones(NUM_CLASSES)
    if enc:
        weight[0] = 4.38133159
        weight[1] = 1.29574148
    else:
        weight[0] = 4.40513628
        weight[1] = 1.293674

    up = torch.nn.Upsample(scale_factor=16 if enc else 2, mode='bilinear')
    if args.cuda:
        up = up.cuda()

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers,
                            batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    if args.weighted:
        criterion = CrossEntropyLoss2d(weight)
    else:
        criterion = CrossEntropyLoss2d()

    print(type(criterion))

    savedir = args.savedir

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the column header only when the log file does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # TODO: reduce memory in first gpu:
    # https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4
    # https://github.com/pytorch/pytorch/issues/1893
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    # Polynomial decay of the learning rate over args.num_epochs epochs.
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    plotting = args.visualize and args.steps_plot > 0
    if plotting:
        board = Dashboard(args.port)

    def plot_batch(tag, epoch, step, inputs, outputs, targets):
        """Send the first sample of a batch to the dashboard."""
        start_time_plot = time.time()
        board.image(inputs[0].cpu().data,
                    f'{tag}input (epoch: {epoch}, step: {step})')
        # A list of outputs holds one entry per GPU; plot the first one.
        first = outputs[0][0] if isinstance(outputs, list) else outputs[0]
        board.image(color_transform(first.cpu().max(0)[1].data.unsqueeze(0)),
                    f'{tag}output (epoch: {epoch}, step: {step})')
        board.image(color_transform(targets[0].cpu().data),
                    f'{tag}target (epoch: {epoch}, step: {step})')
        print("Time to paint images: ", time.time() - start_time_plot)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES, args.ignoreindex)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels, images_orig, labels_orig) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = images
            targets = labels
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            # .item() reads the scalar loss; indexing a 0-dim tensor with
            # [0] is rejected by current PyTorch.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                # The upsampled copy is only used for the metric, so keep it
                # out of the autograd graph.
                with torch.no_grad():
                    upsampledOutputs = up(outputs)
                iouEvalTrain.addBatch(upsampledOutputs.max(1)[1].unsqueeze(1).data, labels_orig)

            if plotting and step % args.steps_plot == 0:
                plot_batch('', epoch, step, inputs, outputs, targets)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%", iou_classes)

        # Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES, args.ignoreindex)

        # no_grad replaces the defunct `volatile=True` flag, which no longer
        # stops autograd from recording the forward pass.
        with torch.no_grad():
            for step, (images, labels, images_orig, labels_orig) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()

                inputs = images
                targets = labels
                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                # Add batch to calculate TP, FP and FN for iou estimation
                if doIouVal:
                    upsampledOutputs = up(outputs)
                    iouEvalVal.addBatch(upsampledOutputs.max(1)[1].unsqueeze(1).data, labels_orig)

                if plotting and step % args.steps_plot == 0:
                    plot_batch('VAL ', epoch, step, inputs, outputs, targets)
                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                          "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%", iou_classes)

        # Remember best valIoU and save checkpoint; fall back to the negated
        # validation loss when no IoU is available.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # SAVE MODEL AFTER EPOCH
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # Periodic snapshots are keyed on the epoch number; the leftover
        # validation `step` index says nothing about training progress.
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if not enc:
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        # SAVE TO FILE A ROW WITH THE EPOCH RESULT
        # Epoch  Train-loss  Test-loss  Train-IoU  Test-IoU  learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model
def train(args, model, enc=False):
    """Train ``model`` with a class-weighted focal loss, validating every epoch.

    Each epoch runs one pass over the training loader, one pass over the
    validation loader, prints per-class IoU / accuracy when ``args.iouVal`` is
    set, writes checkpoints and appends one row to the automated log file.

    Args:
        args: Parsed options. Fields read here: ``datadir``, ``height``,
            ``num_workers``, ``batch_size``, ``cuda``, ``savedir``,
            ``num_epochs``, ``iouVal``, ``steps_loss``, ``epochs_save``.
        model: Network called as ``model(inputs, only_encode=enc)``.
        enc: When true, encoder-only mode; selects the ``*_encoder`` /
            ``*_enc`` file names and is forwarded to the model and transforms.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).

    Side effects:
        Reads and updates the module-level ``best_acc``; writes log,
        checkpoint and weight files under ``../save/<args.savedir>``.
    """
    global best_acc

    # Per-class loss weights, indexed by class id (the last listed class gets
    # weight 0). Classes beyond this table keep the default weight of 1.
    class_weights = (
        121.21, 947.02, 151.92, 428.31, 25.88, 235.97, 885.72, 911.87,
        307.49, 204.69, 813.92, 5.83, 34.22, 453.34, 346.10, 250.19,
        119.99, 75.28, 76.71, 8.58, 281.68, 924.07, 3.91, 7.14,
        88.89, 59.00, 126.59, 0,
    )
    weight = torch.ones(NUM_CLASSES)
    for class_id, class_weight in enumerate(class_weights):
        weight[class_id] = class_weight

    # Console labels for the per-class report, in class-id order.
    class_labels = (
        "pole :", "slight :", "bboard :", "tlight :", "car :", "truck :",
        "bicycle :", "motor :", "bus :", "tsignf :", "tsignb :", "road :",
        "sidewalk:", "curbcut :", "crosspln:", "bikelane:", "curb :",
        "fence :", "wall :", "building:", "person :", "rider :", "sky :",
        "vege :", "terrain :", "markings:", "crosszeb:",
    )

    def print_class_scores(scores):
        # One line per labelled class, value shown as a percentage.
        for label, score in zip(class_labels, scores):
            print("%s %.6f" % (label, score * 100.0), "%\t")

    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    # Only move the weights to the GPU when CUDA was requested; the previous
    # CPU branch also called .cuda(), which breaks CPU-only runs.
    if args.cuda:
        weight = weight.cuda()
    criterion = FocalLoss2d(weight)
    print(type(criterion))

    savedir = f'../save/{args.savedir}'
    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Only write the header row when the log does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    optimizer = Adam(model.parameters(),
                     1e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=1e-4)
    start_epoch = 1

    def poly_lr(epoch):
        # Polynomial decay factor applied to the base learning rate.
        return pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=poly_lr)

    do_iou_val = args.iouVal
    time_train_perepoch = []
    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        start_time_perepoch = time.time()

        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            # .item() yields a plain float; indexing a 0-dim tensor with
            # loss.data[0] is rejected by current PyTorch.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        time_train_perepoch.append(time.time() - start_time_perepoch)
        print("// Time per epoch: %.4f hours" %
              (sum(time_train_perepoch) / len(time_train_perepoch) / 3600.0))

        # Validate after each epoch of training.
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if do_iou_val:
            iouEvalVal = iouEval(NUM_CLASSES)

        with torch.no_grad():
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()

                inputs = Variable(images, requires_grad=False)
                targets = Variable(labels, requires_grad=False)
                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                if do_iou_val:
                    iouEvalVal.addBatch(
                        outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(
                        f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                        "// Avg time/img: %.4f s" %
                        (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        # Class-level scores from the accumulated statistics.
        iouVal = 0
        iouTrain = 0  # train IoU is not computed here; logged as 0
        if do_iou_val:
            iouVal, iou_classes, accVal, acc_classes = iouEvalVal.getIoU()

            print_class_scores(iou_classes)
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

            print_class_scores(acc_classes)
            accStr = getColorEntry(accVal) + '{:0.2f}'.format(
                accVal * 100) + '\033[0m'
            print("EPOCH ACC on VAL set: ", accStr, "%")

        # Remember the best validation score. Without an IoU, fall back to the
        # negated loss so that "larger is better" still holds.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc and epoch == args.num_epochs:
            # Reset the shared best score once encoder training finishes.
            best_acc = 0

        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth'
            filenameBest = savedir + '/model_best_enc.pth'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth'
            filenameBest = savedir + '/model_best.pth'
        save_checkpoint({
            'state_dict': model.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # Save model after epoch.
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best_each.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best_each.pth'

        # Periodic snapshot every `epochs_save` epochs (previously keyed on the
        # leftover validation step index).
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')

        # NOTE(review): the guard on this save was commented out in the
        # original, so it is treated as running every epoch -- confirm.
        torch.save(model.state_dict(), filenamebest)
        print(f'save: {filenamebest} (epoch: {epoch})')

        filenameSuperBest = f'{savedir}/model_superbest.pth'
        if is_best:
            torch.save(model.state_dict(), filenameSuperBest)
            print('saving superbest')
            best_txt = "/best_encoder.txt" if enc else "/best.txt"
            with open(savedir + best_txt, "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))

        # One row per epoch:
        # Epoch  Train-loss  Test-loss  Train-IoU  Test-IoU  learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model
def train(savedir,
          model,
          dataloader_train,
          dataloader_eval,
          criterion,
          optimizer,
          args,
          enc=False):
    """Train ``model``, validate every epoch and keep the lowest-loss weights.

    Args:
        savedir: Directory for logs, TensorBoard events, checkpoints, weights.
        model: Network called as ``model(inputs, only_encode=enc)``.
        dataloader_train: Iterable yielding ``(images, labels, _)`` batches.
        dataloader_eval: Iterable yielding ``(images, labels, _)`` batches.
        criterion: Loss called as ``criterion(outputs, targets[:, 0])``.
        optimizer: Optimizer stepping the model parameters.
        args: Parsed options. Fields read here: ``resume``, ``num_epochs``,
            ``visualize``, ``steps_plot``, ``port``, ``iouTrain``, ``iouVal``,
            ``cuda``, ``steps_loss``, ``batch_size``, ``epochs_save``.
        enc: When true, encoder-only mode; selects the ``*_encoder`` /
            ``*_enc`` file names and is forwarded to the model.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).
    """
    min_loss = float('inf')

    # use tensorboard
    writer = SummaryWriter(log_dir=savedir)

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Only write the header row when the log does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(
            filenameCheckpoint
        ), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # The checkpoint stores the running minimum loss under 'best_acc';
        # restore it so a resumed run does not treat its first epoch as best.
        min_loss = float(checkpoint['best_acc'])
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    def poly_lr(epoch):
        # Polynomial decay factor applied to the base learning rate.
        return pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=poly_lr)

    plot_enabled = args.visualize and args.steps_plot > 0
    if plot_enabled:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            # NOTE(review): `mean_and_var` is resolved from the enclosing
            # module; what iouEval expects here is not visible -- confirm.
            iouEvalTrain = iouEval(mean_and_var)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels, _) in enumerate(dataloader_train):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            # Store a plain float: appending the tensor itself would keep
            # every batch's autograd graph alive until the epoch ends.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if plot_enabled and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):  # merge gpu tensors
                    prediction = outputs[0][0]
                else:
                    prediction = outputs[0]
                board.image(
                    color_transform(
                        prediction.cpu().max(0)[1].data.unsqueeze(0)),
                    f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        writer.add_scalar('train_loss', average_epoch_loss_train, epoch)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validate after each epoch of training.
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(mean_and_var)

        for step, (images, labels, _) in enumerate(dataloader_eval):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            with torch.no_grad():
                outputs = model(inputs, only_encode=enc)
                loss = criterion(outputs, targets[:, 0])

            epoch_loss_val.append(loss.item())
            time_val.append(time.time() - start_time)

            # Feed the evaluator, mirroring the training loop; previously it
            # was created and queried without ever receiving a batch.
            if doIouVal:
                iouEvalVal.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(
                    f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        writer.add_scalar('eval_loss', average_epoch_loss_val, epoch)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # Model selection is by lowest validation loss.
        is_best = average_epoch_loss_val < min_loss
        min_loss = min(min_loss, average_epoch_loss_val)

        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'best_acc': min_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, filenameCheckpoint, filenameBest)

        # Save model after epoch.
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'

        # Periodic snapshot every `epochs_save` epochs (previously keyed on the
        # leftover validation step index).
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            best_txt = "/best_encoder.txt" if enc else "/best.txt"
            with open(savedir + best_txt, "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))

        # One row per epoch:
        # Epoch  Train-loss  Test-loss  Train-IoU  Test-IoU  learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))

    writer.close()
    torch.save(model.state_dict(), f'{savedir}/weight_final.pth')
    return model
def train(args, model, classNum, epochNum, encoderOnly=False):
    """Train a two-input (RGB + depth/luminance) network on Cityscapes.

    Every epoch runs one pass over the training loader. Validation,
    checkpointing and the automated-log row only happen for epochs ``<= 10``
    or ``>= 70``.

    Args:
        args: Parsed options. Fields read here: ``dataset``, ``height``,
            ``saveDir``, ``num_workers``, ``batchSize``, ``optimizer``,
            ``lr``, ``cuda``, ``doEvalTrain``, ``doEvalVal``,
            ``onlyWholeNet``, ``iter_loss``, ``epochs_save``.
        model: Network to train.
        classNum: Number of classes, passed to ``getClassWeight``/``iouEval``.
        epochNum: Total number of epochs; also drives the poly LR schedule.
        encoderOnly: When true, encoder-only mode; selects the ``*_encoder`` /
            ``*_enc`` file names and is forwarded to the model and transforms.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).

    Raises:
        ValueError: If ``args.dataset`` or ``args.optimizer`` is unsupported.
    """
    start_epoch = 1
    # Start below any reachable score so the negated-loss fallback used when
    # no IoU is available can still register a "best" epoch.
    best_acc = float('-inf')

    # === Dataset Processing === #
    if args.dataset == 'cityscapes':
        co_transform = MyCoTransform(encoderOnly,
                                     dataAugment=True,
                                     height=args.height)
        co_transform_val = MyCoTransform(encoderOnly,
                                         dataAugment=False,
                                         height=args.height)
        dataDir = '/media/commlab/TenTB/swhung/SegNet/Cityscapes/'
        dataset_train = cityscapes(dataDir, co_transform, 'train')
        dataset_val = cityscapes(dataDir, co_transform_val, 'val')
    else:
        raise ValueError(f"Unsupported dataset: {args.dataset!r}")
    saveDir = f'../save/{args.saveDir}'

    loader_train = DataLoader(dataset_train,
                              num_workers=args.num_workers,
                              batch_size=args.batchSize,
                              shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batchSize,
                            shuffle=False)

    # === Optimization Setting === #
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=1e-4)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=1e-4)
    else:
        raise ValueError(f"Unsupported optimizer: {args.optimizer!r}")

    def poly_lr(epoch):
        # Polynomial decay factor applied to the base learning rate.
        return pow((1 - ((epoch - 1) / epochNum)), 0.9)

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=poly_lr)

    classWeight = getClassWeight(args.dataset, classNum)
    if args.cuda:
        classWeight = classWeight.cuda()
    criterion = CrossEntropyLoss2d(weight=classWeight, ignore_index=19)

    # === save information in .txt files === #
    if encoderOnly:
        automated_log_path = saveDir + "/automated_log_encoder.txt"
        modeltxtpath = saveDir + "/model_txt_encoder.txt"
    else:
        automated_log_path = saveDir + "/automated_log.txt"
        modeltxtpath = saveDir + "/model_txt.txt"

    # Only write the header row when the log does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    def forward(images):
        # Run the model on one batch that is already on the target device.
        if args.onlyWholeNet:
            # NOTE(review): the original passed an undefined `inputs` here;
            # assumed the whole network takes the unsplit batch -- confirm.
            return model(images)
        # Channels 0-2 are RGB, channels 3-4 are depth and luminance.
        channels = torch.split(images, 1, 1)
        rgb_inputs = torch.cat((channels[0], channels[1], channels[2]), 1)
        d_input = torch.cat((channels[3], channels[4]), 1)
        return model(rgb_inputs, d_input, only_encoder=encoderOnly)

    # === Training === #
    for epoch in range(start_epoch, epochNum + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        model.train()

        scheduler.step(epoch - 1)

        epoch_loss = []
        time_train = []

        if args.doEvalTrain:
            iouEvalTrain = iouEval(classNum)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("learning rate: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        # ** training iteration
        for step, (images, labels) in enumerate(loader_train):
            start_time = time.time()
            # Tensors stay on the CPU unless CUDA was requested; previously
            # the model inputs were only defined inside the CUDA branch.
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
            targets = labels

            outputs = forward(images)

            # run the back-propagation
            loss = criterion(outputs, targets[:, 0])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if args.doEvalTrain:
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            # print the training loss information
            if args.iter_loss > 0 and step % args.iter_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, iter: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batchSize))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if args.doEvalTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validation, checkpointing and logging only run early and late.
        if epoch > 10 and epoch < 70:
            continue

        with torch.no_grad():
            print("----- VALIDATING - EPOCH", epoch, "-----")
            model.eval()
            epoch_loss_val = []
            time_val = []

            if args.doEvalVal:
                iouEvalVal = iouEval(classNum)

            # ** validation iteration
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()
                targets = labels

                outputs = forward(images)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                # Add batch to calculate TP, FP and FN for iou estimation
                if args.doEvalVal:
                    iouEvalVal.addBatch(
                        outputs.max(1)[1].unsqueeze(1).data, targets.data)

                # print the validation loss information
                if args.iter_loss > 0 and step % args.iter_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(
                        f'VAL loss: {average:0.4} (epoch: {epoch}, iter: {step})',
                        "// Avg time/img: %.4f s" %
                        (sum(time_val) / len(time_val) / args.batchSize))

            average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

            # print epoch val IoU accuracy
            iouVal = 0
            if args.doEvalVal:
                iouVal, iou_classes = iouEvalVal.getIoU()
                iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                    iouVal * 100) + '\033[0m'
                print("EPOCH IoU on VAL set: ", iouStr, "%")

            # Remember the best validation score. Without an IoU, fall back
            # to the negated loss so that "larger is better" still holds.
            if iouVal == 0:
                current_acc = -average_epoch_loss_val
            else:
                current_acc = iouVal
            is_best = current_acc > best_acc
            best_acc = max(current_acc, best_acc)

            if encoderOnly:
                filenameCheckpoint = saveDir + '/checkpoint_enc.pth.tar'
                filenameBest = saveDir + '/model_best_encoder.pth.tar'
            else:
                filenameCheckpoint = saveDir + '/checkpoint.pth.tar'
                filenameBest = saveDir + '/model_best.pth.tar'
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': str(model),
                    'state_dict': model.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                }, is_best, filenameCheckpoint, filenameBest)

            if encoderOnly:
                filename = f'{saveDir}/model_encoder-{epoch:03}.pth'
                filenamebest = f'{saveDir}/model_best_encoder.pth'
            else:
                filename = f'{saveDir}/model-{epoch:03}.pth'
                filenamebest = f'{saveDir}/model_best.pth'

            # Periodic snapshot every `epochs_save` epochs (previously keyed
            # on the leftover validation iteration index).
            if args.epochs_save > 0 and epoch % args.epochs_save == 0:
                torch.save(model.state_dict(), filename)
                print(f'save: {filename} (epoch: {epoch})')

            # save the best model
            if is_best:
                torch.save(model.state_dict(), filenamebest)
                print(f'save: {filenamebest} (epoch: {epoch})')
                if encoderOnly:
                    best_txt = "/best_IoU_encoder.txt"
                else:
                    best_txt = "/best_IoU.txt"
                with open(saveDir + best_txt, "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                                 (epoch, iouVal))

            # One row per validated epoch:
            # Epoch  Train-loss  Test-loss  Train-IoU  Test-IoU  learningRate
            with open(automated_log_path, "a") as myfile:
                myfile.write(
                    "\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                    (epoch, average_epoch_loss_train, average_epoch_loss_val,
                     iouTrain, iouVal, usedLr))

    return model
def train(args, get_dataset, model, enc=False): best_acc = 0 num_epochs = 10 if args.debug else args.num_epochs n_gpus = torch.cuda.device_count() print("\nWorking with {} GPUs".format(n_gpus)) datasets = args.datasets entropy = (args.alpha + args.beta) > 0 if entropy: assert len( datasets ) > 1, "Entropy Module undefined with single dataset. Exiting ... " NUM_LABELS = get_dataset.num_labels dataset_train = { dname: get_dataset(dname, 'train', args.num_samples) for dname in datasets } dataset_val = {dname: get_dataset(dname, 'val', 100) for dname in datasets} # dataset_unlabeled = {dname: get_dataset(dname, co_transform, 'train_extra' , mode='unlabeled') for dname in datasets} dataset_unlabeled = { dname: get_dataset(dname, 'train', mode='unlabeled') for dname in datasets } if entropy: n_unlabeled = np.max( [len(dataset_unlabeled[dname]) for dname in datasets]) print("Working with {} Dataset(s):".format(len(datasets))) for key in datasets: print( "{}: Unlabeled images {}, Training on {} images, Validation on {} images" .format(key, len(dataset_unlabeled[key]), len(dataset_train[key]), len(dataset_val[key]))) for d in datasets: if len(set(dataset_train.values())) != 1: max_train_size = np.max( [len(dataset_train[dname]) for dname in datasets]) dataset_train[d].image_paths = dataset_train[d].image_paths * int( np.ceil( float(max_train_size) / len(dataset_train[d].image_paths))) dataset_train[d].label_paths = dataset_train[d].label_paths * int( np.ceil( float(max_train_size) / len(dataset_train[d].label_paths))) # if entropy: # dataset_unlabeled[d].image_paths = dataset_unlabeled[d].image_paths*int(np.ceil(float(n_unlabeled)/len(dataset_unlabeled[d].image_paths))) # dataset_unlabeled[d].label_paths = dataset_unlabeled[d].label_paths*int(np.ceil(float(n_unlabeled)/len(dataset_unlabeled[d].label_paths))) loader_train = { dname: DataLoader(dataset_train[dname], num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True) for dname in datasets } loader_val = { 
dname: DataLoader(dataset_val[dname], num_workers=args.num_workers, batch_size=2, shuffle=True, drop_last=True) for dname in datasets } # epoch_iters = np.min([len(loader_train[dname]) for dname in datasets]) if entropy: loader_unlabeled = { dname: DataLoader(dataset_unlabeled[dname], num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True, drop_last=True) for dname in datasets } # epoch_iters = np.min([ np.min([len(loader[dname]) for dname in datasets]) for loader in [loader_train , loader_unlabeled]]) savedir = f'../save_drnet/{args.savedir}' if (enc): automated_log_path = savedir + "/automated_log_encoder.txt" modeltxtpath = savedir + "/model_encoder.txt" else: automated_log_path = savedir + "/automated_log.txt" modeltxtpath = savedir + "/model.txt" loss_logpath = savedir + "/loss_log.txt" if (not os.path.exists(automated_log_path) ): #dont add first line if it exists with open(automated_log_path, "a") as myfile: if len(datasets) > 1: myfile.write( "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU-1\t\tTrain-IoU-2\t\tVal-IoU-1\t\tVal-IoU-2\t\tlearningRate" ) else: myfile.write( "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tVal-IoU\t\tlearningRate" ) with open(modeltxtpath, "w") as myfile: myfile.write(str(model)) if (not os.path.exists(loss_logpath)): with open(loss_logpath, "w") as myfile: if len(datasets) > 1: myfile.write("Epoch\t\tS1\t\tS2\t\tUS1\t\tUS2\t\tTotal\n") else: myfile.write("Epoch\t\tS1\t\tS2\t\tTotal\n") #TODO: reduce memory in first gpu: https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4 #https://github.com/pytorch/pytorch/issues/1893 if args.model == 'drnet': optimizer = SGD(model.optim_parameters(), args.lr, 0.9, weight_decay=1e-4) ## scheduler DR-Net if args.cuda: model = torch.nn.DataParallel(model).cuda() doIou = {'train': args.iouTrain, 'val': args.iouVal} le_file = savedir + '/label_embedding.pt' average_epoch_loss = {'train': np.inf, 'val': np.inf} label_embedding = { key: 
torch.randn(NUM_LABELS[key], args.em_dim).cuda() for key in datasets } ## Random Initialization ## If provided, use label embedddings if args.pt_em: fn = torch.load(args.pt_em) label_embedding = { key: torch.tensor(fn[key], dtype=torch.float).cuda() for key in datasets } start_epoch = 1 if args.resume: #Must load weights, optimizer, epoch and best value. if enc: filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar' else: filenameCheckpoint = savedir + '/checkpoint.pth.tar' assert os.path.exists( filenameCheckpoint ), "Error: resume option was used but checkpoint was not found in folder" checkpoint = torch.load(filenameCheckpoint) start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) best_acc = checkpoint['best_acc'] label_embedding = torch.load(le_file) if len(datasets) > 1 else None print("=> Loaded checkpoint at epoch {}".format(checkpoint['epoch'])) scheduler = lr_scheduler.LambdaLR( optimizer, lr_lambda=lambda epoch: pow( (1 - ((epoch - 1) / args.num_epochs)), 0.9)) ## scheduler 2 loss_criterion = { key: torch.nn.CrossEntropyLoss(ignore_index=NUM_LABELS[key] - 1).cuda() for key in datasets } if len(datasets) > 1: similarity_module = EmbeddingLoss(NUM_LABELS, args.em_dim, label_embedding, loss_criterion) similarity_module = torch.nn.DataParallel(similarity_module).cuda() torch.save(label_embedding, le_file) print() print("========== STARTING TRAINING ===========") print() n_iters = min([len(loader_train[d]) for d in datasets]) if entropy: unlabeled_iters = { d: len(loader_unlabeled[d]) // n_iters for d in datasets } for epoch in range(start_epoch, num_epochs + 1): epoch_start_time = time.time() usedLr = 0 iou = {key: (0, 0) for key in datasets} ###### TRAIN begins ################# for phase in ['train']: eval_iou = doIou[phase] print("-----", phase, "- EPOCH", epoch, "-----") scheduler.step(epoch) model.train() for param_group in optimizer.param_groups: print("LEARNING RATE: ", 
param_group['lr']) usedLr = float(param_group['lr']) ## Initialize the iterables labeled_iterator = { dname: iter(loader_train[dname]) for dname in datasets } if entropy: unlabeled_iterator = { dname: iter(loader_unlabeled[dname]) for dname in datasets } if args.alpha: alpha = 1 if args.beta: beta = 1 epoch_loss = {d: [] for d in datasets} epoch_sup_loss = {d: [] for d in datasets} epoch_ent_loss = {d: [] for d in datasets} time_taken = [] if (eval_iou): iou_data = {key: iouEval(NUM_LABELS[key]) for key in datasets} for itr in range(n_iters): optimizer.zero_grad() loss_sup = {d: 0 for d in datasets} loss_ent = {d: [0] for d in datasets} for d in datasets: images_l, targets_l = next(labeled_iterator[d]) images_l = images_l.cuda() targets_l = targets_l.cuda() start_time = time.time() dec_outputs = model(images_l, enc=False, finetune=args.finetune) loss_s = loss_criterion[d](dec_outputs[d], targets_l.squeeze(1)) loss_s.backward() loss_sup[d] = loss_s.item() if entropy: for _ in range(unlabeled_iters[d]): images_u = next(unlabeled_iterator[d]) images_u = images_u.cuda() _, en_outputs = model(images_u) loss_e = torch.mean( similarity_module( en_outputs, d, args.alpha, args.beta)) ## unsupervised losses loss_e /= unlabeled_iters[d] loss_e.backward() loss_ent[d].append(loss_e.item()) epoch_sup_loss[d].append(loss_sup[d]) epoch_ent_loss[d].extend(loss_ent[d]) epoch_loss[d].append( loss_sup[d] + np.sum(loss_ent[d])) ## Already averaged over iters time_taken.append(time.time() - start_time) optimizer.step() if args.steps_loss > 0 and (itr % args.steps_loss == 0 or itr == n_iters - 1): average = { d: np.around(sum(epoch_loss[d]) / len(epoch_loss[d]), 3) for d in datasets } print( f'{phase} loss: {average} (epoch: {epoch}, step: {itr})', "// Avg time/img: %.4f s" % (sum(time_taken) / len(time_taken) / args.batch_size)) average = {d: np.mean(epoch_loss[d]) for d in datasets} average_epoch_loss[phase] = sum(average.values()) if entropy: average_epoch_sup_loss = { d: 
np.mean(epoch_sup_loss[d]) for d in datasets } average_epoch_ent_loss = { d: np.mean(epoch_ent_loss[d]) for d in datasets } ## Write the epoch wise supervised and total unsupervised losses. with open(loss_logpath, "a") as myfile: if len(datasets) > 1 and (itr % args.steps_loss == 0 or itr == n_iters - 1): myfile.write( "%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\n" % (epoch, average_epoch_sup_loss.get(datasets[0], 0), average_epoch_sup_loss.get(datasets[1], 0), average_epoch_ent_loss.get(datasets[0], 0), average_epoch_ent_loss.get(datasets[1], 0), average_epoch_loss[phase])) ## Todo: A better way to close the worker threads. for d in datasets: while True: try: _ = next(labeled_iterator[d]) except StopIteration: break if entropy: while True: try: _ = next(unlabeled_iterator[d]) except StopIteration: break iou = {key: (0, 0) for key in datasets} if (eval_iou): iou = {key: iou_data[key].getIoU() for key in datasets} iouStr_label = { key: '{:0.2f}'.format(iou[key][0] * 100) for key in datasets } for d in datasets: print("EPOCH IoU on {} dataset: {} %".format( d, iouStr_label[d])) ########## Train ends ############################### ##### Validation ############### if (epoch == 1) or (epoch % 5 == 0): ## validation after every 5 epoch for phase in ['val']: eval_iou = doIou[phase] print("-----", phase, "- EPOCH", epoch, "-----") model.eval() if (eval_iou): iou_data = {d: iouEval(NUM_LABELS[d]) for d in datasets} epoch_val_loss = {d: [] for d in datasets} if args.pAcc: pAcc = {d: [] for d in datasets} for d in datasets: time_taken = [] for itr, (images, targets) in enumerate(loader_val[d]): start_time = time.time() images = images.cuda() targets = targets.cuda() with torch.set_grad_enabled(False): seg_output = model(images, enc=False) loss = loss_criterion[d](seg_output[d], targets.squeeze(1)) if eval_iou: pred = seg_output[d].argmax(1, True).data iou_data[d].addBatch(pred, targets.data) if args.pAcc: a = (pred == targets.data) pAcc[d].append(torch.mean(a.double())) 
epoch_val_loss[d].append(loss.item()) time_taken.append(time.time() - start_time) if args.steps_loss > 0 and (itr % args.steps_loss == 0 or itr == len(loader_val[d]) - 1): average = np.around(np.mean(epoch_val_loss[d]), 3) print( f'{d}: {phase} loss: {average} (epoch: {epoch}, step: {itr})', "// Avg time/img: %.4f s" % (sum(time_taken) / len(time_taken) / args.batch_size)) average_epoch_loss[phase] = np.sum( [np.mean(epoch_val_loss[d]) for d in datasets]) if (eval_iou): iou = {d: iou_data[d].getIoU() for d in datasets} iouStr_label = { d: '{:0.2f}'.format(iou[d][0] * 100) for d in datasets } for d in datasets: print("EPOCH IoU on {} dataset: {} %".format( d, iouStr_label[d])) if args.pAcc: print(f'{d}: pAcc : {np.mean(pAcc[d])*100}%') ############# VALIDATION ends ####################### print("Epoch time {} s".format(time.time() - epoch_start_time)) # remember best valIoU and save checkpoint if sum([iou[key][0] for key in datasets]) == 0: current_acc = -average_epoch_loss['val'] else: current_acc = sum([iou[key][0] for key in datasets]) / len( datasets) ## Average of the IoUs to save best model is_best = current_acc > best_acc best_acc = max(current_acc, best_acc) filenameCheckpoint = savedir + '/checkpoint.pth.tar' filenameBest = savedir + '/model_best.pth.tar' save_checkpoint( { 'epoch': epoch + 1, 'arch': str(model), 'state_dict': model.state_dict(), 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, filenameCheckpoint, filenameBest) #SAVE MODEL AFTER EPOCH filename = f'{savedir}/model-{epoch:03}.pth' filenamebest = f'{savedir}/model_best.pth' if args.epochs_save > 0 and epoch > 0 and epoch % args.epochs_save == 0: torch.save(model.state_dict(), filename) print(f'save: {filename} (epoch: {epoch})') if (is_best): torch.save(model.state_dict(), filenamebest) print(f'save: {filenamebest} (epoch: {epoch})') with open(savedir + "/best.txt", "w") as myfile: myfile.write("Best epoch is %d\n" % (epoch)) for d in datasets: myfile.write("Val-IoU-%s= 
%.4f\n" % (d, iou[d][0])) myfile.write("\n\n") for d in datasets: myfile.write( "Classwise IoU for best epoch in %s is ... \n" % (d)) for values in iou[d][1]: myfile.write("%.4f " % (values)) myfile.write("\n\n") with open(automated_log_path, "a") as myfile: iouTrain = 0 if len(datasets) > 1: myfile.write( "\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss['train'], average_epoch_loss['val'], iouTrain, iouTrain, iou[datasets[0]][0], iou[datasets[1]][0], usedLr)) else: myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss['train'], average_epoch_loss['val'], iouTrain, iou[datasets[0]][0], usedLr)) return (model)
def main(args):
    """Evaluate a pretrained FSFNet on a CamVid subset and print IoU results.

    Builds ``FSFNet(NUM_CLASSES)``, copies into it the weights stored at
    ``args.loadDir + args.loadWeights``, runs the model over every batch of
    the dataset built by ``camvid(..., subset=args.subset)`` and prints the
    per-class and mean IoU returned by ``iouEval.getIoU()``.

    Args:
        args: Parsed command-line options. Reads ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``,
            ``num_workers`` and ``batch_size``.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = FSFNet(NUM_CLASSES)
    if not args.cpu:
        model = torch.nn.DataParallel(model).cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint tensors into ``model``, skipping unknown names."""
        own_state = model.state_dict()
        for name, param in state_dict.items():
            target = name
            if target not in own_state and name.startswith("module."):
                # Retry without the "module." prefix (presumably left by a
                # DataParallel-wrapped checkpoint -- confirm against the
                # script that saved the weights).
                target = name.split("module.")[-1]
            if target not in own_state:
                # Previously a missing stripped name raised KeyError; report
                # it like any other unmatched entry instead.
                print(name, " not loaded")
                continue
            own_state[target].copy_(param)
        return model

    # map_location keeps every tensor on the CPU while the file is read.
    model = load_my_state_dict(
        model,
        torch.load(weightspath, map_location=lambda storage, loc: storage))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    loader = DataLoader(camvid(args.datadir,
                               input_transform_camvid,
                               target_transform_camvid,
                               subset=args.subset),
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=False)

    iouEvalVal = iouEval(NUM_CLASSES)

    start = time.time()

    # torch.no_grad() replaces Variable(..., volatile=True): the volatile flag
    # is ignored by PyTorch >= 0.4, which made inference track gradients.
    with torch.no_grad():
        for step, (images, labels, filename, _) in enumerate(loader):
            if not args.cpu:
                images = images.cuda()
                labels = labels.cuda()

            outputs = model(images)

            # argmax over dim 1, kept as an explicit singleton dimension.
            iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, labels)

            filenameSave = filename[0].split("images/")[1]
            print(step, filenameSave)

    iouVal, iou_classes = iouEvalVal.getIoU()

    iou_classes_str = []
    for i in range(iou_classes.size(0)):
        iouStr = getColorEntry(iou_classes[i]) + '{:0.2f}'.format(
            iou_classes[i] * 100) + '\033[0m'
        iou_classes_str.append(iouStr)

    print("---------------------------------------")
    print("Took ", time.time() - start, "seconds")
    print("=======================================")
    print("Per-Class IoU:")
    class_names = ("Sky", "Building", "Pole", "Road", "Pavement", "Tree",
                   "SignSymbol", "Fence", "Car", "Pedestrian", "Bicyclist")
    for idx, class_name in enumerate(class_names):
        print(iou_classes_str[idx], class_name)
    print("=======================================")
    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
    print("MEAN IoU: ", iouStr, "%")
def train(args, rmodel, model, enc=False):
    """Evaluate the feedback restoration + segmentation pipeline on 'val'.

    Despite its name, this variant never calls ``backward()`` or
    ``optimizer.step()``. For every epoch it only runs ``model`` (fed by
    ``rmodel``) over the Cityscapes ``'val'`` split, prints the running
    validation loss and, when ``args.iouVal`` is set, the IoU.

    Args:
        args: Parsed options. Reads ``datadir``, ``height``, ``num_workers``,
            ``batch_size``, ``cuda``, ``savedir``, ``num_epochs``, ``iouVal``
            and ``steps_loss``.
        rmodel: Restoration network, called as
            ``rmodel(x, flag=..., r_fb1=..., r_fb2=...)``.
        model: Segmentation network, called as ``model(x, only_encode=enc)``.
        enc: Forwarded as ``only_encode``; also selects the ``*_encoder.txt``
            log and model-description file names.

    Returns:
        The ``model`` object that was passed in.
    """
    weight = classWeights(NUM_CLASSES)

    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(augment=True, height=args.height)
    co_transform_val = MyCoTransform(augment=False, height=args.height)
    # NOTE(review): the training split and its loader are still built, as in
    # the original, but this variant never iterates over them.
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    savedir = '/home/shyam.nandan/NewExp/F_erfnet_pytorch_ours_w_gt_v2_multiply/save/' + args.savedir  # change path

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the column header only when the log file does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    optimizer = Adam(model.parameters(),
                     5e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=2e-4)
    roptimizer = Adam(rmodel.parameters(), 2e-4, (0.9, 0.999))

    start_epoch = 1

    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30, gamma=0.5)

    # gamma / alpha handed to CB_iFl for the validation loss.
    val_gamma = 0
    val_alpha = 1

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step()
        rscheduler.step()

        doIouVal = args.iouVal

        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])

        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # torch.no_grad() replaces the Variable(..., volatile=True) wrappers,
        # which PyTorch >= 0.4 silently ignores.
        with torch.no_grad():
            for step, (timages, _, labels, filename) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    labels = labels.cuda()
                    timages = timages.cuda()

                inputs = timages
                targets = labels

                # Initial pass without feedback.
                ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
                outs = model(ss_inputs, only_encode=enc)

                tminus_outs = outs.detach()
                tplus_outs = outs.detach()

                # Three feedback passes: rmodel receives the difference of the
                # two most recent segmentation outputs and its own previous
                # output. The L1 restoration loss the original computed here
                # was discarded immediately, so it is no longer evaluated.
                for num_feedback in range(3):
                    optimizer.zero_grad()
                    roptimizer.zero_grad()

                    ss_inputs = rmodel(inputs,
                                       flag=1,
                                       r_fb1=(tplus_outs - tminus_outs),
                                       r_fb2=ss_inputs.detach())
                    outs = model(ss_inputs.detach(), only_encode=enc)

                    tminus_outs = tplus_outs
                    tplus_outs = outs.detach()

                outputs = outs
                del outs, tminus_outs, tplus_outs
                gc.collect()

                loss = CB_iFl(outputs,
                              targets[:, 0],
                              weight,
                              gamma=val_gamma,
                              alpha=val_alpha)

                # .item() instead of loss.data[0]: indexing a 0-dim tensor
                # raises on current PyTorch.
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                if doIouVal:
                    prediction = outputs.max(1)[1].unsqueeze(1).data
                    # Per-batch evaluator, used only for the file name in the
                    # optional save below.
                    iouEvalVal_img = iouEval(NUM_CLASSES)
                    iouEvalVal_img.addBatch(prediction, targets.data)
                    iouEvalVal.addBatch(prediction, targets.data)

                    label_color = Colorize()(
                        outputs[0].max(0)[1].byte().cpu().data.unsqueeze(0))
                    label_save = ToPILImage()(label_color)

                    filenameSave = '../save_color_restored_joint_afl_CBFL/' + filename[
                        0].split('/')[-2]
                    im_iou, _ = iouEvalVal_img.getIoU()
                    os.makedirs(filenameSave, exist_ok=True)
                    #Uncomment to save output
                    #label_save.save(filenameSave+ '/' + str(" %6.4f " %im_iou[0].data.numpy()) + '_' + filename[0].split('/')[-1])

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print('Val loss: ', average, 'Epoch: ', epoch, 'Step: ',
                          step)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print(iouVal, iou_classes, iouStr)

    return (model)
def train(args, rmodel, model, enc=False):
    """Jointly train the restoration and segmentation networks on Cityscapes.

    Each epoch runs one pass over the ``'train'`` split followed by one pass
    over the ``'val'`` split. For every training batch ``rmodel`` is updated
    three times with an L1 loss against the clean image, and ``model`` is
    updated once with the sum of ``CB_iFl`` losses over the three feedback
    outputs. After validation a checkpoint is written through
    ``save_checkpoint``, the best weights are stored as ``model_best.pth`` /
    ``rmodel_best.pth`` and one row is appended to ``automated_log.txt``.

    Args:
        args: Parsed options. Reads ``datadir``, ``height``, ``num_workers``,
            ``batch_size``, ``cuda``, ``savedir``, ``num_epochs``,
            ``iouTrain``, ``iouVal``, ``steps_loss`` and ``epochs_save``.
        rmodel: Restoration network, called as
            ``rmodel(x, flag=..., r_fb1=..., r_fb2=...)``.
        model: Segmentation network, called as ``model(x, only_encode=enc)``.
        enc: Forwarded to ``model`` as ``only_encode``.

    Returns:
        The ``model`` object that was passed in.
    """
    best_acc = 0

    weight = classWeights(NUM_CLASSES)

    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(augment=True, height=args.height)
    co_transform_val = MyCoTransform(augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    rcriterion = torch.nn.L1Loss()

    savedir = '/home/shyam.nandan/NewExp/final_code/save/' + args.savedir

    automated_log_path = savedir + "/automated_log.txt"
    modeltxtpath = savedir + "/model.txt"

    # Write the column header only when the log file does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    optimizer = Adam(model.parameters(),
                     5e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=2e-4)
    roptimizer = Adam(rmodel.parameters(), 2e-4, (0.9, 0.999))

    start_epoch = 1

    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30, gamma=0.5)

    # gamma / alpha handed to CB_iFl, one entry per feedback output. Defined
    # up front so validation no longer depends on names leaking out of the
    # training loop.
    Gamma = [0, 0.1, 0.2]
    Alpha = [1, 1, 1]

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        # NOTE(review): PyTorch >= 1.1 expects scheduler.step() after
        # optimizer.step(); the call is left at the top of the epoch so the
        # learning-rate schedule is not changed by this edit.
        scheduler.step()
        rscheduler.step()

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])
        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])

        model.train()
        for step, (timages, images, labels) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
                timages = timages.cuda()

            inputs = timages
            itargets = images
            targets = labels

            # Initial pass without feedback.
            ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
            outs = model(ss_inputs, only_encode=enc)

            tminus_outs = outs.detach()
            tplus_outs = outs.detach()

            outputs = []
            for num_feedback in range(3):
                optimizer.zero_grad()
                roptimizer.zero_grad()

                # Restoration update: rmodel receives the difference of the
                # two most recent segmentation outputs and its own previous
                # output, and is trained with the L1 loss.
                ss_inputs = rmodel(inputs,
                                   flag=1,
                                   r_fb1=(tplus_outs - tminus_outs),
                                   r_fb2=ss_inputs.detach())
                loss = rcriterion(ss_inputs, itargets)
                loss.backward()
                roptimizer.step()

                optimizer.zero_grad()
                roptimizer.zero_grad()

                # The restored image is detached, so the segmentation loss
                # below does not back-propagate into rmodel.
                outs = model(ss_inputs.detach(), only_encode=enc)
                outputs.append(outs)

                tminus_outs = tplus_outs
                tplus_outs = outs.detach()

            del outs, tminus_outs, tplus_outs
            gc.collect()

            loss = 0.0
            for i, o in enumerate(outputs):
                loss += CB_iFl(o,
                               targets[:, 0],
                               weight,
                               gamma=Gamma[i],
                               alpha=Alpha[i])

            loss.backward()
            optimizer.step()

            # .item() instead of loss.data[0]: indexing a 0-dim tensor raises
            # on current PyTorch.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                # `outputs` is a list; score the last feedback output, which
                # is also the one validation evaluates. The original called
                # .max() on the list itself and raised AttributeError.
                final_out = outputs[-1]
                iouEvalTrain.addBatch(
                    final_out.max(1)[1].unsqueeze(1).data, targets.data)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                # `average` is a plain float; the original's
                # average.data.cpu()[0] could not work on it.
                average = sum(epoch_loss) / len(epoch_loss)
                print('loss: ', average, 'Epoch: ', epoch, 'Step: ', step)

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # torch.no_grad() replaces the Variable(..., volatile=True) wrappers,
        # which PyTorch >= 0.4 silently ignores.
        with torch.no_grad():
            for step, (timages, images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    labels = labels.cuda()
                    timages = timages.cuda()

                inputs = timages
                targets = labels

                ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
                outs = model(ss_inputs, only_encode=enc)

                tminus_outs = outs.detach()
                tplus_outs = outs.detach()

                # Same three feedback passes as in training, without any
                # parameter update. The L1 loss the original computed here was
                # overwritten before use, so it is no longer evaluated.
                for num_feedback in range(3):
                    optimizer.zero_grad()
                    roptimizer.zero_grad()

                    ss_inputs = rmodel(inputs,
                                       flag=1,
                                       r_fb1=(tplus_outs - tminus_outs),
                                       r_fb2=ss_inputs.detach())
                    outs = model(ss_inputs.detach(), only_encode=enc)

                    tminus_outs = tplus_outs
                    tplus_outs = outs.detach()

                del ss_inputs, tplus_outs, tminus_outs
                outputs = outs

                loss = CB_iFl(outputs,
                              targets[:, 0],
                              weight,
                              gamma=Gamma[0],
                              alpha=Alpha[0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                if doIouVal:
                    iouEvalVal.addBatch(
                        outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print('Val loss: ', average, 'Epoch: ', epoch, 'Step: ',
                          step)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # Remember the best validation score and save a checkpoint. When no
        # IoU is available the negated validation loss is used instead.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        filenameCheckpoint = savedir + '/checkpoint.pth.tar'
        filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, is_best, filenameCheckpoint, filenameBest)

        # Periodic snapshot. The original never formatted the '{epoch:03}'
        # placeholder and tested the leftover validation `step` instead of
        # `epoch`, so snapshots overwrote a single oddly named file at
        # unpredictable times.
        filename = f'{savedir}/model-{epoch:03}.pth'
        filenamebest = savedir + '/model_best.pth'
        if args.epochs_save > 0 and epoch > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(filename, epoch)
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            torch.save(rmodel.state_dict(), savedir + '/rmodel_best.pth')
            print(filenamebest, epoch)
            with open(savedir + "/best.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))

        # One row per epoch:
        # Epoch  Train-loss  Test-loss  Train-IoU  Test-IoU  learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return (model)