def train(args, model, enc=False):
    global best_acc

    # Hand-set per-class loss weights; the last class (27) is weighted 0 so it does not contribute to the loss.
    class_weights = [121.21, 947.02, 151.92, 428.31, 25.88, 235.97, 885.72, 911.87,
                     307.49, 204.69, 813.92, 5.83, 34.22, 453.34, 346.10, 250.19,
                     119.99, 75.28, 76.71, 8.58, 281.68, 924.07, 3.91, 7.14,
                     88.89, 59.00, 126.59, 0]
    weight = torch.ones(NUM_CLASSES)
    for i, w in enumerate(class_weights):
        weight[i] = w

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)      #1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height) #1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        #criterion = LovaszLoss2d()
        #criterion = CrossEntropyLoss2d(weight.cuda())
        criterion = FocalLoss2d(weight.cuda())
    else:
        #criterion = LovaszLoss2d()
        #criterion = CrossEntropyLoss2d(weight)
        criterion = FocalLoss2d(weight)
    print(type(criterion))

    savedir = f'../save/{args.savedir}'

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if not os.path.exists(automated_log_path):  # don't add the header line if the log already exists
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)  ## scheduler 1
    optimizer = Adam(model.parameters(), 1e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)   ## scheduler 2

    start_epoch = 1

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)  # set up scheduler  ## scheduler 1
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)    ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)            ## scheduler 2

    time_train_perepoch = []

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        start_time_perepoch = time.time()

        scheduler.step(epoch)  ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()

            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            #loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.data[0])
            time_train.append(time.time() - start_time)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        #evalIoU.printConfMatrix(confMatrix, evalIoU.args)

        time_train_perepoch.append(time.time() - start_time_perepoch)
        print("// Time per epoch: %.4f hours" % (sum(time_train_perepoch) / len(time_train_perepoch) / 3600.0))

        # Validate on the 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        with torch.no_grad():
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()

                inputs = Variable(images, requires_grad=False)   # the old volatile flag is replaced by torch.no_grad() above
                targets = Variable(labels, requires_grad=False)
                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.data[0])
                time_val.append(time.time() - start_time)

                if doIouVal:
                    iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                          "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1 - update lr if needed

        # Calculate IoU scores on class level from the accumulated matrix
        iouVal = 0
        iouTrain = 0
        if doIouVal:
            iouVal, iou_classes, accVal, acc_classes = iouEvalVal.getIoU()

            class_names = ['pole', 'slight', 'bboard', 'tlight', 'car', 'truck', 'bicycle',
                           'motor', 'bus', 'tsignf', 'tsignb', 'road', 'sidewalk', 'curbcut',
                           'crosspln', 'bikelane', 'curb', 'fence', 'wall', 'building',
                           'person', 'rider', 'sky', 'vege', 'terrain', 'markings', 'crosszeb']

            for name, iou in zip(class_names, iou_classes):
                print("%-8s: %.6f" % (name, iou * 100.0), "%\t")
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

            for name, acc in zip(class_names, acc_classes):
                print("%-8s: %.6f" % (name, acc * 100.0), "%\t")
            accStr = getColorEntry(accVal) + '{:0.2f}'.format(accVal * 100) + '\033[0m'
            print("EPOCH ACC on VAL set: ", accStr, "%")

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        if enc and epoch == args.num_epochs:
            best_acc = 0

        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth'
            filenameBest = savedir + '/model_best_enc.pth'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth'
            filenameBest = savedir + '/model_best.pth'

        save_checkpoint({
            'state_dict': model.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best_each.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best_each.pth'

        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')

        #if (True):  #(is_best):
        torch.save(model.state_dict(), filenamebest)
        print(f'save: {filenamebest} (epoch: {epoch})')

        filenameSuperBest = f'{savedir}/model_superbest.pth'
        if is_best:
            torch.save(model.state_dict(), filenameSuperBest)
            print('saving superbest')
            if not enc:
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch    Train-loss    Test-loss    Train-IoU    Test-IoU    learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model  #return model (convenience for encoder-decoder training)
def train(args, model, enc=False):
    global best_acc

    # Hand-set per-class loss weights; the last class (27) is weighted 0 so it does not contribute to the loss.
    class_weights = [121.21, 947.02, 151.92, 428.31, 25.88, 235.97, 885.72, 911.87,
                     307.49, 204.69, 813.92, 5.83, 34.22, 453.34, 346.10, 250.19,
                     119.99, 75.28, 76.71, 8.58, 281.68, 924.07, 3.91, 7.14,
                     88.89, 59.00, 126.59, 0]
    weight = torch.ones(NUM_CLASSES)
    for i, w in enumerate(class_weights):
        weight[i] = w

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)      #1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height) #1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        #criterion = CrossEntropyLoss2d(weight.cuda())
        criterion = FocalLoss2d(weight.cuda())
    else:
        #criterion = CrossEntropyLoss2d(weight)
        criterion = FocalLoss2d(weight)
    print(type(criterion))

    savedir = f'../save/{args.savedir}'

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if not os.path.exists(automated_log_path):  # don't add the header line if the log already exists
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)  ## scheduler 1
    optimizer = Adam(model.parameters(), 5e-5, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)   ## scheduler 2

    start_epoch = 1

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)  # set up scheduler  ## scheduler 1
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)    ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)            ## scheduler 2

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)  ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        #TODO: remake the evalIoU.py code to avoid using "evalIoU.args"
        confMatrix = evalIoU.generateMatrixTrainId(evalIoU.args)
        perImageStats = {}
        nbPixels = 0

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()

            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            #loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.data[0])
            time_train.append(time.time() - start_time)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        #evalIoU.printConfMatrix(confMatrix, evalIoU.args)

        # Validate on the 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        # New confusion matrix data
        confMatrix = evalIoU.generateMatrixTrainId(evalIoU.args)
        perImageStats = {}
        nbPixels = 0

        for step, (images, labels) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images, volatile=True)  # volatile disables autograd for evaluation (pre-0.4 API)
            targets = Variable(labels, volatile=True)
            outputs = model(inputs, only_encode=enc)

            loss = criterion(outputs, targets[:, 0])
            epoch_loss_val.append(loss.data[0])
            time_val.append(time.time() - start_time)

            # Add outputs to the confusion matrix
            if doIouVal:
                # compatibility with criterion dataparallel
                if isinstance(outputs, list):
                    # merge gpu tensors
                    outputs_cpu = outputs[0].cpu()
                    for i in range(1, len(outputs)):
                        outputs_cpu = torch.cat((outputs_cpu, outputs[i].cpu()), 0)
                    #print(outputs_cpu.size())
                else:
                    outputs_cpu = outputs.cpu()

                #start_time_iou = time.time()
                for i in range(0, outputs_cpu.size(0)):  #args.batch_size
                    prediction = ToPILImage()(outputs_cpu[i].max(0)[1].data.unsqueeze(0).byte())
                    groundtruth = ToPILImage()(labels[i].cpu().byte())
                    nbPixels += evalIoU.evaluatePairPytorch(prediction, groundtruth, confMatrix, perImageStats, evalIoU.args)
                #print("Time to add confusion matrix: ", time.time() - start_time_iou)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1 - update lr if needed

        # Calculate IoU scores on class level from the matrix
        iouVal = 0
        iouTrain = 0
        if doIouVal:
            #start_time_iou = time.time()
            classScoreList = {}
            for label in evalIoU.args.evalLabels:
                labelName = evalIoU.trainId2label[label].name
                classScoreList[labelName] = evalIoU.getIouScoreForTrainLabel(label, confMatrix, evalIoU.args)

            iouAvg = evalIoU.getScoreAverage(classScoreList, evalIoU.args)
            iouAvgStr = evalIoU.getColorEntry(iouAvg, evalIoU.args) + "{avg:5.3f}".format(avg=iouAvg) + evalIoU.args.nocol
            iouVal = float(iouAvg)
            print("EPOCH IoU on VAL set: ", iouAvgStr)

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        if enc and epoch == args.num_epochs:
            best_acc = 0

        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth'
            filenameBest = savedir + '/model_best_enc.pth'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth'
            filenameBest = savedir + '/model_best.pth'

        save_checkpoint({
            'state_dict': model.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best_each.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best_each.pth'

        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')

        #if (True):  #(is_best):
        torch.save(model.state_dict(), filenamebest)
        print(f'save: {filenamebest} (epoch: {epoch})')

        filenameSuperBest = f'{savedir}/model_superbest.pth'
        if is_best:
            torch.save(model.state_dict(), filenameSuperBest)
            print('saving superbest')
            if not enc:
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch    Train-loss    Test-loss    Train-IoU    Test-IoU    learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model  #return model (convenience for encoder-decoder training)
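# ---------------------------------------------------------------------------
# NOTE: save_checkpoint() is called throughout this file but not shown here.
# A minimal sketch of what it is assumed to do, matching the call signature
# used above (save the latest state, copy it aside when it is the best so far):
# ---------------------------------------------------------------------------
import shutil
import torch


def save_checkpoint(state, is_best, filenameCheckpoint, filenameBest):
    # always persist the most recent state (so --resume can pick it up)
    torch.save(state, filenameCheckpoint)
    if is_best:
        print("Saving model as best")
        shutil.copyfile(filenameCheckpoint, filenameBest)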
def train(args, model, enc=False):
    global best_acc

    #TODO: calculate weights by processing the dataset histogram (they are currently set by hand from the torch values)
    #create a loader to run over all images and build a histogram of labels, then create the weight array using class balancing
    weight = torch.ones(NUM_CLASSES)
    if enc:   # weights for training the encoder alone (indices 0-18; the remaining entries stay at 1)
        class_weights = [2.3653597831726, 4.4237880706787, 2.9691488742828, 5.3442072868347,
                         5.2983593940735, 5.2275490760803, 5.4394111633301, 5.3659925460815,
                         3.4170460700989, 5.2414722442627, 4.7376127243042, 5.2286224365234,
                         5.455126285553, 4.3019247055054, 5.4264230728149, 5.4331531524658,
                         5.433765411377, 5.4631009101868, 5.3947434425354]
    else:     # weights for full encoder-decoder training (the last class, 19, is zeroed out)
        class_weights = [2.8149201869965, 6.9850029945374, 3.7890393733978, 9.9428062438965,
                         9.7702074050903, 9.5110931396484, 10.311357498169, 10.026463508606,
                         4.6323022842407, 9.5608062744141, 7.8698215484619, 9.5168733596802,
                         10.373730659485, 6.6616044044495, 10.260489463806, 10.287888526917,
                         10.289801597595, 10.405355453491, 10.138095855713, 0]
    for i, w in enumerate(class_weights):
        weight[i] = w

    #loader = DataLoader(VOC12(args.datadir, input_transform, target_transform),
    #                    num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)      #1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height) #1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        criterion = CrossEntropyLoss2d(weight.cuda())
        #criterion = CriterionDataParallel(criterion).cuda()
    else:
        criterion = CrossEntropyLoss2d(weight)
    print(type(criterion))

    savedir = f'../save/{args.savedir}'

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if not os.path.exists(automated_log_path):  # don't add the header line if the log already exists
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    #TODO: reduce memory in first gpu: https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4
    #https://github.com/pytorch/pytorch/issues/1893

    """
    #Some optimizer examples:
    optimizer = Adam(model.parameters())
    if args.model.startswith('FCN'):
        optimizer = SGD(model.parameters(), 1e-4, .9, 2e-5)
    if args.model.startswith('PSP'):
        optimizer = SGD(model.parameters(), 1e-2, .9, 1e-4)
    if args.model.startswith('Seg'):
        optimizer = SGD(model.parameters(), 1e-3, .9)
    if args.model.startswith('E'):
        #optimizer = Adam(model.parameters(), 1e-3, .9)
        optimizer = Adam(model.parameters(), 5e-4, .9, weight_decay=2e-4)  #5e-4 wd: 2e-4
    """

    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)  ## scheduler 1
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)   ## scheduler 2

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {}".format(checkpoint['epoch']))

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)  # set up scheduler  ## scheduler 1
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)    ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)            ## scheduler 2

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)  ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        #TODO: remake the evalIoU.py code to avoid using "evalIoU.args"
        confMatrix = evalIoU.generateMatrixTrainId(evalIoU.args)
        perImageStats = {}
        nbPixels = 0

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()
            #print(labels.size())
            #print(np.unique(labels.numpy()))
            #print("labels: ", np.unique(labels[0].numpy()))
            #labels = torch.ones(4, 1, 512, 1024).long()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            #print("targets", np.unique(targets[:, 0].cpu().data.numpy()))

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.data[0])
            time_train.append(time.time() - start_time)

            # Add outputs to the confusion matrix
            # CODE USING evalIoU.py remade from cityscapes/scripts/evaluation/evalPixelLevelSemanticLabeling.py
            if doIouTrain:
                # compatibility with criterion dataparallel
                if isinstance(outputs, list):
                    # merge gpu tensors
                    outputs_cpu = outputs[0].cpu()
                    for i in range(1, len(outputs)):
                        outputs_cpu = torch.cat((outputs_cpu, outputs[i].cpu()), 0)
                    #print(outputs_cpu.size())
                else:
                    outputs_cpu = outputs.cpu()

                #start_time_iou = time.time()
                for i in range(0, outputs_cpu.size(0)):  #args.batch_size
                    prediction = ToPILImage()(outputs_cpu[i].max(0)[1].data.unsqueeze(0).byte())
                    groundtruth = ToPILImage()(labels[i].cpu().byte())
                    nbPixels += evalIoU.evaluatePairPytorch(prediction, groundtruth, confMatrix, perImageStats, evalIoU.args)
                #print("Time to add confusion matrix: ", time.time() - start_time_iou)

            #print(outputs.size())
            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                #image[0] = image[0] * .229 + .485
                #image[1] = image[1] * .224 + .456
                #image[2] = image[2] * .225 + .406
                #print("output", np.unique(outputs[0].cpu().max(0)[1].data.numpy()))
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):
                    # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        #evalIoU.printConfMatrix(confMatrix, evalIoU.args)
        iouTrain = 0
        if doIouTrain:
            # Calculate IoU scores on class level from the matrix
            classScoreList = {}
            for label in evalIoU.args.evalLabels:
                labelName = evalIoU.trainId2label[label].name
                classScoreList[labelName] = evalIoU.getIouScoreForTrainLabel(label, confMatrix, evalIoU.args)

            iouAvg = evalIoU.getScoreAverage(classScoreList, evalIoU.args)
            iouAvgStr = evalIoU.getColorEntry(iouAvg, evalIoU.args) + "{avg:5.3f}".format(avg=iouAvg) + evalIoU.args.nocol
            iouTrain = float(iouAvg)
            print("EPOCH IoU on TRAIN set: ", iouAvgStr)
            #print("")
            #evalIoU.printClassScoresPytorchTrain(classScoreList, evalIoU.args)
            #print("--------------------------------")
            #print("Score Average : " + iouAvgStr)  #+ " " + niouAvgStr)
            #print("--------------------------------")
            #print("")
            #input("Press key to continue...")

        # Validate on the 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        # New confusion matrix data
        confMatrix = evalIoU.generateMatrixTrainId(evalIoU.args)
        perImageStats = {}
        nbPixels = 0

        for step, (images, labels) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images, volatile=True)  # volatile disables autograd for evaluation (pre-0.4 API)
            targets = Variable(labels, volatile=True)
            outputs = model(inputs, only_encode=enc)

            loss = criterion(outputs, targets[:, 0])
            epoch_loss_val.append(loss.data[0])
            time_val.append(time.time() - start_time)

            # Add outputs to the confusion matrix
            if doIouVal:
                # compatibility with criterion dataparallel
                if isinstance(outputs, list):
                    # merge gpu tensors
                    outputs_cpu = outputs[0].cpu()
                    for i in range(1, len(outputs)):
                        outputs_cpu = torch.cat((outputs_cpu, outputs[i].cpu()), 0)
                    #print(outputs_cpu.size())
                else:
                    outputs_cpu = outputs.cpu()

                #start_time_iou = time.time()
                for i in range(0, outputs_cpu.size(0)):  #args.batch_size
                    prediction = ToPILImage()(outputs_cpu[i].max(0)[1].data.unsqueeze(0).byte())
                    groundtruth = ToPILImage()(labels[i].cpu().byte())
                    nbPixels += evalIoU.evaluatePairPytorch(prediction, groundtruth, confMatrix, perImageStats, evalIoU.args)
                #print("Time to add confusion matrix: ", time.time() - start_time_iou)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'VAL input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):
                    # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'VAL target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1 - update lr if needed

        # Calculate IoU scores on class level from the matrix
        iouVal = 0
        if doIouVal:
            #start_time_iou = time.time()
            classScoreList = {}
            for label in evalIoU.args.evalLabels:
                labelName = evalIoU.trainId2label[label].name
                classScoreList[labelName] = evalIoU.getIouScoreForTrainLabel(label, confMatrix, evalIoU.args)

            iouAvg = evalIoU.getScoreAverage(classScoreList, evalIoU.args)
            iouAvgStr = evalIoU.getColorEntry(iouAvg, evalIoU.args) + "{avg:5.3f}".format(avg=iouAvg) + evalIoU.args.nocol
            iouVal = float(iouAvg)
            print("EPOCH IoU on VAL set: ", iouAvgStr)
            #print("")
            #evalIoU.printClassScoresPytorchTrain(classScoreList, evalIoU.args)
            #print("--------------------------------")
            #print("Score Average : " + iouAvgStr)  #+ " " + niouAvgStr)
            #print("--------------------------------")
            #print("")
            #print("Time to calculate confusion matrix: ", time.time() - start_time_iou)
            #input("Press key to continue...")

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'

        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')

        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if not enc:
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch    Train-loss    Test-loss    Train-IoU    Test-IoU    learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model  #return model (convenience for encoder-decoder training)
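# ---------------------------------------------------------------------------
# NOTE: CrossEntropyLoss2d is not defined in this section. In ERFNet-style
# training code it is typically a thin wrapper around NLLLoss applied to
# log-softmaxed (N, C, H, W) outputs; the class below is a minimal sketch
# under that assumption and may not match the repository's exact version.
# ---------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F


class CrossEntropyLoss2d(nn.Module):
    def __init__(self, weight=None):
        super().__init__()
        self.loss = nn.NLLLoss(weight)

    def forward(self, outputs, targets):
        # outputs: (N, C, H, W) raw scores, targets: (N, H, W) class indices
        return self.loss(F.log_softmax(outputs, dim=1), targets)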
def train(args, model, enc):
    global best_acc

    #TODO: calculate weights by processing the dataset histogram (they are currently set by hand from the torch values)
    #create a loader to run over all images and build a histogram of labels, then create the weight array using class balancing
    weight = torch.ones(NUM_CLASSES)
    # the first 10 classes are explicitly weighted 1 (uniform weighting)
    for i in range(10):
        weight[i] = 1

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    # Loading the dataset (note: the 'test' split is used as the validation set in this variant)
    co_transform = MyCoTransform(False, augment=True, height=args.height)      #1024)
    co_transform_val = MyCoTransform(False, augment=False, height=args.height) #1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'test')

    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        criterion = CrossEntropyLoss2dv2(weight.cuda())
    else:
        criterion = CrossEntropyLoss2dv2(weight)

    savedir = '../save/' + args.savedir

    automated_log_path = savedir + "/automated_log.txt"
    modeltxtpath = savedir + "/model.txt"

    if not os.path.exists(automated_log_path):  # don't add the header line if the log already exists
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # We use the Adam optimizer with a learning rate of 5e-4
    optimizer = Adam([{'params': model.parameters()}, ], 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        filenameCheckpoint = savedir + '/checkpoint.pth.tar'  #'/model_best.pth.tar'
        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {}".format(checkpoint['epoch']))

    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)          ## scheduler 2

    cont_train_loss = []
    cont_val_loss = []

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)  ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        #TODO: remake the evalIoU.py code to avoid using "evalIoU.args"
        confMatrix = evalIoU.generateMatrixTrainId(evalIoU.args)
        perImageStats = {}
        nbPixels = 0

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, oldimages, labels, filename, filenameGt) in enumerate(loader):
            start_time = time.time()
            break  # training is skipped in this variant (evaluation-only run); the rest of the loop body never executes

            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs, road_mask = model(inputs)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.data[0])
            time_train.append(time.time() - start_time)

            if doIouTrain:
                # compatibility with criterion dataparallel
                if isinstance(outputs, list):
                    # merge gpu tensors
                    outputs_cpu = outputs[0].cpu()
                    for i in range(1, len(outputs)):
                        outputs_cpu = torch.cat((outputs_cpu, outputs[i].cpu()), 0)
                else:
                    outputs_cpu = outputs.cpu()

                #start_time_iou = time.time()
                for i in range(0, outputs_cpu.size(0)):  #args.batch_size
                    prediction = ToPILImage()(outputs_cpu[i].max(0)[1].data.unsqueeze(0).byte())
                    groundtruth = ToPILImage()(labels[i].cpu().byte())
                    nbPixels += evalIoU.evaluatePairPytorch(prediction, groundtruth, confMatrix, perImageStats, evalIoU.args)
                #print("Time to add confusion matrix: ", time.time() - start_time_iou)

        if not args.eval:
            average_epoch_loss_train = 0  #sum(epoch_loss) / len(epoch_loss)
        else:
            average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        #evalIoU.printConfMatrix(confMatrix, evalIoU.args)
        iouTrain = 0
        if doIouTrain:
            # Calculate IoU scores on class level from the matrix
            classScoreList = {}
            for label in evalIoU.args.evalLabels:
                labelName = evalIoU.trainId2label[label].name
                classScoreList[labelName] = evalIoU.getIouScoreForTrainLabel(label, confMatrix, evalIoU.args)
            print(classScoreList)

            iouAvg = evalIoU.getScoreAverage(classScoreList, evalIoU.args)
            iouAvgStr = evalIoU.getColorEntry(iouAvg, evalIoU.args) + "{avg:5.3f}".format(avg=iouAvg) + evalIoU.args.nocol
            iouTrain = float(iouAvg)
            print("EPOCH IoU on TRAIN set: ", iouAvgStr)
            evalIoU.printClassScoresPytorchTrain(classScoreList, evalIoU.args)
            print("--------------------------------")
            print("Score Average : " + iouAvgStr)  #+ " " + niouAvgStr)
            print("--------------------------------")

        # Validate on val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        #model = pretrained_model
        epoch_loss_val = []
        time_val = []

        # New confusion matrix data
        confMatrix = evalIoU.generateMatrixTrainId(evalIoU.args)
        perImageStats = {}
        nbPixels = 0
        val_ct = 0

        for step, (images, oldimages, labels, filename, filenameGt) in enumerate(loader_val):
            start_time = time.time()
            #break
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images, volatile=True)  # volatile disables autograd for evaluation (pre-0.4 API)
            targets = Variable(labels, volatile=True)
            outputs, road_mask = model(inputs)

            loss = criterion(outputs, targets[:, 0])
            epoch_loss_val.append(loss.data[0])
            time_val.append(time.time() - start_time)

            # Add outputs to the confusion matrix
            if doIouVal:
                # compatibility with criterion dataparallel
                if isinstance(outputs, list):
                    # merge gpu tensors
                    outputs_cpu = outputs[0].cpu()
                    for i in range(1, len(outputs)):
                        outputs_cpu = torch.cat((outputs_cpu, outputs[i].cpu()), 0)
                else:
                    outputs_cpu = outputs.cpu()
                targets_cpu = targets.cpu()

                start_time_iou = time.time()
                for i in range(0, outputs_cpu.size(0)):  #args.batch_size
                    val_ct += 1
                    pred_img = outputs_cpu[i].max(0)[1].data.unsqueeze(0)
                    roadMask = road_mask[i].data.cpu()
                    #print(type(roadMask))
                    pred_img[roadMask == 0] = 255  # mark everything outside the predicted road mask as ignore (255)

                    #predictionClr = ToPILImage()(Colorize()(pred_img.byte()))
                    prediction = ToPILImage()(pred_img.byte())
                    #filenameSave = "./save_color_res/" + str(val_ct).zfill(3) + '.png'
                    #filename_break = str(filename[0]).split('/')
                    #filename_path = '/'.join(filename_break[-3:])
                    #filenameSave = "./save_color_res/" + str(filename_path)
                    #os.makedirs(os.path.dirname(filenameSave), exist_ok=True)
                    #predictionClr.save(filenameSave)

                    groundtruth = ToPILImage()(labels[i].cpu().byte())
                    nbPixels += evalIoU.evaluatePairPytorch(prediction, groundtruth, confMatrix, perImageStats, evalIoU.args)
                print("Time to add confusion matrix: ", time.time() - start_time_iou)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        print(doIouVal)
        # Calculate IoU scores on class level from the matrix
        iouVal = 0
        confMatrix = confMatrix[:12, :12]  # keep only the first 12 train classes
        if doIouVal:
            #start_time_iou = time.time()
            classScoreList = {}
            for label in evalIoU.args.evalLabels:
                labelName = evalIoU.trainId2label[label].name
                classScoreList[labelName] = evalIoU.getIouScoreForTrainLabel(label, confMatrix, evalIoU.args)
            print(classScoreList)

            iouAvg = evalIoU.getScoreAverage(classScoreList, evalIoU.args)
            iouAvgStr = evalIoU.getColorEntry(iouAvg, evalIoU.args) + "{avg:5.3f}".format(avg=iouAvg) + evalIoU.args.nocol
            iouVal = float(iouAvg)
            print("EPOCH IoU on VAL set: ", iouAvgStr)
            #print("")
            #evalIoU.printClassScoresPytorchTrain(classScoreList, evalIoU.args)
            #print("--------------------------------")
            #print("Score Average : " + iouAvgStr)  #+ " " + niouAvgStr)
            #print("--------------------------------")
            #print("")
            #print("Time to calculate confusion matrix: ", time.time() - start_time_iou)
            #input("Press key to continue...")

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        filenameCheckpoint = savedir + '/checkpoint.pth.tar'
        filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        filename = savedir + '/model-' + str(epoch) + '.pth'
        filenamebest = savedir + '/model_best.pth'

        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')

        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            with open(savedir + "/best_encoder.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch    Train-loss    Test-loss    Train-IoU    Test-IoU    learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model  #return model (convenience for encoder-decoder training)
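# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file): the flag names
# below mirror the attributes read inside the train() variants above; the
# model constructor, defaults, and NUM_CLASSES wiring are placeholders and
# would need to match the actual repository layout.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--datadir', required=True)
    parser.add_argument('--savedir', default='experiment1')
    parser.add_argument('--height', type=int, default=512)
    parser.add_argument('--num-epochs', type=int, default=150)
    parser.add_argument('--num-workers', type=int, default=4)
    parser.add_argument('--batch-size', type=int, default=6)
    parser.add_argument('--steps-loss', type=int, default=50)
    parser.add_argument('--epochs-save', type=int, default=0)
    parser.add_argument('--iouTrain', action='store_true')
    parser.add_argument('--iouVal', action='store_true')
    parser.add_argument('--resume', action='store_true')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--cuda', action='store_true')
    args = parser.parse_args()

    # model construction is repository-specific; an ERFNet-style network is assumed here
    model = Net(NUM_CLASSES)
    if args.cuda:
        model = model.cuda()
    train(args, model, enc=False)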