def main():
    env_name = 'CartPole-v0'
    env = gym.make(env_name)
    action_space = env.action_space.n
    observation_space = env.observation_space.low.shape

    # set logger
    logging.config.fileConfig('./log/log.conf')
    logger = logging.getLogger(__name__)
    logger.info('START')

    # set network model
    shared_model = A3CFFSoftmaxFFF(observation_space, action_space)

    # set optimizer
    opt = RMSpropAsync(lr=LEARNING_RATE, alpha=0.99, eps=RMSPROP_EPS)
    opt.setup(shared_model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    writer = SummaryWriter('results/' + datetime.datetime.now().strftime('%B%d %H:%M:%S'))

    # run one forward pass so the graph can be logged
    state = env.reset()
    state = chainer.Variable(np.expand_dims(np.array(state).astype(np.float32), axis=0))
    pi, v = shared_model.get_pi_and_v(state)
    writer.add_graph([pi, v])
    writer.close()

    async_train(env_name, shared_model, opt, phi)
    logger.info('END')
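
# `phi` is passed to `async_train` above but never defined in this snippet.
# A minimal sketch, assuming it only needs to cast the raw gym observation to
# the float32 dtype the Chainer model expects (an assumption, not the author's
# confirmed implementation):
def phi(obs):
    return np.asarray(obs, dtype=np.float32)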

def main():
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    CarSet = CarDataSet(ROOT, TRAIN, MASK)

    # split train / val
    # train_idx, valid_idx = augmented_train_valid_split(CarSet, test_size=0.15, shuffle=True, random_seed=args.seed)
    # train_sampler = SubsetRandomSampler(train_idx)
    # val_sampler = SubsetRandomSampler(valid_idx)
    train_loader = DataLoader(CarSet,
                              # sampler=train_sampler,
                              shuffle=True,
                              batch_size=args.batch_size,
                              **kwargs)
    # val_loader = DataLoader(CarSet, sampler=val_sampler, batch_size=2, **kwargs)

    model = uNet(NUM_CLASS)
    if args.cuda:
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999))

    writer = SummaryWriter('logs/' + datetime.now().strftime('%B-%d'))
    best_loss = 1e+5
    iters = 0

    # resume training
    if args.resume:
        model, optimizer, args.start_epoch, best_loss, iters = resume(args.resume, model)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_lr(optimizer, epoch, decay=5)
        t1 = time.time()
        loss, iters = train(epoch, model, optimizer, train_loader, writer, iters)
        is_best = loss < best_loss
        best_loss = min(best_loss, loss)
        state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            # save the optimizer's state dict, not the optimizer object itself
            'optimizer': optimizer.state_dict(),
            'loss': best_loss,
            'iters': iters,
        }
        save_checkpoint(state, is_best)
    writer.close()
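
# `resume` and `save_checkpoint` are not defined in this snippet. A minimal
# sketch of the usual pattern, assuming the checkpoint keys written above and
# an Adam optimizer rebuilt with the same hyper-parameters; the file names and
# the 'model_best' copy are assumptions, not the author's exact layout:
import shutil

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-so-far checkpoint
        shutil.copyfile(filename, 'model_best.pth.tar')

def resume(path, model):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999))
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'] + 1, checkpoint['loss'], checkpoint['iters']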

    weights_file_path = os.path.join(snapshot_dir, weights_file_name)
    torch.save(model.state_dict(), weights_file_path)

    # TODO: Maybe delete the older snapshots?

    # ---------------------------------------------------------------------
    # REDUCE THE LEARNING RATE IF APPROPRIATE
    # ---------------------------------------------------------------------

    scheduler.step(val_loss / n_minibatches_validation)

# -----------------------------------------------------------------------------

print('Finished Training!')
writer.close()

# Save the trained model
print('Saving model...', end=' ')
weights_file = ('./weights/spectrograms_weights_{}_{}.net'
                .format(distances, sample_size))
torch.save(model.state_dict(), weights_file)
print('Done!')

# -----------------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST SET
# -----------------------------------------------------------------------------

print('Start making predictions on the test sample...', end=' ')

def train():
    data_augmentation = DataAugmentationTransform_old(translation_range=(0.0, 0.15),
                                                      rotation_range=10,
                                                      zoom_range=(0.8, 1.0),
                                                      flip_p=0.5,
                                                      brightness_range=(-0.2, 0.2),
                                                      gamma_range=(0.5, 1.5),
                                                      saturation_range=(-0.3, 0.3))

    loader_train = CityscapesLoader(base_data_folder, split='train', is_transform=True,
                                    img_size=image_shape, transforms=None)
    trainloader = data.DataLoader(loader_train, batch_size=batch_size, num_workers=4,
                                  shuffle=True, pin_memory=True)

    if overlay_during_training:
        loader_test = CityscapesLoader(base_data_folder, split='test', is_transform=True,
                                       img_size=image_shape)
        test_loader = data.DataLoader(loader_test, batch_size=batch_size, num_workers=4,
                                      shuffle=False, pin_memory=True)

    if check_validation:
        loader_val = CityscapesLoader(base_data_folder, split='val', is_transform=True,
                                      img_size=image_shape)
        valloader = data.DataLoader(loader_val, batch_size=batch_size, num_workers=4,
                                    shuffle=False, pin_memory=True)

    model = get_model('fcn1s', num_classes)
    writer = SummaryWriter()

    if resume:
        print("Resuming From ", resume_filename)
        checkpoint = torch.load(resume_filename)
        model.load_state_dict(checkpoint['state_dict'])
        # starting_epoch = checkpoint['epoch']
        # optimizer.load_state_dict(checkpoint['optimizer'])
        for param in model.parameters():
            param.requires_grad = True

    if freeze_layers:
        print("Freezing VGG layers")
        for param in model.conv_block1.parameters():
            param.requires_grad = False
        for param in model.conv_block2.parameters():
            param.requires_grad = False
        for param in model.conv_block3.parameters():
            param.requires_grad = False
        for param in model.conv_block4.parameters():
            param.requires_grad = False
        for param in model.conv_block5.parameters():
            param.requires_grad = False

    if torch.cuda.is_available():
        print("Using GPU")
        model.cuda(0)
    else:
        print("Using CPU")
    model.train()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    if opt == "SGD":
        optimizer = torch.optim.SGD(parameters, lr=l_rate, momentum=0.9, weight_decay=5e-4)
    elif opt == "Adam":
        optimizer = torch.optim.Adam(parameters, lr=l_rate, weight_decay=5e-4)

    best_metric = 0
    old_file = ""

    for epoch in range(starting_epoch, epochs):
        train_acc = 0
        train_IoU = 0
        train_loss = 0
        train_count = 0
        print("\nEpoch: ", epoch)

        if overlay_during_training and epoch % 5 == 0:
            test_img = loader_test[67]
            test_img = test_img.unsqueeze(0)
            model.eval()
            test_pred = model(Variable(test_img.cuda(0), requires_grad=True))
            test_img = Variable(test_img.cuda(0), requires_grad=True)
            overlay_images(test_img, test_pred, epoch, '67_')
            writer.add_graph(model, test_pred)
            del test_pred
            del test_img

            test_img = loader_test[88]
            test_img = test_img.unsqueeze(0)
            test_pred = model(Variable(test_img.cuda(0), requires_grad=True))
            test_img = Variable(test_img.cuda(0), requires_grad=True)
            overlay_images(test_img, test_pred, epoch, '88_')
            del test_pred
            del test_img

            test_img = loader_test[175]
            test_img = test_img.unsqueeze(0)
            test_pred = model(Variable(test_img.cuda(0), requires_grad=True))
            test_img = Variable(test_img.cuda(0), requires_grad=True)
            overlay_images(test_img, test_pred, epoch, '175_')
            del test_pred
            del test_img
            model.train()

        with tqdm.tqdm(trainloader, ncols=100) as t:
            for i, (images, labels) in enumerate(t):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                iter = len(trainloader) * epoch + i
                if poly_lr:
                    poly_lr_scheduler(optimizer, l_rate, iter, lr_decay_iter=10)

                optimizer.zero_grad()
                outputs = model(images)
                loss = cross_entropy2d(outputs, labels, ignore_index=255)
                loss.backward()
                optimizer.step()

                # print("%8.2f %% -> Loss: %8.6f " % (i / len(trainloader) * 100, loss.data[0]), end='\r')
                t.set_description('Loss: %8.6f' % loss.data[0])
                t.update(1)

                train_loss = train_loss + loss.data[0]
                acc, IoU = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                train_acc = train_acc + acc
                train_IoU = train_IoU + IoU.mean()
                train_count = train_count + 1

                del outputs
                del loss
                del images
                del labels

        train_acc = train_acc / train_count
        train_IoU = train_IoU / train_count
        train_loss = train_loss / train_count
        print("\nTrain Accuracy: ", train_acc)
        print("Train Loss: ", train_loss)
        print("Train IoU: ", train_IoU, "\n")
        writer.add_scalar('Train Accuracy', train_acc, epoch)
        writer.add_scalar('Train IoU', train_IoU, epoch)
        writer.add_scalar('Train Loss', train_loss, epoch)

        if check_validation:
            # VALIDATION
            val_acc = 0
            val_IoU = 0
            val_loss = 0
            val_count = 0
            model.eval()
            for i, (images, labels) in enumerate(valloader):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                iter = len(trainloader) * epoch + i
                # poly_lr_scheduler(optimizer, l_rate, iter)
                outputs = model(images)
                loss = cross_entropy2d(outputs, labels, ignore_index=255)
                val_loss = val_loss + loss.data[0]
                acc, IoU = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                val_acc = val_acc + acc
                val_IoU = val_IoU + IoU.mean()
                val_count = val_count + 1

                del outputs
                del loss
                del images
                del labels

            val_acc = val_acc / val_count
            val_IoU = val_IoU / val_count
            val_loss = val_loss / val_count
            print("\nVal Accuracy: ", val_acc)
            print("Val Loss: ", val_loss)
            print("Val IoU: ", val_IoU, "\n")
            writer.add_scalar('Val Accuracy', val_acc, epoch)
            writer.add_scalar('Val IoU', val_IoU, epoch)
            writer.add_scalar('Val Loss', val_loss, epoch)

        # fall back to the train metric when there is no validation pass
        # (the original assigned val_IoU here, which is undefined without validation)
        save_metric = train_IoU
        if check_validation:
            save_metric = val_IoU

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                torch.save({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                print("Model Saved As " + base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar"

        print("Best IoU So Far: ", best_metric)

    writer.close()
    print("End Of Training")
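
# `poly_lr_scheduler` (used above) and `poly_lr2` (used by the later snippets)
# are not shown anywhere in this collection. A minimal sketch of the usual
# DeepLab-style polynomial decay; the power of 0.9 and the max_iter default
# are assumptions, not the authors' confirmed values:
def poly_lr_scheduler(optimizer, init_lr, cur_iter, lr_decay_iter=1,
                      max_iter=30000, power=0.9):
    # only touch the LR every `lr_decay_iter` iterations, and never past max_iter
    if cur_iter % lr_decay_iter != 0 or cur_iter > max_iter:
        return None
    lr = init_lr * (1 - float(cur_iter) / max_iter) ** power
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr

def poly_lr2(init_lr, cur_iter, lr_decay_iter=1, max_iter=30000, power=0.9):
    # same decay, but returns the new LR (or None) instead of mutating the optimizer
    if cur_iter % lr_decay_iter != 0 or cur_iter > max_iter:
        return None
    return init_lr * (1 - float(cur_iter) / max_iter) ** power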

def train():
    loader_train = CityscapesLoader('/home/cattaneod/CITYSCAPES_crop/', split='train', is_transform=True,
                                    img_size=None, transforms=data_augmentation)
    trainloader = data.DataLoader(loader_train, batch_size=batch_size, num_workers=num_workers,
                                  shuffle=True, pin_memory=True)

    loader_test = CityscapesLoader(base_data_folder, split='test', is_transform=True,
                                   img_size=None, transforms=data_augmentation)
    test_loader = data.DataLoader(loader_test, batch_size=batch_size, num_workers=num_workers,
                                  shuffle=False, pin_memory=True)

    loader_val = CityscapesLoader(base_data_folder, split='val', is_transform=True,
                                  img_size=image_shape, return_original=True)
    valloader = data.DataLoader(loader_val, batch_size=batch_size, num_workers=num_workers,
                                shuffle=False, pin_memory=True)

    model = deeplab_resnet_DUC.Res_Deeplab_DUC(num_classes)

    if TBWriter:
        writer = SummaryWriter()

    '''
    if resume:
        print("Loading from: ", resume_filename)
        saved_state_dict = torch.load(resume_filename)
        if num_classes != 21:
            for i in saved_state_dict:
                # Scale.layer5.conv2d_list.3.weight
                i_parts = i.split('.')
                if i_parts[1] == 'layer5':
                    saved_state_dict[i] = model.state_dict()[i]
        model.load_state_dict(saved_state_dict)
    '''

    if torch.cuda.is_available():
        print("Using GPU")
        model.cuda(0)
    else:
        print("Using CPU")
    model.train()

    if opt == "SGD":
        optimizer = torch.optim.SGD([{'params': get_1x_lr_params_NOscale(model), 'lr': l_rate},
                                     {'params': get_10x_lr_params(model), 'lr': 10 * l_rate}],
                                    lr=l_rate, momentum=0.9, weight_decay=5e-4)
    elif opt == "Adam":
        optimizer = torch.optim.Adam([{'params': get_1x_lr_params_NOscale(model), 'lr': 0 * l_rate},
                                      {'params': get_10x_lr_params(model), 'lr': 10 * l_rate}],
                                     lr=l_rate, weight_decay=5e-4)

    if resume:
        print("Resuming From ", resume_filename)
        checkpoint = torch.load(resume_filename)
        saved_state_dict = checkpoint['state_dict']
        if reset_layer5:
            for i in model.state_dict():
                # Scale.layer5.conv2d_list.3.weight
                i_parts = i.split('.')
                if i not in saved_state_dict or i_parts[1] == 'layer5':
                    saved_state_dict[i] = model.state_dict()[i]
        model.load_state_dict(saved_state_dict)
        starting_epoch = checkpoint['epoch'] + 1
        if poly_lr:
            lr_ = poly_lr2(l_rate, len(trainloader) * starting_epoch, lr_decay_iter=1,
                           max_iter=len(trainloader) * epochs)
            if lr_:
                if opt == "SGD":
                    optimizer = torch.optim.SGD([{'params': get_1x_lr_params_NOscale(model), 'lr': lr_},
                                                 {'params': get_10x_lr_params(model), 'lr': 10 * lr_}],
                                                lr=lr_, momentum=0.9, weight_decay=5e-4)
                elif opt == "Adam":
                    optimizer = torch.optim.Adam([{'params': get_1x_lr_params_NOscale(model), 'lr': 0 * lr_},
                                                  {'params': get_10x_lr_params(model), 'lr': 10 * lr_}],
                                                 lr=lr_, weight_decay=5e-4)

    best_metric = 0
    old_file = ""
    train_acc = AverageMeter()
    train_IoU = AverageMeter()
    train_loss = AverageMeter()

    for epoch in range(starting_epoch, epochs):
        train_acc.reset()
        train_IoU.reset()
        train_loss.reset()
        train_cfmatrix = np.zeros((num_classes, num_classes))
        print("\nEpoch: ", epoch)

        if overlay_during_training and epoch % 1 == 0:
            for i in range(15):
                print("Overlaying image ", i)
                names, original_img, test_img, _ = loader_val[i]
                test_img = test_img.unsqueeze(0)
                original_img = original_img.unsqueeze(0)
                original_img = Variable(original_img.cuda())
                model.eval()
                test_pred = model(Variable(test_img.cuda(0), requires_grad=True))
                test_img = Variable(test_img.cuda(0), requires_grad=True)
                # if TBWriter and i == 0:
                #     writer.add_graph(model, test_pred)
                test_pred = F.upsample_bilinear(test_pred, (1024, 2048))
                overlay_images(names, original_img, test_pred, epoch, str(i) + '_', convert_id=False)
                del test_pred
                del test_img
            model.train()

        optimizer.zero_grad()
        with tqdm.tqdm(trainloader, ncols=150) as t:
            lr_ = l_rate
            for i, (images, labels) in enumerate(t):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                iter = len(trainloader) * epoch + i
                outputs = model(images)
                # g = make_dot(outputs)
                # g.save('./t.dot')
                loss = misc.cross_entropy2d(outputs, labels, ignore_index=255)
                loss = loss / update_batches
                loss.backward()
                t.set_description('Loss: %8.4f - LR = %f' % (update_batches * loss.data[0], lr_))
                train_loss.update(update_batches * loss.data[0])

                acc, IoU, cf_matrix = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                if acc is not None:
                    train_acc.update(acc)
                    train_IoU.update(np.nanmean(IoU))
                    train_cfmatrix = train_cfmatrix + cf_matrix

                if i % update_batches == 0:
                    optimizer.step()
                    if poly_lr:
                        lr_ = poly_lr2(l_rate, iter, lr_decay_iter=1, max_iter=len(trainloader) * epochs)
                        if lr_:
                            t.set_description('Step: %8.4f - LR = %f' % (update_batches * loss.data[0], lr_))
                            if opt == "SGD":
                                optimizer = torch.optim.SGD(
                                    [{'params': get_1x_lr_params_NOscale(model), 'lr': lr_},
                                     {'params': get_10x_lr_params(model), 'lr': 10 * lr_}],
                                    lr=lr_, momentum=0.9, weight_decay=5e-4)
                            elif opt == "Adam":
                                optimizer = torch.optim.Adam(
                                    [{'params': get_1x_lr_params_NOscale(model), 'lr': 0 * lr_},
                                     {'params': get_10x_lr_params(model), 'lr': 10 * lr_}],
                                    lr=lr_, weight_decay=5e-4)
                    # print("%8.2f %% -> Loss: %8.6f " % (i / len(trainloader) * 100, loss.data[0]), end='\r')
                    optimizer.zero_grad()

                if i > 0 and i % TBUpdate == 0 and TBWriter:
                    writer.add_scalar('Train Accuracy', train_acc.avg, iter)
                    writer.add_scalar('Train IoU', train_IoU.avg, iter)
                    writer.add_scalar('Train Loss', train_loss.avg, iter)

                del outputs
                del loss
                del images
                del labels
                t.update(1)

        rows = train_cfmatrix.sum(axis=1)
        cols = train_cfmatrix.sum(axis=0)
        IoU = np.ndarray(train_cfmatrix.shape[0])
        for i in range(train_cfmatrix.shape[0]):
            if rows[i] + cols[i] > 0.:
                IoU[i] = train_cfmatrix[i][i] / (rows[i] + cols[i] - train_cfmatrix[i][i])
            else:
                IoU[i] = np.nan

        print("\nTrain Accuracy: ", train_acc.avg)
        print("Train Loss: ", train_loss.avg)
        print("Micro IoU: ", train_IoU.avg, "\n")
        print("Macro IoU: ", np.nanmean(IoU), "\n")

        if check_validation:
            # VALIDATION
            val_acc = AverageMeter()
            val_IoU = AverageMeter()
            val_loss = AverageMeter()
            val_cfmatrix = np.zeros((num_classes, num_classes))
            model.eval()
            for i, (images, labels) in enumerate(valloader):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                iter = len(trainloader) * epoch + i
                # poly_lr_scheduler(optimizer, l_rate, iter)
                outputs = model(images)
                loss = cross_entropy2d(outputs, labels, ignore_index=255)
                val_loss.update(loss.data[0])
                acc, IoU, cf_matrix = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                if acc is not None:
                    val_acc.update(acc)
                    val_IoU.update(np.nanmean(IoU))
                    val_cfmatrix = val_cfmatrix + cf_matrix

                del outputs
                del loss
                del images
                del labels

            print("\nVal Accuracy: ", val_acc.avg)
            print("Val Loss: ", val_loss.avg)
            print("Val IoU: ", val_IoU.avg, "\n")
            if TBWriter:
                writer.add_scalar('Val Accuracy', val_acc.avg, epoch)
                writer.add_scalar('Val IoU', val_IoU.avg, epoch)
                writer.add_scalar('Val Loss', val_loss.avg, epoch)

        save_metric = train_IoU.avg
        if check_validation:
            save_metric = val_IoU.avg

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                torch.save({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                print("Model Saved As " + base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar"

        print("Best IoU So Far: ", best_metric)

    if TBWriter:
        writer.close()
    print("End Of Training")
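
# `AverageMeter` is used throughout these snippets but never defined. A
# minimal sketch of the classic helper from the PyTorch ImageNet example; the
# `moving_average` option taken by the later snippets is an assumption,
# implemented here as a fixed-size sliding window:
class AverageMeter(object):
    """Tracks a running average, optionally over the last `moving_average` values."""

    def __init__(self, moving_average=None):
        self.moving_average = moving_average
        self.reset()

    def reset(self):
        self.values = []
        self.sum = 0.0
        self.count = 0  # total number of updates (the later snippets test `.count > 500`)
        self.avg = 0.0

    def update(self, val):
        self.values.append(val)
        self.sum += val
        self.count += 1
        if self.moving_average is not None and len(self.values) > self.moving_average:
            self.sum -= self.values.pop(0)  # drop the oldest value from the window
        self.avg = self.sum / len(self.values)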

def main():
    global args
    args = parser.parse_args()

    # Data preprocessing.
    print('==> Preparing data......')
    assert (args.dataset == 'cifar10' or args.dataset == 'cifar100'), \
        "Only support cifar10 or cifar100 dataset"

    if args.dataset == 'cifar10':
        print('To train and eval on cifar10 dataset......')
        num_classes = 10
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar10, std_cifar10),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar10, std_cifar10),
        ])
        train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                                 transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size,
                                                   shuffle=True, num_workers=4)
        test_set = torchvision.datasets.CIFAR10(root='./data', train=False, download=True,
                                                transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=100,
                                                  shuffle=False, num_workers=4)
    else:
        print('To train and eval on cifar100 dataset......')
        num_classes = 100
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar100, std_cifar100),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean_cifar100, std_cifar100),
        ])
        train_set = torchvision.datasets.CIFAR100(root='./data', train=True, download=True,
                                                  transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size,
                                                   shuffle=True, num_workers=4)
        test_set = torchvision.datasets.CIFAR100(root='./data', train=False, download=True,
                                                 transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=100,
                                                  shuffle=False, num_workers=4)

    # Model
    best_acc = 0  # best test accuracy; initialized before resume so the checkpoint value is kept
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(args.ckpt_path), 'Error: checkpoint directory does not exist!'
        checkpoint = torch.load(os.path.join(args.ckpt_path, 'ckpt.t7'))
        model = checkpoint['model']
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
    else:
        print('==> Building model..')
        model = models.__dict__[args.arch](num_classes)
        start_epoch = args.start_epoch

    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # Use GPUs if available.
    if torch.cuda.is_available():
        model.cuda()
        model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
        cudnn.benchmark = True

    # Define loss function and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                          nesterov=args.nesterov, weight_decay=args.weight_decay)

    log_dir = 'logs/' + datetime.now().strftime('%B%d %H:%M:%S')
    train_writer = SummaryWriter(os.path.join(log_dir, 'train'))
    test_writer = SummaryWriter(os.path.join(log_dir, 'test'))

    # Save argparse command line to a file.
    with open(os.path.join(log_dir, 'commandline_args.txt'), 'w') as f:
        f.write('\n'.join(sys.argv[1:]))

    for epoch in range(start_epoch, args.epochs):
        # Learning rate schedule.
        lr = adjust_learning_rate(optimizer, epoch + 1)
        train_writer.add_scalar('lr', lr, epoch)

        # Train for one epoch.
        train(train_loader, model, criterion, optimizer, train_writer, epoch)

        # Eval on test set.
        num_iter = (epoch + 1) * len(train_loader)
        acc = eval(test_loader, model, criterion, test_writer, epoch, num_iter)

        # Save checkpoint.
        print('Saving Checkpoint......')
        state = {
            'model': model.module if torch.cuda.is_available() else model,
            'best_acc': best_acc,
            'epoch': epoch,
        }
        if not os.path.isdir(os.path.join(log_dir, 'last_ckpt')):
            os.mkdir(os.path.join(log_dir, 'last_ckpt'))
        torch.save(state, os.path.join(log_dir, 'last_ckpt', 'ckpt.t7'))

        if acc > best_acc:
            best_acc = acc
            if not os.path.isdir(os.path.join(log_dir, 'best_ckpt')):
                os.mkdir(os.path.join(log_dir, 'best_ckpt'))
            torch.save(state, os.path.join(log_dir, 'best_ckpt', 'ckpt.t7'))
        train_writer.add_scalar('best_acc', best_acc, epoch)

    train_writer.close()
    test_writer.close()
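
# `adjust_learning_rate` is not included in the snippet above. A minimal
# sketch, assuming a common CIFAR step schedule (decay 10x at 50% and 75% of
# total epochs -- the milestones are assumptions, not the author's values):
def adjust_learning_rate(optimizer, epoch):
    lr = args.lr
    if epoch >= args.epochs * 0.75:
        lr = args.lr * 0.01
    elif epoch >= args.epochs * 0.5:
        lr = args.lr * 0.1
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr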

def test_log_scalar_summary():
    logdir = './experiment/scalar'
    writer = SummaryWriter(logdir)
    for i in range(10):
        # pass the loop index as global_step so the ten points are plotted in order
        writer.add_scalar('test_scalar', i + 1, i)
    writer.close()

def DCGAN(epoch, noise_size, batch_size, save_period, dataset):

    if dataset == 'MNIST':
        '''location of tensorboard save file'''
        logdir = 'tensorboard/MNIST/'
        summary_writer = SummaryWriter(logdir)
        train_iter, train_data_number = Mnist_Data_Processing(batch_size)  # all
    elif dataset == 'CIFAR10':
        '''location of tensorboard save file'''
        logdir = 'tensorboard/CIFAR10/'
        summary_writer = SummaryWriter(logdir)
        train_iter, train_data_number = Image_Data_Processing(batch_size, "CIFAR10")  # class by class
    elif dataset == 'ImageNet':
        '''location of tensorboard save file'''
        logdir = 'tensorboard/IMAGENET/'
        summary_writer = SummaryWriter(logdir)
        train_iter, train_data_number = Image_Data_Processing(batch_size, "ImageNet")  # face
    else:
        print "no input data!!!"
        return  # nothing to train on

    # Not used, but must be declared.
    label = mx.nd.zeros((batch_size,))

    '''Network'''
    generator = Generator()
    discriminator = Discriminator()
    context = mx.gpu(0)

    '''In the code below, the 'inputs_need_grad' parameter of 'mod.bind' is very important.'''

    # =============module G=============
    modG = mx.mod.Module(symbol=generator, data_names=['noise'], label_names=None, context=context)
    modG.bind(data_shapes=[('noise', (batch_size, noise_size, 1, 1))],
              label_shapes=None, for_training=True)

    # load previously saved modG weights, if any
    modG_params = {'MNIST': "MNIST_Weights/modG-10.params",
                   'CIFAR10': "CIFAR10_Weights/modG-300.params",
                   'ImageNet': "ImageNet_Weights/modG-1000.params"}
    try:
        modG.load_params(modG_params[dataset])
    except Exception:
        pass  # no saved weights yet; fall back to fresh initialization below

    modG.init_params(initializer=mx.initializer.Normal(sigma=0.02))
    modG.init_optimizer(optimizer='adam',
                        optimizer_params={'learning_rate': 0.0002, 'beta1': 0.5})

    # =============module discriminator[0], discriminator[1]=============
    modD_0 = mx.mod.Module(symbol=discriminator[0], data_names=['data'], label_names=None, context=context)
    modD_0.bind(data_shapes=train_iter.provide_data, label_shapes=None,
                for_training=True, inputs_need_grad=True)

    # load previously saved modD_0 weights, if any
    modD_params = {'MNIST': "MNIST_Weights/modD_0-10.params",
                   'CIFAR10': "CIFAR10_Weights/modD_0-200.params",
                   'ImageNet': "ImageNet_Weights/modD_0-1000.params"}
    try:
        modD_0.load_params(modD_params[dataset])
    except Exception:
        pass

    modD_0.init_params(initializer=mx.initializer.Normal(sigma=0.02))
    modD_0.init_optimizer(optimizer='adam',
                          optimizer_params={'learning_rate': 0.0002, 'beta1': 0.5})

    """
    Parameters
    shared_module : Module
        Default is `None`. This is used in bucketing. When not `None`, the shared
        module essentially corresponds to a different bucket -- a module with a
        different symbol but with the same sets of parameters (e.g. unrolled RNNs
        with different lengths).

    Here, to share the Discriminator parameters, we must use shared_module=modD_0.
    """
    modD_1 = mx.mod.Module(symbol=discriminator[1], data_names=['data'], label_names=None, context=context)
    modD_1.bind(data_shapes=train_iter.provide_data, label_shapes=None,
                for_training=True, inputs_need_grad=True, shared_module=modD_0)

    # =============generate image=============
    column_size = 10
    row_size = 10
    test_mod = mx.mod.Module(symbol=generator, data_names=['noise'], label_names=None, context=context)
    test_mod.bind(data_shapes=[mx.io.DataDesc(name='noise',
                                              shape=(column_size * row_size, noise_size, 1, 1))],
                  label_shapes=None, shared_module=modG, for_training=False, grad_req='null')

    '''############ Although not required, the following code should be declared. ############'''
    '''make evaluation method 1 - Using existing ones.
    metrics = {
        'acc': Accuracy, 'accuracy': Accuracy, 'ce': CrossEntropy, 'f1': F1,
        'mae': MAE, 'mse': MSE, 'rmse': RMSE, 'top_k_accuracy': TopKAccuracy
    }'''
    metric = mx.metric.create(['acc', 'mse'])

    '''make evaluation method 2 - Making new things.'''
    '''
    Custom evaluation metric that takes an NDArray function.
    Parameters:
        feval (callable(label, pred)) - Customized evaluation function.
        name (str, optional) - The name of the metric.
        allow_extra_outputs (bool) - If true, the prediction outputs can have extra
            outputs. This is useful in RNN, where the states are also produced in
            outputs for forwarding.
    '''
    def zero(label, pred):
        return 0

    null = mx.metric.CustomMetric(zero)

    #################################### training loop ####################################
    # =============train===============
    # note: the loop variable shadows the `epoch` argument, which is the total epoch count
    for epoch in xrange(1, epoch + 1, 1):
        Max_cost_0 = 0
        Max_cost_1 = 0
        Min_cost = 0
        total_batch_number = np.ceil(train_data_number / (batch_size * 1.0))
        train_iter.reset()
        for batch in train_iter:
            noise = mx.random.uniform(low=-1.0, high=1.0,
                                      shape=(batch_size, noise_size, 1, 1), ctx=context)
            modG.forward(data_batch=mx.io.DataBatch(data=[noise], label=None), is_train=True)
            modG_output = modG.get_outputs()

            ############ updating only parameters related to modD ############
            # update discriminator on noise data
            '''MAX : modD_1 : cost : (-mx.symbol.log(1-discriminator2)) - noise data
            Discriminator update: bigger and bigger -> smaller and smaller discriminator2'''
            modD_1.forward(data_batch=mx.io.DataBatch(data=modG_output, label=None), is_train=True)

            '''Max_Cost of noise data Discriminator'''
            Max_cost_1 += modD_1.get_outputs()[0].asnumpy().astype(np.float32)
            modD_1.backward()
            modD_1.update()

            # update discriminator on real data
            '''MAX : modD_0 : cost : (-mx.symbol.log(discriminator2)) - real data
            Discriminator update: bigger and bigger discriminator2'''
            modD_0.forward(data_batch=batch, is_train=True)

            '''Max_Cost of real data Discriminator'''
            Max_cost_0 += modD_0.get_outputs()[0].asnumpy().astype(np.float32)
            modD_0.backward()
            modD_0.update()

            ############ updating only parameters related to modG ############
            # update generator on noise data
            '''MIN : modD_0 : cost : (-mx.symbol.log(discriminator2)) - noise data
            Generator update: bigger and bigger discriminator2'''
            modD_0.forward(data_batch=mx.io.DataBatch(data=modG_output, label=None), is_train=True)
            modD_0.backward()

            '''Min_Cost of noise data Generator'''
            Min_cost += modD_0.get_outputs()[0].asnumpy().astype(np.float32)
            diff_v = modD_0.get_input_grads()
            modG.backward(diff_v)
            modG.update()

        '''tensorboard part'''
        Max_C = ((Max_cost_0 + Max_cost_1) / total_batch_number * 1.0).mean()
        Min_C = (Min_cost / total_batch_number * 1.0).mean()
        arg_params, aux_params = modG.get_params()

        # write scalar values
        summary_writer.add_scalar(name="Max_cost", scalar_value=Max_C, global_step=epoch)
        summary_writer.add_scalar(name="Min_cost", scalar_value=Min_C, global_step=epoch)

        # write histogram values
        for name in ["g1_weight", "g2_weight", "g3_weight", "g4_weight", "g5_weight"]:
            summary_writer.add_histogram(name=name, values=arg_params[name].asnumpy().ravel())

        # print costs
        print "epoch : {}".format(epoch)
        print "Max Discriminator Cost : {}".format(Max_C)
        print "Min Generator Cost : {}".format(Min_C)

        # Save the data
        if epoch % save_period == 0:
            # write image values
            generate_image = modG_output[0][0].asnumpy()  # only one image
            generate_image = (generate_image + 1.0) * 127.5
            '''
            Args:
                tag: A name for the generated node. Will also serve as a series name in TensorBoard.
                tensor: A 3-D `uint8` or `float32` `Tensor` of shape `[height, width, channels]`
                    where `channels` is 1, 3, or 4.
            '''
            generate_image = generate_image.astype(np.uint8)  # only dtype uint8 is accepted here; should be improved
            summary_writer.add_image(tag='generate_image_epoch_{}'.format(epoch),
                                     img_tensor=generate_image.transpose(1, 2, 0))

            print('Saving weights')
            if dataset == "MNIST":
                prefix = "MNIST_Weights"
            elif dataset == "CIFAR10":
                prefix = "CIFAR10_Weights"
            else:  # ImageNet
                prefix = "ImageNet_Weights"
            modG.save_params("{}/modG-{}.params".format(prefix, epoch))
            modD_0.save_params("{}/modD_0-{}.params".format(prefix, epoch))

            '''test_method-2'''
            test = mx.random.uniform(low=-1.0, high=1.0,
                                     shape=(column_size * row_size, noise_size, 1, 1), ctx=context)
            test_mod.forward(data_batch=mx.io.DataBatch(data=[test], label=None))
            result = test_mod.get_outputs()[0]
            result = result.asnumpy()

            '''range adjustment: -1 ~ 1 -> 0 ~ 2 -> 0 ~ 255'''
            # result = np.clip((result + 1.0) * (255.0 / 2.0), 0, 255).astype(np.uint8)
            result = ((result + 1.0) * 127.5).astype(np.uint8)

            '''enlarge the images 2x per side (4x the area)'''
            result = np.asarray([[cv2.resize(i, None, fx=2, fy=2, interpolation=cv2.INTER_AREA)
                                  for i in im] for im in result])
            result = result.transpose((0, 2, 3, 1))

            '''visualization'''
            fig, ax = plt.subplots(row_size, column_size, figsize=(column_size, row_size))
            fig.suptitle('generator')
            for j in xrange(row_size):
                for i in xrange(column_size):
                    ax[j][i].set_axis_off()
                    if dataset == "MNIST":
                        ax[j][i].imshow(result[i + j * column_size], cmap='gray')
                    else:  # CIFAR10 / ImageNet
                        ax[j][i].imshow(result[i + j * column_size])

            fig.savefig("Generate_Image/DCGAN_{}_Epoch_{}.png".format(dataset, epoch))
            plt.close(fig)

    print "Optimization complete."

    '''tensorboard part'''
    summary_writer.close()

    ################################# Generating Image #################################
    '''load method 1 - load the trained mod.get_params() directly'''
    # arg_params, aux_params = mod.get_params()

    '''Enable only when running test data, and uncomment only when using 'load method 2'.'''
    # test_mod.set_params(arg_params=arg_params, aux_params=aux_params)

    '''test_method-1'''
    '''
    noise = noise_iter.next()
    test_mod.forward(noise, is_train=False)
    result = test_mod.get_outputs()[0]
    result = result.asnumpy()
    print np.shape(result)
    '''

    '''load method 2 - using the shared_module'''
    """
    Parameters
    shared_module : Module
        Default is `None`. This is used in bucketing. When not `None`, the shared
        module essentially corresponds to a different bucket -- a module with a
        different symbol but with the same sets of parameters (e.g. unrolled RNNs
        with different lengths).
    """
    '''test_method-2'''
    test = mx.random.uniform(low=-1.0, high=1.0,
                             shape=(column_size * row_size, noise_size, 1, 1), ctx=context)
    test_mod.forward(data_batch=mx.io.DataBatch(data=[test], label=None))
    result = test_mod.get_outputs()[0]
    result = result.asnumpy()

    '''range adjustment: -1 ~ 1 -> 0 ~ 2 -> 0 ~ 255'''
    # result = np.clip((result + 1.0) * (255.0 / 2.0), 0, 255).astype(np.uint8)
    result = ((result + 1.0) * 127.5).astype(np.uint8)

    '''enlarge the images 2x per side (4x the area)'''
    result = np.asarray([[cv2.resize(i, None, fx=2, fy=2, interpolation=cv2.INTER_AREA)
                          for i in im] for im in result])
    result = result.transpose((0, 2, 3, 1))

    '''visualization'''
    fig, ax = plt.subplots(row_size, column_size, figsize=(column_size, row_size))
    fig.suptitle('generator')
    for j in xrange(row_size):
        for i in xrange(column_size):
            ax[j][i].set_axis_off()
            if dataset == "MNIST":
                ax[j][i].imshow(result[i + j * column_size], cmap='gray')
            else:  # CIFAR10 / ImageNet
                ax[j][i].imshow(result[i + j * column_size])

    fig.savefig("Generate_Image/DCGAN_{}_Final.png".format(dataset))
    plt.show()
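
# `Mnist_Data_Processing` is not defined in this snippet. A plausible minimal
# sketch, assuming MNIST is fetched with mx.test_utils.get_mnist() and rescaled
# from [0, 1] to the generator's tanh range [-1, 1]; the rescaling and the
# (iterator, sample count) return pair are assumptions:
def Mnist_Data_Processing(batch_size):
    mnist = mx.test_utils.get_mnist()  # 'train_data': (60000, 1, 28, 28) float32 in [0, 1]
    train_data = mnist['train_data'] * 2.0 - 1.0
    train_iter = mx.io.NDArrayIter(data=train_data, batch_size=batch_size, shuffle=True)
    return train_iter, train_data.shape[0]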

def train():
    if use_weights:
        weight = torch.ones(num_classes)
        '''
        # The following weights are taken from
        # https://github.com/Eromera/erfnet_pytorch/blob/master/train/main.py
        weight[0] = 2.8149201869965
        weight[1] = 6.9850029945374
        weight[2] = 3.7890393733978
        weight[3] = 9.9428062438965
        weight[4] = 9.7702074050903
        weight[5] = 9.5110931396484
        weight[6] = 10.311357498169
        weight[7] = 10.026463508606
        weight[8] = 4.6323022842407
        weight[9] = 9.5608062744141
        weight[10] = 7.8698215484619
        weight[11] = 9.5168733596802
        weight[12] = 10.373730659485
        weight[13] = 6.6616044044495
        weight[14] = 10.260489463806
        weight[15] = 10.287888526917
        weight[16] = 10.289801597595
        weight[17] = 10.405355453491
        weight[18] = 10.138095855713
        '''
        # The following weights are calculated using calculate_weights.py (hist.median() / hist)
        weight[0] = 0.0238
        weight[1] = 0.1540
        weight[2] = 0.0447
        weight[3] = 1.3481
        weight[4] = 1.0000
        weight[5] = 0.7090
        weight[6] = 4.6042
        weight[7] = 1.6716
        weight[8] = 0.0622
        weight[9] = 0.7796
        weight[10] = 0.3195
        weight[11] = 0.6157
        weight[12] = 5.2630
        weight[13] = 0.1177
        weight[14] = 3.0565
        weight[15] = 3.2344
        weight[16] = 3.4215
        weight[17] = 8.1690
        weight[18] = 1.9417
    else:
        weight = None

    loader_train = CityscapesLoader2(base_data_folder, split='train', img_size=None,
                                     transforms=data_augmentation_train)
    trainloader = data.DataLoader(loader_train, batch_size=batch_size, num_workers=num_workers,
                                  shuffle=True, pin_memory=True)

    # loader_test = CityscapesLoader2(base_data_folder, split='test', is_transform=True, img_size=None, transforms=data_augmentation)
    # test_loader = data.DataLoader(loader_test, batch_size=batch_size, num_workers=num_workers, shuffle=False, pin_memory=True)

    loader_val = CityscapesLoader2(base_data_folder, split='val', img_size=None,
                                   transforms=data_augmentation_val)
    valloader = data.DataLoader(loader_val, batch_size=batch_size, num_workers=num_workers,
                                shuffle=False, pin_memory=True)

    model = psp_net.PSPNet(num_classes)

    if TBWriter:
        writer = SummaryWriter('./runs/PSP2/')

    '''
    if resume:
        print("Loading from: ", resume_filename)
        saved_state_dict = torch.load(resume_filename)
        if num_classes != 21:
            for i in saved_state_dict:
                # Scale.layer5.conv2d_list.3.weight
                i_parts = i.split('.')
                if i_parts[1] == 'layer5':
                    saved_state_dict[i] = model.state_dict()[i]
        model.load_state_dict(saved_state_dict)
    '''

    if torch.cuda.is_available():
        print("Using GPU")
        model.cuda(0)
        if use_weights:
            weight = weight.cuda()
    else:
        print("Using CPU")
    model.train()

    if opt == "SGD":
        optimizer = torch.optim.SGD([
            {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'],
             'lr': 2 * l_rate},
            {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'],
             'lr': l_rate, 'weight_decay': weight_decay}
        ], momentum=0.9)
    elif opt == "Adam":
        optimizer = torch.optim.Adam([
            {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'],
             'lr': 2 * l_rate},
            {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'],
             'lr': l_rate, 'weight_decay': weight_decay}
        ])

    # defaults when not resuming (the original left these undefined in that case)
    starting_epoch = 0
    starting_iteration = 0
    lr_ = l_rate

    if resume:
        print("Resuming From ", resume_filename)
        checkpoint = torch.load(resume_filename)
        saved_state_dict = checkpoint['state_dict']
        starting_epoch = checkpoint['epoch']
        starting_iteration = int(checkpoint['iter'] % 35700 / batch_size)
        print("Starting epoch: " + str(starting_epoch) + ", starting iter: ", str(starting_iteration))
        if poly_lr:
            lr_ = poly_lr2(l_rate, starting_iteration + len(trainloader) * starting_epoch,
                           lr_decay_iter=1, max_iter=len(trainloader) * epochs)
            if lr_:
                optimizer.param_groups[0]['lr'] = 2 * lr_
                optimizer.param_groups[1]['lr'] = lr_
        model.load_state_dict(saved_state_dict)

    best_metric = 0
    old_file = ""
    old_checkpoint = ""
    train_acc = AverageMeter()
    train_IoU = AverageMeter()
    train_loss = AverageMeter()
    local_acc = AverageMeter(moving_average=moving_average)
    local_IoU = AverageMeter(moving_average=moving_average)
    local_loss = AverageMeter(moving_average=moving_average)

    for epoch in range(starting_epoch, epochs):
        train_acc.reset()
        train_IoU.reset()
        train_loss.reset()
        train_cfmatrix = np.zeros((num_classes, num_classes))
        print("\nEpoch: ", epoch)

        if overlay_during_training and epoch % 1 == 0:
            for i in range(15):
                print("Overlaying image ", i)
                test_img, _ = loader_val[i]
                test_img = test_img.unsqueeze(0)
                # original_img = original_img.unsqueeze(0)
                # original_img = Variable(original_img.cuda())
                model.eval()
                test_pred = model(Variable(test_img.cuda(0), requires_grad=True))
                test_img = Variable(test_img.cuda(0), requires_grad=True)
                # if TBWriter and i == 0:
                #     writer.add_graph(model, test_pred)
                # test_pred = F.upsample_bilinear(test_pred, (1024, 2048))
                overlay_images('', test_img, test_pred, epoch, str(i) + '_', convert_id=False)
                del test_pred
                del test_img
            model.train()

        optimizer.zero_grad()
        with tqdm.tqdm(trainloader, ncols=150) as t:
            if epoch == starting_epoch:
                t.update(starting_iteration)
            for i, (images, labels) in enumerate(t):
                if torch.cuda.is_available():
                    images = Variable(images).cuda(0)
                    labels = Variable(labels).cuda(0)
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                iteration = len(trainloader) * epoch + i
                processed_image = i * batch_size
                if epoch == starting_epoch:
                    iteration += starting_iteration
                    processed_image += starting_iteration * batch_size

                outputs, aux = model(images)
                # g = make_dot(outputs)
                # g.save('./t.dot')
                main_loss = misc.cross_entropy2d(outputs, labels, weight=weight, ignore_index=255)
                aux_loss = misc.cross_entropy2d(aux, labels, ignore_index=255)
                loss = main_loss + 0.4 * aux_loss
                loss = loss / update_batches
                loss.backward()
                t.set_description('Loss: %8.4f - LR = %f' % (update_batches * loss.data[0], lr_))
                train_loss.update(update_batches * loss.data[0])
                local_loss.update(update_batches * loss.data[0])

                acc, IoU, cf_matrix = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                if acc is not None:
                    train_acc.update(acc)
                    train_IoU.update(np.nanmean(IoU))
                    local_acc.update(acc)
                    local_IoU.update(np.nanmean(IoU))
                    train_cfmatrix = train_cfmatrix + cf_matrix

                if i % update_batches == 0:
                    optimizer.step()
                    if poly_lr:
                        lr_ = poly_lr2(l_rate, iteration, lr_decay_iter=1,
                                       max_iter=len(trainloader) * epochs)
                        if lr_:
                            t.set_description('Step: %8.4f - LR = %f' % (update_batches * loss.data[0], lr_))
                            optimizer.param_groups[0]['lr'] = 2 * lr_
                            optimizer.param_groups[1]['lr'] = lr_
                    # print("%8.2f %% -> Loss: %8.6f " % (i / len(trainloader) * 100, loss.data[0]), end='\r')
                    optimizer.zero_grad()

                if local_acc.count > 500 and processed_image % TBUpdate == 0 and TBWriter:
                    writer.add_scalar('Train Accuracy', local_acc.avg, iteration * batch_size)
                    writer.add_scalar('Train IoU', local_IoU.avg, iteration * batch_size)
                    writer.add_scalar('Train Loss', local_loss.avg, iteration * batch_size)

                del outputs
                del loss
                del images
                del labels

                if i > 0 and local_acc.count > 500 and processed_image % checkpoint_save == 0:
                    save_name = base_save_folder + "/checkpoint_" + str(epoch) + "_" + \
                                str(processed_image) + "_" + str(local_IoU.avg) + ".pth.tar"
                    torch.save({
                        'epoch': epoch,
                        'iter': processed_image,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, save_name)
                    print("Model Saved As " + save_name)
                    if os.path.isfile(old_checkpoint):
                        os.remove(old_checkpoint)
                    old_checkpoint = save_name

                t.update(1)
                if i + starting_iteration + 1 == len(trainloader):
                    break

        rows = train_cfmatrix.sum(axis=1)
        cols = train_cfmatrix.sum(axis=0)
        IoU = np.ndarray(train_cfmatrix.shape[0])
        for i in range(train_cfmatrix.shape[0]):
            if rows[i] + cols[i] > 0.:
                IoU[i] = train_cfmatrix[i][i] / (rows[i] + cols[i] - train_cfmatrix[i][i])
            else:
                IoU[i] = np.nan

        print("\nMicro Accuracy: ", train_acc.avg)
        print("Macro Accuracy: ", np.trace(train_cfmatrix) / np.sum(train_cfmatrix))
        print("Micro IoU: ", train_IoU.avg, "\n")
        print("Macro IoU: ", np.nanmean(IoU), "\n")
        print("Train Loss: ", train_loss.avg)

        if check_validation:
            val_IoU = eval(model)
            '''
            # VALIDATION
            val_acc = AverageMeter()
            val_IoU = AverageMeter()
            val_loss = AverageMeter()
            val_cfmatrix = np.zeros((num_classes, num_classes))
            model.eval()
            for i, (images, labels) in enumerate(valloader):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)
                iteration = len(trainloader) * epoch + i
                # poly_lr_scheduler(optimizer, l_rate, iter)
                outputs = model(images)
                loss = cross_entropy2d(outputs, labels, ignore_index=255)
                val_loss.update(loss.data[0])
                acc, IoU, cf_matrix = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                if acc is not None:
                    val_acc.update(acc)
                    val_IoU.update(np.nanmean(IoU))
                    val_cfmatrix = val_cfmatrix + cf_matrix
                del outputs
                del loss
                del images
                del labels
            print("\nVal Accuracy: ", val_acc.avg)
            print("Val Loss: ", val_loss.avg)
            print("Val IoU: ", val_IoU.avg, "\n")
            if TBWriter:
                writer.add_scalar('Val Accuracy', val_acc.avg, epoch)
                writer.add_scalar('Val IoU', val_IoU.avg, epoch)
                writer.add_scalar('Val Loss', val_loss.avg, epoch)
            '''

        save_metric = np.nanmean(IoU)
        if check_validation and doIouOrig:
            save_metric = val_IoU

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                torch.save({
                    'epoch': epoch + 1,
                    'iter': 0,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                print("Model Saved As " + base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar"

        print("Best IoU So Far: ", best_metric)

    if TBWriter:
        writer.close()
    print("End Of Training")
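
# `accuracy_IoU` is not shown in this collection. A minimal sketch consistent
# with the three-value form used above (pixel accuracy, per-class IoU vector,
# confusion matrix); some of the other snippets use a two-value variant without
# the matrix. The bincount construction is a common implementation, not
# necessarily the authors':
def accuracy_IoU(outputs, labels, classes, ignore_index=255):
    num_classes = len(classes)
    pred = outputs.data.max(1)[1].cpu().numpy().ravel()
    gt = labels.data.cpu().numpy().ravel()
    valid = gt != ignore_index
    pred, gt = pred[valid], gt[valid]
    if gt.size == 0:
        return None, None, np.zeros((num_classes, num_classes))
    # confusion matrix via bincount: rows are ground truth, columns predictions
    cf = np.bincount(num_classes * gt.astype(int) + pred,
                     minlength=num_classes ** 2).reshape(num_classes, num_classes)
    acc = np.diag(cf).sum() / cf.sum()
    union = cf.sum(axis=1) + cf.sum(axis=0) - np.diag(cf)
    with np.errstate(divide='ignore', invalid='ignore'):
        IoU = np.diag(cf) / union  # NaN where a class never occurs
    return acc, IoU, cf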

def train():
    weight = torch.ones(num_classes)
    if enc:
        weight[0] = 2.3653597831726
        weight[1] = 4.4237880706787
        weight[2] = 2.9691488742828
        weight[3] = 5.3442072868347
        weight[4] = 5.2983593940735
        weight[5] = 5.2275490760803
        weight[6] = 5.4394111633301
        weight[7] = 5.3659925460815
        weight[8] = 3.4170460700989
        weight[9] = 5.2414722442627
        weight[10] = 4.7376127243042
        weight[11] = 5.2286224365234
        weight[12] = 5.455126285553
        weight[13] = 4.3019247055054
        weight[14] = 5.4264230728149
        weight[15] = 5.4331531524658
        weight[16] = 5.433765411377
        weight[17] = 5.4631009101868
        weight[18] = 5.3947434425354
    else:
        weight[0] = 2.8149201869965
        weight[1] = 6.9850029945374
        weight[2] = 3.7890393733978
        weight[3] = 9.9428062438965
        weight[4] = 9.7702074050903
        weight[5] = 9.5110931396484
        weight[6] = 10.311357498169
        weight[7] = 10.026463508606
        weight[8] = 4.6323022842407
        weight[9] = 9.5608062744141
        weight[10] = 7.8698215484619
        weight[11] = 9.5168733596802
        weight[12] = 10.373730659485
        weight[13] = 6.6616044044495
        weight[14] = 10.260489463806
        weight[15] = 10.287888526917
        weight[16] = 10.289801597595
        weight[17] = 10.405355453491
        weight[18] = 10.138095855713
        # weight[19] = 0

    loader_train = CityscapesLoader2(base_data_folder, split='train', img_size=image_shape,
                                     transforms=data_augmentation_train)
    trainloader = data.DataLoader(loader_train, batch_size=batch_size, num_workers=num_workers,
                                  shuffle=True, pin_memory=True)

    # loader_test = CityscapesLoader2(base_data_folder, split='test', is_transform=True, img_size=None, transforms=data_augmentation)
    # test_loader = data.DataLoader(loader_test, batch_size=batch_size, num_workers=num_workers, shuffle=False, pin_memory=True)

    loader_val = CityscapesLoader2(base_data_folder, split='val', img_size=image_shape,
                                   transforms=data_augmentation_val)
    valloader = data.DataLoader(loader_val, batch_size=batch_size, num_workers=num_workers,
                                shuffle=False, pin_memory=True)

    model = erfnet.ERFNet(num_classes)

    if TBWriter:
        writer = SummaryWriter('./runs/ERF_Fine/')

    '''
    if resume:
        print("Loading from: ", resume_filename)
        saved_state_dict = torch.load(resume_filename)
        if num_classes != 21:
            for i in saved_state_dict:
                # Scale.layer5.conv2d_list.3.weight
                i_parts = i.split('.')
                if i_parts[1] == 'layer5':
                    saved_state_dict[i] = model.state_dict()[i]
        model.load_state_dict(saved_state_dict)
    '''

    if opt == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), l_rate, momentum=0.9,
                                    weight_decay=weight_decay)
    elif opt == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), l_rate, (0.9, 0.999), eps=1e-08,
                                     weight_decay=weight_decay)

    starting_epoch = 0
    starting_iteration = 0
    lr_ = l_rate

    if resume:
        print("Loading Encoder Weights from: ", encoder_weights)
        checkpoint = torch.load(encoder_weights)
        saved_state_dict = checkpoint['state_dict']
        print("Starting epoch: " + str(starting_epoch) + ", starting iter: ", str(starting_iteration))
        if poly_lr:
            lr_ = poly_lr2(l_rate, starting_iteration + len(trainloader) * starting_epoch,
                           lr_decay_iter=1, max_iter=len(trainloader) * epochs)
            if lr_:
                optimizer.param_groups[0]['lr'] = lr_
        if enc:
            model.load_state_dict(saved_state_dict)
        else:
            pretrained_enc = erfnet.ERFNet(num_classes)
            pretrained_enc.load_state_dict(saved_state_dict)
            pretrained_enc = pretrained_enc.encoder
            if freeze_encoder:
                pretrained_enc.eval()
                for param in pretrained_enc.parameters():
                    param.requires_grad = False
            decoder = model.decoder
            if decoder_weights is not None:
                print("Loading Decoder Weights from: ", decoder_weights)
                checkpoint = torch.load(decoder_weights)
                saved_state_dict = checkpoint['state_dict']
                # starting_epoch = checkpoint['epoch']
                # starting_iteration = int(checkpoint['iter'] % 2975 / batch_size)
                decoder.load_state_dict(saved_state_dict)
            model.encoder = pretrained_enc
            model.decoder = decoder

    if torch.cuda.is_available():
        print("Using GPU")
        model.cuda(0)
        if use_weights:
            weight = weight.cuda()
    else:
        print("Using CPU")
    model.train()

    best_metric = 0
    old_file = ""
    old_checkpoint = ""
    train_acc = AverageMeter()
    train_IoU = AverageMeter()
    train_loss = AverageMeter()
    local_acc = AverageMeter(moving_average=moving_average)
    local_IoU = AverageMeter(moving_average=moving_average)
    local_loss = AverageMeter(moving_average=moving_average)
    mean_time = AverageMeter()

    for epoch in range(starting_epoch, epochs):
        train_acc.reset()
        train_IoU.reset()
        train_loss.reset()
        mean_time.reset()
        train_cfmatrix = np.zeros((num_classes, num_classes))
        print(colors.YELLOW + "========== EPOCH:" + str(epoch) + " ==========" + colors.ENDC)

        model.train()
        optimizer.zero_grad()
        with tqdm.tqdm(trainloader, ncols=150) as t:
            if epoch == starting_epoch:
                t.update(starting_iteration)
            for i, (images, labels) in enumerate(t):
                start_time = time.time()
                if torch.cuda.is_available():
                    images = Variable(images).cuda(0)
                    labels = Variable(labels).cuda(0)
                else:
                    images = Variable(images)
                    labels = Variable(labels)

                iteration = len(trainloader) * epoch + i
                processed_image = i * batch_size
                if epoch == starting_epoch:
                    iteration += starting_iteration
                    processed_image += starting_iteration * batch_size

                if enc:
                    outputs = model(images, only_encode=True)
                else:
                    outputs = model(images, only_encode=False)
                # g = make_dot(outputs)
                # g.save('./t.dot')
                main_loss = misc.cross_entropy2d(outputs, labels, weight=weight, ignore_index=255)
                loss = main_loss
                loss = loss / update_batches
                loss.backward()

                mean_time.update(time.time() - start_time)
                t.set_description('Loss: %8.4f - Time: %8.4f - LR: %8.6f' %
                                  (update_batches * loss.data[0], mean_time.avg / batch_size, lr_))
                train_loss.update(update_batches * loss.data[0])
                local_loss.update(update_batches * loss.data[0])

                if doTrainStats:
                    if doCFMatrixTrain:
                        acc, IoU, cf_matrix = accuracy_IoU_CFMatrix(outputs, labels,
                                                                    np.array(range(num_classes)))
                        IoU = IoU.mean()  # call the method; `IoU.mean` alone was a bound method, not a value
                    else:
                        acc, IoU = accuracy_IoU(outputs, labels, np.array(range(num_classes)))
                    if acc is not None:
                        train_acc.update(acc)
                        train_IoU.update(IoU)
                        local_acc.update(acc)
                        local_IoU.update(IoU)
                        if doCFMatrixTrain:
                            train_cfmatrix = train_cfmatrix + cf_matrix

                if i % update_batches == 0:
                    optimizer.step()
                    if poly_lr:
                        lr_ = poly_lr2(l_rate, iteration, lr_decay_iter=1,
                                       max_iter=len(trainloader) * epochs)
                        if lr_:
                            t.set_description('Step: %8.4f - Time: %8.4f - LR: %8.6f' %
                                              (update_batches * loss.data[0], mean_time.avg / batch_size, lr_))
                            optimizer.param_groups[0]['lr'] = lr_
                    # print("%8.2f %% -> Loss: %8.6f " % (i / len(trainloader) * 100, loss.data[0]), end='\r')
                    optimizer.zero_grad()

                if local_loss.count > int(500 / batch_size) and processed_image % TBUpdate == 0 and TBWriter:
                    writer.add_scalar('Train Loss', local_loss.avg, iteration * batch_size)
                    if doTrainStats:
                        writer.add_scalar('Train Accuracy', local_acc.avg, iteration * batch_size)
                        writer.add_scalar('Train IoU', local_IoU.avg, iteration * batch_size)

                del outputs
                del loss
                del images
                del labels

                if i > 0 and local_loss.count > int(500 / batch_size) and processed_image % checkpoint_save == 0:
                    save_name = base_save_folder + "/checkpoint_" + str(epoch) + "_" + \
                                str(processed_image) + "_" + str(local_loss.avg) + ".pth.tar"
                    torch.save({
                        'epoch': epoch,
                        'iter': processed_image,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, save_name)
                    print("\nModel Saved As " + save_name)
                    if os.path.isfile(old_checkpoint):
                        os.remove(old_checkpoint)
                    old_checkpoint = save_name

                t.update(1)

        if doCFMatrixTrain and doTrainStats:
            rows = train_cfmatrix.sum(axis=1)
            cols = train_cfmatrix.sum(axis=0)
            IoU = np.ndarray(train_cfmatrix.shape[0])
            for i in range(train_cfmatrix.shape[0]):
                if rows[i] + cols[i] > 0.:
                    IoU[i] = train_cfmatrix[i][i] / (rows[i] + cols[i] - train_cfmatrix[i][i])
                else:
                    IoU[i] = np.nan
            print("Macro IoU: ", np.nanmean(IoU), "\n")
            print("Macro Accuracy: ", np.trace(train_cfmatrix) / np.sum(train_cfmatrix))

        if doTrainStats:
            print("\nMicro Accuracy: ", train_acc.avg)
            print("Micro IoU: ", train_IoU.avg, "\n")
        print("Train Loss: ", train_loss.avg)

        if check_validation:
            val_IoU = eval(model)
            if TBWriter:
                writer.add_scalar('Val IoU', val_IoU, epoch)

        if doCFMatrixTrain:
            save_metric = np.nanmean(IoU)
        elif doTrainStats:
            save_metric = train_IoU.avg
        else:
            save_metric = train_loss.avg
        if check_validation and doIouOrig:
            save_metric = val_IoU

        if best_metric < save_metric:
            best_metric = save_metric
            print("New Best IoU!")
            if save:
                torch.save({
                    'epoch': epoch + 1,
                    'iter': 0,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                print("Model Saved As " + base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar")
                if os.path.isfile(old_file):
                    os.remove(old_file)
                old_file = base_save_folder + "/checkpoint_" + str(epoch) + "_" + str(save_metric) + ".pth.tar"

        print("Best IoU So Far: ", best_metric)

    if TBWriter:
        writer.close()
    print("End Of Training")
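
# `misc.cross_entropy2d` / `cross_entropy2d` is used by every segmentation
# snippet above but defined in none of them. A minimal sketch for this
# PyTorch 0.3-era code, assuming raw NCHW logits and an NHW label map: flatten
# the spatial dimensions so each pixel becomes one classification sample, then
# defer to F.cross_entropy:
def cross_entropy2d(outputs, labels, weight=None, ignore_index=255):
    n, c, h, w = outputs.size()
    outputs = outputs.permute(0, 2, 3, 1).contiguous().view(-1, c)  # (N*H*W, C)
    labels = labels.view(-1)                                        # (N*H*W,)
    return F.cross_entropy(outputs, labels, weight=weight, ignore_index=ignore_index)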