def main(args):
    start_time = time.time()

    model = utils.build_model(args)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.milestone, gamma=0.1, last_epoch=-1)

    train_loader, test_loader = datasets.dataloader_builder(args)

    train_loss = np.zeros(args.epochs)
    test_loss, test_accu = np.zeros(args.epochs), np.zeros(args.epochs)

    print('\n\r\t#### Start Training ####')
    for epoch in range(args.epochs):
        train_loss[epoch] = trainer(model, train_loader, optimizer, criterion)
        test_loss[epoch], test_accu[epoch] = tester(model, test_loader)
        scheduler.step()
        # print(scheduler.get_lr()[0])

        print('| Epoch: {0:3d} | Training Loss: {1:.6f} | Test Accuracy: {2:.2f} | Test Loss {3:.6f} |'
              .format(epoch, train_loss[epoch], test_accu[epoch], test_loss[epoch]))

    print('\t#### Time Consumed: {0:.3f} second ####\n\r'.format(time.time() - start_time))

    utils.saveCheckpoint(args.cp_dir, args.model_name, epoch, model, optimizer,
                         test_accu, train_loss, args.bn, args.weight_decay)
    utils.plotCurve(args, train_loss / args.trn_batch, test_loss, test_accu)

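# --------------------------------------------------------------------------
# Illustrative sketch only: how main(args) above might be driven from the
# command line. The flag names mirror the attributes read inside main();
# the types and default values are assumptions, and utils.build_model /
# datasets.dataloader_builder may read additional flags not listed here.
# --------------------------------------------------------------------------
def parse_args_sketch():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', type=float, default=0.1)                  # assumed default
    parser.add_argument('--weight_decay', type=float, default=5e-4)       # assumed default
    parser.add_argument('--milestone', type=int, nargs='+', default=[80, 120])
    parser.add_argument('--epochs', type=int, default=160)
    parser.add_argument('--trn_batch', type=int, default=128)             # used to normalize the plotted training loss
    parser.add_argument('--bn', action='store_true')                      # assumed to be a boolean flag
    parser.add_argument('--cp_dir', type=str, default='./checkpoints')
    parser.add_argument('--model_name', type=str, default='model')
    return parser.parse_args()
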
def pretrain(args, dataloaders, model, criterion, optimizer):
    print('\n Pretrain...\n')

    # Initialize variables
    dataloader_HAND3D_train = dataloaders['HAND3D']['train']
    dataloader_HAND3D_valid = dataloaders['HAND3D']['valid']
    # dataloader_STEREO_valid = dataloaders['STEREO']['valid']
    loss_valid_best = 1000.0
    loss_valid_delta = 0.0

    # Pretrain the model
    for epoch in range(args.max_epochs_pretrain):
        # Adjust the learning rate for this epoch
        learning_rate = adjustLR(optimizer, epoch, args.lr_base_pretrain,
                                 policy=args.lr_policy_pretrain,
                                 policy_parameter=args.lr_policy_param_pretrain)

        # Initialize metrics
        metrics = {'loss': [],
                   'loss_list': {'loss_2d': [], 'loss_3d': [], 'loss_mask': [],
                                 'loss_reg': [], 'loss_camera': [],
                                 'avg_distance_2d': [list() for _ in range(args.n_kps)],
                                 'avg_distance_3d': [list() for _ in range(args.n_kps)]}}

        for i, data in enumerate(dataloader_HAND3D_train):
            # Move the batch to CUDA
            image, mask, targets, index = setCUDA(args, data)

            # Forward pass
            optimizer.zero_grad()
            predictions = model(image, right=True)

            # Get loss
            loss, loss_list = criterion(epoch, mask, predictions, targets)

            # Optimize the model
            loss.backward()
            optimizer.step()

            # Keep track of metrics
            metrics['loss'].append(loss.item())
            metrics['loss_list'] = convertLossList(metrics['loss_list'], loss_list)

            # Print log
            if (i + 1) % 50 == 0:
                saveLog(args, epoch, args.max_epochs_pretrain, i, dataloader_HAND3D_train,
                        learning_rate, loss, metrics, mode='Pretr')

        # Validation
        loss_HAND3D_valid = valid(args, epoch, args.max_epochs_pretrain, learning_rate,
                                  dataloader_HAND3D_valid, model, criterion, mode='Pretr',
                                  display_2D=True, display_3D=False)
        # loss_STEREO_valid = valid(args, epoch, args.max_epochs_pretrain, learning_rate,
        #                           dataloader_STEREO_valid, model, criterion, mode='Pretr',
        #                           display_2D=True, display_3D=False)

        # Save the model checkpoints
        if (epoch + 1) % args.interval_checkpoint == 0:
            saveCheckpoint(args, model, optimizer, pretrain=True)

        # Save the best model
        if loss_HAND3D_valid < (loss_valid_best - loss_valid_delta):
            loss_valid_best = loss_HAND3D_valid
            saveCheckpointBestModel(args, model, optimizer, pretrain=True)

def continuous_frame_recognition():
    """ Use an RNN to recognize the action over continuous frames. """
    start_epoch = 1

    # ------------------------------------------------------
    # Create model, optimizer, scheduler, and loss function
    # ------------------------------------------------------
    # extractor = resnet50(pretrained=True).to(DEVICE)
    recurrent = LSTM_Net(2048, opt.hidden_dim, opt.output_dim,
                         num_layers=opt.layers, bias=True, batch_first=False,
                         dropout=opt.dropout, bidirectional=opt.bidirection,
                         seq_predict=False).to(DEVICE)

    # ----------------------------------------------
    # For a single-direction LSTM:
    #   weight_ih_l0 torch.Size([512, 2048])
    #   weight_hh_l0 torch.Size([512, 128])
    #   bias_ih_l0   torch.Size([512])
    #   bias_hh_l0   torch.Size([512])
    #
    # For a bidirectional LSTM, a reverse layer is added:
    #   weight_ih_l0_reverse torch.Size([512, 2048])
    #   weight_hh_l0_reverse torch.Size([512, 128])
    #   bias_ih_l0_reverse   torch.Size([512])
    #   bias_hh_l0_reverse   torch.Size([512])
    # ----------------------------------------------

    # Weight init
    if "orthogonal" in opt.weight_init:
        for layer, param in recurrent.recurrent.named_parameters():
            print("{} {}".format(layer, param.shape))
            if len(param.shape) >= 2:
                nn.init.orthogonal_(param)

    # Bias init: the forget-gate bias occupies the second quarter of the bias vector
    if "forget_bias_0" in opt.bias_init:
        for layer, param in recurrent.recurrent.named_parameters():
            if layer.startswith("bias"):
                size = param.shape[0]
                start, end = int(size * 0.25), int(size * 0.5)
                param[start:end].data.fill_(0)

    if "forget_bias_1" in opt.bias_init:
        for layer, param in recurrent.recurrent.named_parameters():
            if layer.startswith("bias"):
                size = param.shape[0]
                start, end = int(size * 0.25), int(size * 0.5)
                param[start:end].data.fill_(1)

    # Set optimizer
    if opt.optimizer == "Adam":
        optimizer = optim.Adam(recurrent.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), weight_decay=opt.weight_decay)
    elif opt.optimizer == "SGD":
        optimizer = optim.SGD(recurrent.parameters(), lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay)
    elif opt.optimizer == "ASGD":
        optimizer = optim.ASGD(recurrent.parameters(), lr=opt.lr, lambd=1e-4, alpha=0.75, t0=1000000.0, weight_decay=opt.weight_decay)
    elif opt.optimizer == "Adadelta":
        optimizer = optim.Adadelta(recurrent.parameters(), lr=opt.lr, rho=0.9, eps=1e-06, weight_decay=opt.weight_decay)
    elif opt.optimizer == "Adagrad":
        optimizer = optim.Adagrad(recurrent.parameters(), lr=opt.lr, lr_decay=0, weight_decay=opt.weight_decay, initial_accumulator_value=0)
    elif opt.optimizer == "SparseAdam":
        optimizer = optim.SparseAdam(recurrent.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08)
    elif opt.optimizer == "Adamax":
        optimizer = optim.Adamax(recurrent.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08, weight_decay=opt.weight_decay)
    else:
        raise ValueError("Unsupported optimizer: {}".format(opt.optimizer))

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=opt.milestones, gamma=opt.gamma)

    # Load parameters
    if opt.pretrain:
        recurrent = utils.loadModel(opt.pretrain, recurrent)
    if opt.resume:
        recurrent, optimizer, start_epoch, scheduler = utils.loadCheckpoint(opt.resume, recurrent, optimizer, scheduler)

    # Set criterion
    criterion = nn.CrossEntropyLoss().to(DEVICE)

    # Set dataloader
    transform = transforms.ToTensor()

    trainlabel = os.path.join(opt.train, "label", "gt_train.csv")
    trainfeature = os.path.join(opt.train, "feature", "train")
    vallabel = os.path.join(opt.val, "label", "gt_valid.csv")
    valfeature = os.path.join(opt.val, "feature", "valid")

    train_set = dataset.TrimmedVideos(None, trainlabel, trainfeature, downsample=opt.downsample, transform=transform)
    train_loader = DataLoader(train_set, batch_size=opt.batch_size, shuffle=True, collate_fn=utils.collate_fn, num_workers=opt.threads)

    # Show the memory used by the neural network
    print("The neural network allocated GPU with {:.1f} MB".format(torch.cuda.memory_allocated() / 1024 / 1024))

    # -----------------
    # Train the models
    # -----------------
    trainloss = []
    trainaccs = []
    valloss = []
    valaccs = []
    epochs = []

    for epoch in range(start_epoch, opt.epochs + 1):
        scheduler.step()

        # Save the train loss and train accuracy
        max_trainaccs = max(trainaccs) if len(trainaccs) else 0
        min_trainloss = min(trainloss) if len(trainloss) else 0
        recurrent, loss, acc = train(recurrent, train_loader, optimizer, epoch, criterion, max_trainaccs, min_trainloss)
        trainloss.append(loss)
        trainaccs.append(acc)

        # Validate the model with several downsample ratios
        loss_list, acc_list, label_list = [], [], []
        for downsample in [1, 2, 4, 6, 12]:
            val_set = dataset.TrimmedVideos(None, vallabel, valfeature, downsample=downsample, transform=transform)
            val_loader = DataLoader(val_set, batch_size=1, shuffle=True, collate_fn=utils.collate_fn, num_workers=opt.threads)

            print("[Epoch {}] [Validation] [Downsample: {:2d}]".format(epoch, downsample))
            acc, loss = val(recurrent, val_loader, epoch, criterion)

            loss_list.append(loss)
            acc_list.append(acc)
            label_list.append('val_{}'.format(downsample))

        valloss.append(loss_list)
        valaccs.append(acc_list)

        # Save the epochs
        epochs.append(epoch)

        # with open(os.path.join(opt.log, "problem_2", opt.tag, 'statistics.txt'), 'w') as textfile:
        #     textfile.write("\n".join(map(lambda x: str(x), (trainloss, trainaccs, valloss, valaccs, epochs))))

        records = list(map(lambda x: np.array(x), (trainloss, trainaccs, valloss, valaccs, epochs)))
        for record, name in zip(records, ('trainloss.txt', 'trainaccs.txt', 'valloss.txt', 'valaccs.txt', 'epochs.txt')):
            np.savetxt(os.path.join(opt.log, "problem_2", opt.tag, name), record)

        if epoch % opt.save_interval == 0:
            savepath = os.path.join(opt.checkpoints, "problem_2", opt.tag, str(epoch) + '.pth')
            utils.saveCheckpoint(savepath, recurrent, optimizer, scheduler, epoch)

        # Draw the accuracy / loss curve
        draw_graphs(trainloss, valloss, trainaccs, valaccs, epochs, "problem_2", label_list)

    return recurrent

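# --------------------------------------------------------------------------
# Note: the optimizer if/elif chain above is repeated verbatim in
# single_frame_recognition() and temporal_action_segmentation() below.
# The factory below is an illustrative sketch of how that chain could be
# shared; it is not part of the original script and simply reuses the same
# hyperparameters (opt.lr, opt.b1, opt.b2, opt.momentum, opt.weight_decay).
# --------------------------------------------------------------------------
def build_optimizer_sketch(params, opt):
    """Return a torch optimizer selected by opt.optimizer (illustrative sketch)."""
    from torch import optim

    if opt.optimizer == "Adam":
        return optim.Adam(params, lr=opt.lr, betas=(opt.b1, opt.b2), weight_decay=opt.weight_decay)
    if opt.optimizer == "SGD":
        return optim.SGD(params, lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay)
    if opt.optimizer == "ASGD":
        return optim.ASGD(params, lr=opt.lr, lambd=1e-4, alpha=0.75, t0=1000000.0, weight_decay=opt.weight_decay)
    if opt.optimizer == "Adadelta":
        return optim.Adadelta(params, lr=opt.lr, rho=0.9, eps=1e-06, weight_decay=opt.weight_decay)
    if opt.optimizer == "Adagrad":
        return optim.Adagrad(params, lr=opt.lr, lr_decay=0, weight_decay=opt.weight_decay, initial_accumulator_value=0)
    if opt.optimizer == "SparseAdam":
        return optim.SparseAdam(params, lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08)
    if opt.optimizer == "Adamax":
        return optim.Adamax(params, lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08, weight_decay=opt.weight_decay)
    raise ValueError("Unsupported optimizer: {}".format(opt.optimizer))
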
def main(opt):
    """
    Main process of train.py

    Parameters
    ----------
    opt : namespace
        The options (hyperparameters) of the model
    """
    if opt.fixrandomseed:
        seed = 1334
        torch.manual_seed(seed)
        if opt.cuda:
            torch.cuda.manual_seed(seed)

    print("==========> Loading datasets")
    img_transform = Compose([ToTensor(), Normalize(mean=[0.485, 0.456, 0.406],
                                                   std=[0.229, 0.224, 0.225])]) if opt.normalize else ToTensor()

    # Dataset
    train_loader, val_loader = getDataset(opt, img_transform)

    # TODO: Parameters Selection
    # TODO: Mean shift Layer Handling

    # Load Model
    print("==========> Building model")
    model = ImproveNet(opt.rb)

    # ----------------------------------------------- #
    # Loss: L1 Norm / L2 Norm                          #
    #   Perceptual Model (Optional)                    #
    #   TODO Append Layer (Optional)                   #
    # ----------------------------------------------- #
    criterion = nn.MSELoss(reduction='mean')
    perceptual = None if (opt.perceptual is None) else getPerceptualModel(opt.perceptual).eval()

    # ----------------------------------------------- #
    # Optimizer and learning rate scheduler            #
    # ----------------------------------------------- #
    print("==========> Setting Optimizer: {}".format(opt.optimizer))
    optimizer = getOptimizer(model, opt)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=opt.milestones, gamma=opt.gamma)

    # ----------------------------------------------- #
    # Option: resume training process from checkpoint  #
    # ----------------------------------------------- #
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            model, optimizer, _, _, scheduler = utils.loadCheckpoint(opt.resume, model, optimizer, scheduler)
        else:
            raise Exception("=> no checkpoint found at '{}'".format(opt.resume))

    # ----------------------------------------------- #
    # Option: load weights from a pretrained network   #
    # ----------------------------------------------- #
    if opt.pretrained:
        if os.path.isfile(opt.pretrained):
            print("=> loading pretrained model '{}'".format(opt.pretrained))
            model = utils.loadModel(opt.pretrained, model, True)
        else:
            raise Exception("=> no pretrained model found at '{}'".format(opt.pretrained))

    # Select training device
    if opt.cuda:
        print("==========> Setting GPU")
        model = nn.DataParallel(model, device_ids=[i for i in range(opt.gpus)]).cuda()
        criterion = criterion.cuda()
        if perceptual is not None:
            perceptual = perceptual.cuda()
    else:
        print("==========> Setting CPU")
        model = model.cpu()
        criterion = criterion.cpu()
        if perceptual is not None:
            perceptual = perceptual.cpu()

    # Create containers for the per-iteration statistics
    length = opt.epochs * len(train_loader) // opt.val_interval
    loss_iter = np.empty(length, dtype=float)
    perc_iter = np.empty(length, dtype=float)
    psnr_iter = np.empty(length, dtype=float)
    ssim_iter = np.empty(length, dtype=float)
    mse_iter = np.empty(length, dtype=float)
    lr_iter = np.empty(length, dtype=float)
    iterations = np.empty(length, dtype=float)

    loss_iter[:] = np.nan
    perc_iter[:] = np.nan
    psnr_iter[:] = np.nan
    ssim_iter[:] = np.nan
    mse_iter[:] = np.nan
    lr_iter[:] = np.nan
    iterations[:] = np.nan

    # Set plotter to plot the loss curves
    twinx = (opt.perceptual is not None)
    fig, axis = getFigureSpec(len(train_loader), twinx)

    # Set model saving function
    if opt.save_item == "model":
        print("==========> Save Function: saveModel()")
        saveCheckpoint = utils.saveModel
    elif opt.save_item == "checkpoint":
        print("==========> Save Function: saveCheckpoint()")
        saveCheckpoint = utils.saveCheckpoint
    else:
        raise ValueError("Save Checkpoint Function Error")

    # Start Training
    print("==========> Training")
    for epoch in range(opt.starts, opt.epochs + 1):
        loss_iter, perc_iter, mse_iter, psnr_iter, ssim_iter, lr_iter, iterations, _, _ = train(
            model, optimizer, criterion, perceptual, train_loader, val_loader, scheduler, epoch,
            loss_iter, perc_iter, mse_iter, psnr_iter, ssim_iter, lr_iter, iterations, opt,
            name, fig, axis, saveCheckpoint)

        scheduler.step()

        # Save the last checkpoint for resuming training
        utils.saveCheckpoint(os.path.join(opt.checkpoints, name, "final.pth"), model,
                             optimizer, scheduler, epoch, len(train_loader))

    # TODO: Fine tuning

    return

    targets = targets.squeeze().long().cuda()
    outputs = F.log_softmax(model(static, time_variying), dim=1)
    log.update(outputs, targets)
    log.printLog(epoch)


train_loader = DataLoader(data_train, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
test_loader = DataLoader(data_test, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

for epoch in range(args.epochs):
    print('--Training epoch {}'.format(epoch))
    train(epoch, train_loader, log_tr)

    print('--Testing...')
    test(epoch, test_loader, log_te)

    saveCheckpoint(
        {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'test_log': log_te,
            'args': args,
        },
        './log/' + args.prefix + args.optimizer + '_Test/fgam' + str(epoch) + '.pth.tar')

def single_frame_recognition():
    """ Use a 2D CNN to recognize the action from sampled frames. """
    # ------------------------------------------------------
    # Create model, optimizer, scheduler, and loss function
    # ------------------------------------------------------
    extractor = resnet50(pretrained=True).to(DEVICE)
    classifier = Classifier(2048 * opt.sample, [2048], num_class=opt.output_dim).to(DEVICE)
    print(classifier)

    # Set optimizer
    if opt.optimizer == "Adam":
        optimizer = optim.Adam(classifier.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), weight_decay=opt.weight_decay)
    elif opt.optimizer == "SGD":
        optimizer = optim.SGD(classifier.parameters(), lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay)
    elif opt.optimizer == "ASGD":
        optimizer = optim.ASGD(classifier.parameters(), lr=opt.lr, lambd=1e-4, alpha=0.75, t0=1000000.0, weight_decay=opt.weight_decay)
    elif opt.optimizer == "Adadelta":
        optimizer = optim.Adadelta(classifier.parameters(), lr=opt.lr, rho=0.9, eps=1e-06, weight_decay=opt.weight_decay)
    elif opt.optimizer == "Adagrad":
        optimizer = optim.Adagrad(classifier.parameters(), lr=opt.lr, lr_decay=0, weight_decay=opt.weight_decay, initial_accumulator_value=0)
    elif opt.optimizer == "SparseAdam":
        optimizer = optim.SparseAdam(classifier.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08)
    elif opt.optimizer == "Adamax":
        optimizer = optim.Adamax(classifier.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08, weight_decay=opt.weight_decay)
    else:
        raise ValueError("Unsupported optimizer: {}".format(opt.optimizer))

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=opt.milestones, gamma=opt.gamma)
    criterion = nn.CrossEntropyLoss().to(DEVICE)
    transform = transforms.ToTensor()

    trainlabel = os.path.join(opt.train, "label", "gt_train.csv")
    trainfeature = os.path.join(opt.train, "feature", "train")
    vallabel = os.path.join(opt.val, "label", "gt_valid.csv")
    valfeature = os.path.join(opt.val, "feature", "valid")

    train_set = dataset.TrimmedVideos(None, trainlabel, trainfeature, sample=4, transform=transform)
    val_set = dataset.TrimmedVideos(None, vallabel, valfeature, sample=4, transform=transform)
    train_loader = DataLoader(train_set, batch_size=opt.batch_size, shuffle=True, drop_last=True, num_workers=opt.threads)
    val_loader = DataLoader(val_set, batch_size=opt.batch_size, shuffle=False, drop_last=True, num_workers=opt.threads)

    # Show the memory used by the neural network
    print("The neural network allocated GPU with {:.1f} MB".format(torch.cuda.memory_allocated() / 1024 / 1024))

    # -----------------
    # Train the models
    # -----------------
    trainloss = []
    trainaccs = []
    valloss = []
    valaccs = []
    epochs = []

    for epoch in range(1, opt.epochs + 1):
        scheduler.step()

        # Save the train loss and train accuracy
        extractor, classifier, loss, acc = train(extractor, classifier, train_loader, optimizer, epoch, criterion)
        trainloss.append(loss)
        trainaccs.append(acc)

        # Save the validation loss and validation accuracy
        acc, loss = val(extractor, classifier, val_loader, epoch, criterion)
        valloss.append(loss)
        valaccs.append(acc)

        # Save the epochs
        epochs.append(epoch)

        records = list(map(lambda x: np.array(x), (trainloss, trainaccs, valloss, valaccs, epochs)))
        for record, name in zip(records, ('trainloss.txt', 'trainaccs.txt', 'valloss.txt', 'valaccs.txt', 'epochs.txt')):
            np.savetxt(os.path.join(opt.log, "problem_1", opt.tag, name), record)

        if epoch % opt.save_interval == 0:
            savepath = os.path.join(opt.checkpoints, "problem_1", opt.tag, str(epoch) + '.pth')
            utils.saveCheckpoint(savepath, classifier, optimizer, scheduler, epoch)

        # Draw the accuracy / loss curve
        draw_graphs(trainloss, valloss, trainaccs, valaccs, epochs)

    return extractor, classifier

def train(model, optimizer, criterion, trainset, logfile_path="./logfile.csv",
          batch_size=8, shuffle=True, epoch=0, num_epochs=2,
          checkpoint_dir="./checkpoints", checkpoint_basename="checkpoint_",
          save_freq=5):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=shuffle, num_workers=8)
    del trainset
    torch.cuda.empty_cache()

    if not os.path.isdir(checkpoint_dir):
        sys.exit("Error: Supplied checkpoint directory does not exist or is a file.")

    if epoch != 0:
        print("Resuming training from epoch %d of %d." % (epoch, num_epochs))
        print("Batch size is %d, saving checkpoint to %s every %d epochs"
              % (batch_size, checkpoint_dir, save_freq))
    else:
        print("Beginning training with batch size %d, saving checkpoint to %s every %d epochs"
              % (batch_size, checkpoint_dir, save_freq))

    print("Training log output as CSV to: " + logfile_path +
          ", with headers 'epoch, batch, loss <over previous 10 batches>, "
          "time <for previous 10 batches, seconds>'")
    with open(logfile_path, 'a') as logfile:
        logfile.write('epoch, batch, loss, time\n')

    for epoch in range(epoch, num_epochs):
        running_loss = 0.0
        start_time = time.time()

        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs = inputs.float()
            inputs = inputs.to(device)
            labels = labels.float()
            labels = labels.to(device)

            print(inputs.shape)
            print(labels.shape)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            del outputs, inputs
            torch.cuda.empty_cache()

            loss = loss.to(device)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 10 == 9:
                duration = time.time() - start_time
                with open(logfile_path, 'a') as logfile:
                    print('[E: %d, B: %2d] loss: %.3f, took %.3f secs'
                          % (epoch + 1, i + 1, running_loss / 10, duration))
                    logfile.write('%d, %d, %.3f, %.3f\n' % (epoch, i, running_loss, duration))
                running_loss = 0.0
                start_time = time.time()

        if epoch % save_freq == (save_freq - 1):
            print('Saving checkpoint for epoch %d' % (epoch + 1))
            utils.saveCheckpoint(model, epoch, optimizer, loss,
                                 (os.path.join(checkpoint_dir, checkpoint_basename) + '%d.pt') % (epoch))

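# --------------------------------------------------------------------------
# utils.saveCheckpoint used above is a project helper whose implementation is
# not shown here. A minimal sketch compatible with the 5-argument call in
# train() above, assuming it simply bundles the usual state dicts with
# torch.save, could look like this:
# --------------------------------------------------------------------------
def save_checkpoint_sketch(model, epoch, optimizer, loss, path):
    import torch
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss}, path)
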
def temporal_action_segmentation():
    """ Use an RNN to segment the action over full-length videos. """
    start_epoch = 1

    # ------------------------------------------------------
    # Create model, optimizer, scheduler, and loss function
    # ------------------------------------------------------
    recurrent = LSTM_Net(2048, opt.hidden_dim, opt.output_dim,
                         num_layers=opt.layers, bias=True, batch_first=False,
                         dropout=opt.dropout, bidirectional=opt.bidirection,
                         seq_predict=True).to(DEVICE)

    # Weight init
    if "orthogonal" in opt.weight_init:
        for layer, param in recurrent.recurrent.named_parameters():
            print("{} {}".format(layer, param.shape))
            if len(param.shape) >= 2:
                nn.init.orthogonal_(param)

    # Bias init: the forget-gate bias occupies the second quarter of the bias vector
    if "forget_bias_0" in opt.bias_init:
        for layer, param in recurrent.recurrent.named_parameters():
            if layer.startswith("bias"):
                start = int(param.shape[0] * 0.25)
                end = int(param.shape[0] * 0.5)
                param[start:end].data.fill_(0)

    if "forget_bias_1" in opt.bias_init:
        for layer, param in recurrent.recurrent.named_parameters():
            if layer.startswith("bias"):
                start = int(param.shape[0] * 0.25)
                end = int(param.shape[0] * 0.5)
                param[start:end].data.fill_(1)

    # Set optimizer
    if opt.optimizer == "Adam":
        optimizer = optim.Adam(recurrent.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), weight_decay=opt.weight_decay)
    elif opt.optimizer == "SGD":
        optimizer = optim.SGD(recurrent.parameters(), lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay)
    elif opt.optimizer == "ASGD":
        optimizer = optim.ASGD(recurrent.parameters(), lr=opt.lr, lambd=1e-4, alpha=0.75, t0=1000000.0, weight_decay=opt.weight_decay)
    elif opt.optimizer == "Adadelta":
        optimizer = optim.Adadelta(recurrent.parameters(), lr=opt.lr, rho=0.9, eps=1e-06, weight_decay=opt.weight_decay)
    elif opt.optimizer == "Adagrad":
        optimizer = optim.Adagrad(recurrent.parameters(), lr=opt.lr, lr_decay=0, weight_decay=opt.weight_decay, initial_accumulator_value=0)
    elif opt.optimizer == "SparseAdam":
        optimizer = optim.SparseAdam(recurrent.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08)
    elif opt.optimizer == "Adamax":
        optimizer = optim.Adamax(recurrent.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2), eps=1e-08, weight_decay=opt.weight_decay)
    else:
        raise ValueError("Unsupported optimizer: {}".format(opt.optimizer))

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=opt.milestones, gamma=opt.gamma)

    # Load parameters
    if opt.pretrain:
        recurrent = utils.loadModel(opt.pretrain, recurrent)
        print("Loaded pretrain model: {}".format(opt.pretrain))
    if opt.resume:
        recurrent, optimizer, start_epoch, scheduler = utils.loadCheckpoint(opt.resume, recurrent, optimizer, scheduler)
        print("Resume training: {}".format(opt.resume))

    # Set criterion
    criterion = nn.CrossEntropyLoss().to(DEVICE)

    # Set dataloader
    transform = transforms.ToTensor()

    trainlabel = os.path.join(opt.train, "labels", "train")
    trainfeature = os.path.join(opt.train, "feature", "train")
    vallabel = os.path.join(opt.val, "labels", "valid")
    valfeature = os.path.join(opt.val, "feature", "valid")

    train_set = dataset.FullLengthVideos(None, trainlabel, trainfeature,
                                         downsample=opt.train_downsample, transform=transform,
                                         summarize=opt.summarize, sampling=opt.sampling)
    train_loader = DataLoader(train_set, batch_size=opt.batch_size, shuffle=True,
                              collate_fn=utils.collate_fn_seq, num_workers=opt.threads)

    val_set = dataset.FullLengthVideos(None, vallabel, valfeature,
                                       downsample=opt.val_downsample, transform=transform,
                                       summarize=None, sampling=0)
    val_loader = DataLoader(val_set, batch_size=1, shuffle=False,
                            collate_fn=utils.collate_fn_seq, num_workers=opt.threads)

    val_set_2 = dataset.FullLengthVideos(None, vallabel, valfeature,
                                         downsample=opt.train_downsample, transform=transform,
                                         summarize=None, sampling=0)
    val_loader_2 = DataLoader(val_set_2, batch_size=1, shuffle=False,
                              collate_fn=utils.collate_fn_seq, num_workers=opt.threads)

    # Show the memory used by the neural network
    print("The neural network allocated GPU with {:.1f} MB".format(torch.cuda.memory_allocated() / 1024 / 1024))

    # -----------------
    # Train the models
    # -----------------
    trainloss, trainaccs, valloss, valaccs = [], [], [], []
    epochs = []

    categories = [name.split('.')[0] for name in os.listdir(valfeature)]

    # Pre-test of the pretrained model
    acc, loss = val(recurrent, val_loader, 0, criterion)
    valloss.append(loss)
    valaccs.append(acc)
    epochs.append(0)

    for epoch in range(start_epoch, opt.epochs + 1):
        scheduler.step()

        # Save the train loss and train accuracy
        max_trainaccs = max(trainaccs) if len(trainaccs) > 0 else 0
        min_trainloss = min(trainloss) if len(trainloss) > 0 else 0
        recurrent, acc, loss = train(recurrent, train_loader, optimizer, epoch, criterion, max_trainaccs, min_trainloss)
        trainloss.append(loss)
        trainaccs.append(acc)

        # Validate the model with the validation downsample ratio ...
        acc, loss = val(recurrent, val_loader, epoch, criterion)
        valloss.append(loss)
        valaccs.append(acc)

        # ... and with the training downsample ratio
        acc, loss = val(recurrent, val_loader_2, epoch, criterion, visual=False)

        # Save the epochs
        epochs.append(epoch)

        for x, y in ((trainloss, "trainloss.txt"), (trainaccs, "trainaccs.txt"),
                     (valloss, "valloss.txt"), (valaccs, "valaccs.txt"), (epochs, "epochs.txt")):
            np.savetxt(os.path.join(opt.log, "problem_3", opt.tag, y), np.array(x))

        if epoch % opt.save_interval == 0:
            savepath = os.path.join(opt.checkpoints, "problem_3", opt.tag, str(epoch) + '.pth')
            utils.saveCheckpoint(savepath, recurrent, optimizer, scheduler, epoch)

        # Draw the accuracy / loss curve
        draw_graphs(trainloss, valloss, trainaccs, valaccs, epochs, label=categories)

    return recurrent

def trainingLoop(trainDataset, valDataset, batchSize, samplingMode, cpcModel,
                 cpcCriterion, nEpoch, optimizer, scheduler, pathCheckpoint,
                 logs, useGPU, log2Board, experiment):
    print(f"Running {nEpoch} epochs")

    startEpoch = len(logs["epoch"])
    bestAcc = 0
    bestStateDict = None
    startTime = time.time()
    epoch = 0
    totalSteps = 0

    try:
        for epoch in range(startEpoch, nEpoch):
            print(f"Starting epoch {epoch}")

            trainLoader = trainDataset.getDataLoader(batchSize, samplingMode, True, numWorkers=0)
            valLoader = valDataset.getDataLoader(batchSize, samplingMode, False, numWorkers=0)
            print("Training dataset %d batches, Validation dataset %d batches, batch size %d"
                  % (len(trainLoader), len(valLoader), batchSize))

            locLogsTrain = trainStep(trainLoader, cpcModel, cpcCriterion, optimizer,
                                     scheduler, logs["loggingStep"], useGPU, log2Board,
                                     totalSteps, experiment)
            totalSteps += locLogsTrain['iter']

            locLogsVal = valStep(valLoader, cpcModel, cpcCriterion, useGPU, log2Board)

            print(f'Ran {epoch + 1} epochs '
                  f'in {time.time() - startTime:.2f} seconds')

            if useGPU:
                torch.cuda.empty_cache()

            currentAccuracy = float(locLogsVal["locAcc_val"].mean())

            if log2Board:
                for t in range(len(locLogsVal["locLoss_val"])):
                    experiment.log_metric(f"Losses/epoch/locLoss_train_{t}", locLogsTrain["locLoss_train"][t], step=epoch)
                    experiment.log_metric(f"Accuracy/epoch/locAcc_train_{t}", locLogsTrain["locAcc_train"][t], step=epoch)
                    experiment.log_metric(f"Losses/epoch/locLoss_val_{t}", locLogsVal["locLoss_val"][t], step=epoch)
                    experiment.log_metric(f"Accuracy/epoch/locAcc_val_{t}", locLogsVal["locAcc_val"][t], step=epoch)

                if log2Board > 1:
                    experiment.log_confusion_matrix(
                        locLogsTrain["targets"],
                        locLogsTrain["predictions"],
                        epoch=epoch,
                        title=f"Confusion matrix train set, Step #{epoch}",
                        file_name=f"confusion-matrix-train-{epoch}.json",
                    )
                    experiment.log_confusion_matrix(
                        locLogsVal["targets"],
                        locLogsVal["predictions"],
                        epoch=epoch,
                        title=f"Confusion matrix validation set, Step #{epoch}",
                        file_name=f"confusion-matrix-val-{epoch}.json",
                    )

            # Keep the state dict of the best-performing model so far
            if currentAccuracy > bestAcc:
                bestAcc = currentAccuracy
                bestStateDict = cpcModel.state_dict()

            for key, value in dict(locLogsTrain, **locLogsVal).items():
                if key not in logs:
                    logs[key] = [None for _ in range(epoch)]
                if isinstance(value, np.ndarray):
                    value = value.tolist()
                logs[key].append(value)

            logs["epoch"].append(epoch)

            if pathCheckpoint is not None and (epoch % logs["saveStep"] == 0 or epoch == nEpoch - 1):
                modelStateDict = cpcModel.state_dict()
                criterionStateDict = cpcCriterion.state_dict()
                saveCheckpoint(modelStateDict, criterionStateDict, optimizer.state_dict(),
                               bestStateDict, f"{pathCheckpoint}_{epoch}.pt")
                saveLogs(logs, pathCheckpoint + "_logs.json")

    except KeyboardInterrupt:
        if pathCheckpoint is not None:
            modelStateDict = cpcModel.state_dict()
            criterionStateDict = cpcCriterion.state_dict()
            saveCheckpoint(modelStateDict, criterionStateDict, optimizer.state_dict(),
                           bestStateDict, f"{pathCheckpoint}_{epoch}_interrupted.pt")
            saveLogs(logs, pathCheckpoint + "_logs.json")

    return

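# --------------------------------------------------------------------------
# saveCheckpoint / saveLogs called in trainingLoop() are project helpers that
# are not shown in this file. Minimal sketches compatible with those call
# sites, assuming a plain torch.save bundle and a JSON dump (the dictionary
# key names below are assumptions), could look like this:
# --------------------------------------------------------------------------
def save_checkpoint_cpc_sketch(modelState, criterionState, optimizerState, bestState, path):
    import torch
    torch.save({'model': modelState,
                'criterion': criterionState,
                'optimizer': optimizerState,
                'best': bestState}, path)


def save_logs_sketch(logs, path):
    import json
    with open(path, 'w') as f:
        json.dump(logs, f, indent=2)
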
def train(model, criterion, optimizer, scheduler, train_loader, val_loader,
          start_epochs, epochs, device, grid_num=7, lr=0.001, log_interval=10,
          save_name="Yolov1"):
    model.train()

    epochs_list = []
    train_loss_list = []
    val_loss_list = []
    val_mean_aps = []

    for epoch in range(start_epochs + 1, epochs + 1):
        model.train()

        if scheduler:
            scheduler.step()

        iteration = 0
        train_loss = 0

        # Train and backpropagation
        for batch_idx, (data, target, _) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            if iteration % log_interval == 0:
                print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

            iteration += 1

        train_loss /= iteration
        val_loss = test_loss(model, criterion, val_loader, device)

        epochs_list.append(epoch)
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)

        print("*** Train set - Average loss: {:.4f}".format(train_loss))
        print("*** Test set - Average loss: {:.4f}".format(val_loss))

        if epoch > 0:
            mean_ap = test_map(model, criterion, val_loader, device, grid_num=14)
            val_mean_aps.append(mean_ap)

            # Keep a checkpoint whenever the validation mAP reaches a new maximum
            if mean_ap >= max(val_mean_aps):
                utils.saveCheckpoint(save_name + "-{}.pth".format(epoch), model, optimizer, scheduler, epoch)
        else:
            val_mean_aps.append(0)

        plt.clf()
        plt.plot(epochs_list, train_loss_list, label="Training loss")
        plt.plot(epochs_list, val_loss_list, label="Validation loss")
        plt.legend(loc=0)
        plt.title("Loss vs Epochs")
        plt.savefig("loss.png")

        plt.clf()
        plt.plot(epochs_list, val_mean_aps, label="mAP")
        plt.legend(loc=0)
        plt.title("mAP vs Epochs")
        plt.savefig("mAP.png")

    return model

def main():
    # Load the dataset
    srDs = dm.SRDB291()

    # Load the network
    # net = nt.VDSR
    net = nt.VDSR()

    if not os.path.exists(conf.MODEL_PATH):
        os.mkdir(conf.MODEL_PATH)
    if not os.path.exists(conf.LOG_PATH):
        os.mkdir(conf.LOG_PATH)

    sumWriter = tbx.SummaryWriter(log_dir=conf.LOG_PATH)

    net_stat, epoch, iterr, globalStep = utils.loadLatestCheckpoint()
    if net_stat is None:
        print('No previous model found, start training now')
    else:
        net.load_state_dict(net_stat)
        print('The latest version is Epoch %d, iter is %d, GlobalStep: %d' % (epoch, iterr, globalStep))
        # globalStep += 1

    # Loss criterion
    criterion = nn.MSELoss()

    # Use GPU
    if conf.GPU_FLAG:
        if not torch.cuda.is_available():
            raise Exception('No GPU found or wrong gpu id. Switch the GPU_FLAG to False')
        print("Training powered by GPU, NO.{}".format(conf.GPUS))
        torch.cuda.manual_seed(conf.SEED)
        net.cuda()
        criterion.cuda()

    # Get the dataset's partner ready: dataloader
    dataloader = torch.utils.data.DataLoader(srDs, batch_size=conf.BATCH_SIZE,
                                             shuffle=True, num_workers=conf.NUM_WORKERS)

    # Optimizer with per-layer learning rates: full LR for weights, 0.1x LR for
    # biases. The parameter groups are built in a loop instead of spelling out
    # every layer by hand; the grouping is identical to the original explicit list.
    if conf.WEIGHT_DECAY == 0:
        layers = [getattr(net, 'conv{}'.format(i)) for i in range(1, 21)]
        param_groups = []
        for layer in layers:
            param_groups.append({'params': layer.weight, 'lr': 1 * conf.LR})
            param_groups.append({'params': layer.bias, 'lr': 0.1 * conf.LR})
        optimizer = optim.Adam(param_groups, lr=conf.LR)
    else:
        # optimizer = optim.Adam(net.parameters(), lr=conf.LR, weight_decay=conf.WEIGHT_DECAY)
        layers = [net.conv1]
        layers += [getattr(net.convRelu2.op, 'conv_{}'.format(i)) for i in range(18)]
        layers += [net.conv3]
        param_groups = []
        for layer in layers:
            param_groups.append({'params': layer.weight, 'lr': 1 * conf.LR})
            param_groups.append({'params': layer.bias, 'lr': 0.1 * conf.LR})
        optimizer = optim.Adam(param_groups, lr=conf.LR, weight_decay=conf.WEIGHT_DECAY)

    net.train()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=math.sqrt(0.1))

    # Numerical statistics for the training console
    AvgFreq = 0
    Avgloss = 0

    # Fast-forward the scheduler so the current lr matches the epochs already trained
    for epoch_pos in range(epoch):
        scheduler.step()

    ## TRAINING EPOCHS
    for epoch_pos in range(epoch, conf.MAX_Epoch):
        print('----------------------- Epoch %d ----------------------------' % (epoch_pos))
        scheduler.step()

        for iterNum, (y, x) in enumerate(dataloader, iterr + 1):
            globalStep += 1
            if conf.GPU_FLAG:
                x = x.cuda()
                y = y.cuda()

            startTime = time.time()

            optimizer.zero_grad()
            pred = net(x)
            loss = criterion(pred, y - x)  # VDSR learns the residual y - x
            loss.backward()
            utils.gradientClip(net.parameters(), conf.GRADIENT_CLIP_THETA / utils.get_lr(optimizer))
            optimizer.step()

            endTime = time.time()

            lossData = loss.cpu().detach().numpy()
            AvgFreq += endTime - startTime
            Avgloss += lossData

            if loss < 0.01 and iterNum % conf.SUM_INTERVAL == 0 and iterNum != 0:
                sumWriter.add_scalar('Data/loss', loss, globalStep)
                if iterNum % (2 * conf.SUM_INTERVAL) == 0:
                    psnr = utils.testPSNR(net, conf.TEST_SET_PATH, 2)
                    sumWriter.add_scalar('Data/psnr_Set14_2', psnr, globalStep)
            else:
                sumWriter.add_scalar('Data/loss',
                                     torch.Tensor(np.array(conf.SUMMARY_SCALAR_FIX)),
                                     globalStep)
                sumWriter.add_scalar('Data/loss', loss, globalStep)

            if iterNum % conf.PRINT_INTERVAL == 0 and iterNum != 0:
                AvgFreq = (conf.PRINT_INTERVAL * conf.BATCH_SIZE) / AvgFreq
                Avgloss = Avgloss / conf.PRINT_INTERVAL
                format_str = ('%s: Iters %d, average loss(255) = %.7f, GI:%d, lr:%.7f, '
                              'g:%.6f, average frequency = %.3f(HZ) (batch per sec)')

                # Average absolute gradient over all parameters
                g = 0
                c = 0
                for param in net.parameters():
                    g += math.fabs(param.grad.data.sum())
                    c += 1.0
                g = g / c

                if iterNum % conf.SAVE_INTERVAL == 0:
                    print(format_str % (datetime.now(), iterNum, math.sqrt(Avgloss) * 255,
                                        globalStep, utils.get_lr(optimizer), g, AvgFreq), end='')
                else:
                    print(format_str % (datetime.now(), iterNum, math.sqrt(Avgloss) * 255,
                                        globalStep, utils.get_lr(optimizer), g, AvgFreq))

                AvgFreq = 0
                Avgloss = 0

            if iterNum % conf.SUM_INTERVAL == 0 and iterNum != 0:
                sumWriter.add_image('Data/input', x, globalStep)
                sumWriter.add_image('Data/Label', y, globalStep)
                sumWriter.add_image('Data/result', pred + x, globalStep)
                sumWriter.add_image('Data/pred', pred, globalStep)
                sumWriter.add_image('Data/residualToLearn', y - x, globalStep)
                sumWriter.add_image('Data/Delta', torch.abs(pred - (y - x)), globalStep)

                # For better visualization, pick one sample from the batch
                nc = np.random.randint(0, conf.BATCH_SIZE - 1)
                xnc = x[nc, :, :, :]
                ync = y[nc, :, :, :]
                prnc = pred[nc, :, :, :]
                sumWriter.add_image('Vis/input', xnc, globalStep)
                sumWriter.add_image('Vis/Label', ync, globalStep)
                sumWriter.add_image('Vis/result', prnc + xnc, globalStep)
                sumWriter.add_image('Vis/pred', prnc, globalStep)
                sumWriter.add_image('Vis/residualToLearn', ync - xnc, globalStep)

            if iterNum % conf.SAVE_INTERVAL == 0 and iterNum != 0:
                utils.saveCheckpoint(net.state_dict(), epoch_pos, iterNum, globalStep)
                print('...... SAVED')

        # Reset the resumed-iteration offset so the next epoch starts at iteration 0
        iterr = -1

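# --------------------------------------------------------------------------
# utils.testPSNR above is a project helper that is not shown in this file.
# For reference, a generic PSNR computation follows the standard formula
# PSNR = 10 * log10(MAX^2 / MSE); the sketch below is an assumption of how
# such a metric could be computed for one prediction (images scaled to [0, 1]).
# --------------------------------------------------------------------------
def psnr_sketch(pred, target, max_val=1.0):
    import torch
    mse = torch.mean((pred - target) ** 2)
    if mse == 0:
        return float('inf')
    return (10.0 * torch.log10(max_val ** 2 / mse)).item()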