def experiment(exp_name, device, eval_range='all', plot=True):
    """Evaluate a saved model on the training and validation loaders.

    Loads the configuration for ``exp_name``, restores the checkpoint located
    by ``get_model_name(config)`` and runs ``eval_batch`` over both loaders.
    The AP of each split and the validation timing metrics are printed.

    Args:
        exp_name: Experiment name handed to ``load_config``.
        device: Device the checkpoint is mapped to and evaluation runs on.
        eval_range: Forwarded unchanged to ``eval_batch``.
        plot: When true (default) a precision/recall curve is drawn for each
            split through ``plot_pr_curve``; when false only the metrics are
            printed. Previously this flag was accepted but ignored.
    """
    config, _, _, _ = load_config(exp_name)
    net, loss_fn = build_model(config, device, train=False)

    state_dict = torch.load(get_model_name(config), map_location=device)
    # With mGPUs enabled the weights are loaded through ``net.module``.
    target = net.module if config['mGPUs'] else net
    target.load_state_dict(state_dict)

    train_loader, val_loader = get_data_loader(
        config['batch_size'],
        config['use_npy'],
        geometry=config['geometry'],
        frame_range=config['frame_range'])

    # Train set
    train_metrics, train_precisions, train_recalls, _ = eval_batch(
        config, net, loss_fn, train_loader, device, eval_range)
    print("Training mAP", train_metrics['AP'])
    if plot:
        fig_name = "PRCurve_train_" + config['name']
        legend = "AP={:.1%} @IOU=0.5".format(train_metrics['AP'])
        plot_pr_curve(train_precisions, train_recalls, legend, name=fig_name)

    # Validation set
    val_metrics, val_precisions, val_recalls, _ = eval_batch(
        config, net, loss_fn, val_loader, device, eval_range)
    print("Validation mAP", val_metrics['AP'])
    print("Net Fwd Pass Time on average {:.4f}s".format(
        val_metrics['Forward Pass Time']))
    print("Nms Time on average {:.4f}s".format(
        val_metrics['Postprocess Time']))
    if plot:
        fig_name = "PRCurve_val_" + config['name']
        legend = "AP={:.1%} @IOU=0.5".format(val_metrics['AP'])
        plot_pr_curve(val_precisions, val_recalls, legend, name=fig_name)
def train(config_name, device):
    """Train the model described by ``config_name`` and save checkpoints.

    Each epoch runs one pass over the training loader, computes a validation
    loss with ``validate_batch`` and prints both. A checkpoint of
    ``net.state_dict()`` is written on the last epoch and every
    ``config['save_every']`` epochs.

    Args:
        config_name: Name handed to ``load_config``.
        device: Device that inputs, labels and the restored checkpoint are
            moved to.
    """
    config, _, batch_size, max_epochs = load_config(config_name)
    train_data_loader, test_data_loader = get_data_loader(
        batch_size=batch_size,
        use_npy=config['use_npy'],
        frame_range=config['frame_range'])
    net, criterion, optimizer, scheduler = build_model(
        config, device, train=True)

    if config['resume_training']:
        saved_ckpt_path = get_model_name(config['old_ckpt_name'])
        net.load_state_dict(torch.load(saved_ckpt_path, map_location=device))
        print("Successfully loaded trained ckpt at {}".format(saved_ckpt_path))
    net.train()

    start_time = time.time()
    for epoch in range(max_epochs):
        train_loss = 0
        num_samples = 0

        # NOTE(review): the scheduler is stepped at the start of the epoch,
        # as in the original. PyTorch >= 1.1 expects scheduler.step() after
        # optimizer.step(); confirm the targeted version before moving it.
        scheduler.step()
        # Read the rate actually installed in the optimizer rather than
        # scheduler.get_lr(), which is deprecated for this purpose and can
        # report a different value on newer PyTorch releases.
        current_lr = [group['lr'] for group in scheduler.optimizer.param_groups]
        print("Learning Rate for Epoch {} is {} ".format(
            epoch + 1, current_lr))

        for batch, label_map in train_data_loader:
            batch = batch.to(device)
            label_map = label_map.to(device)
            optimizer.zero_grad()

            # Forward / backward / update
            predictions = net(batch)
            loss = criterion(predictions, label_map)
            loss.backward()
            optimizer.step()

            train_loss += float(loss)
            num_samples += label_map.shape[0]

        # Rescale the summed per-batch loss by batch_size / samples seen.
        train_loss = train_loss * batch_size / num_samples
        val_loss = validate_batch(net, criterion, batch_size,
                                  test_data_loader, device)
        print("Epoch {}|Time {:.3f}|Training Loss: {}|Validation Loss: {}".
              format(epoch + 1, time.time() - start_time, train_loss, val_loss))

        if (epoch + 1) == max_epochs or (epoch + 1) % config['save_every'] == 0:
            model_path = get_model_name(
                config['name'] + '__epoch{}'.format(epoch + 1))
            torch.save(net.state_dict(), model_path)
            print("Checkpoint saved at {}".format(model_path))

    print('Finished Training')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Total time elapsed: {:.2f} seconds".format(elapsed_time))
def experiment(config_name, device, image_id=25):
    """Run the network on one dataset sample and visualise the detections.

    Restores the checkpoint for ``config['name']``, runs a forward pass on a
    single sample, keeps the boxes whose classification score exceeds
    ``config['cls_threshold']``, applies non-max suppression and plots the
    ground truth, the predictions and the classification map.

    Args:
        config_name: Name handed to ``load_config``.
        device: Device the model and the input sample are placed on.
        image_id: Index of the dataset sample to inspect. Defaults to 25,
            the value that used to be hard-coded.
    """
    config, _, _, _ = load_config(config_name)
    net, criterion = build_model(config, device, train=False)
    net.load_state_dict(
        torch.load(get_model_name(config['name']), map_location=device))
    net.set_decode(True)
    loader, _ = get_data_loader(batch_size=1,
                                use_npy=config['use_npy'],
                                frame_range=config['frame_range'])
    net.eval()

    threshold = config['cls_threshold']

    with torch.no_grad():
        input_tensor, _ = loader.dataset[image_id]
        input_tensor = input_tensor.to(device)
        _, label_list = loader.dataset.get_label(image_id)

        # Forward pass
        t_start = time.time()
        pred = net(input_tensor.unsqueeze(0)).squeeze_(0)
        print("Forward pass time", time.time() - t_start)

        # Channel 0 is the classification score; keep cells above threshold.
        cls_pred = pred[..., 0]
        activation = cls_pred > threshold

        num_boxes = int(activation.sum())
        if num_boxes == 0:
            print("No bounding box found")
            return

        # Channels 1..8 hold the (x, y) corner values of each box. Boolean
        # indexing selects the same cells, in the same order, as calling
        # masked_select channel by channel. Results are moved to the CPU
        # before .numpy(), which would fail for tensors on a CUDA device.
        corners = pred[..., 1:9][activation].float().cpu()
        corners = corners.reshape(-1, 4, 2).numpy()
        scores = cls_pred[activation].cpu().numpy()

        # NMS
        t_start = time.time()
        selected_ids = non_max_suppression(corners, scores,
                                           config['nms_iou_threshold'])
        corners = corners[selected_ids]
        scores = scores[selected_ids]
        print("Non max suppression time:", time.time() - t_start)

        # Visualization
        input_np = input_tensor.cpu().numpy()
        plot_bev(input_np, label_list, window_name='GT')
        plot_bev(input_np, corners, window_name='Prediction')
        plot_label_map(cls_pred.cpu().numpy())
def quant_experiment(exp_name, device, epoch):
    """Evaluate a (possibly quantized) checkpoint on the validation loader.

    The checkpoint for ``epoch`` is loaded; when it carries a
    ``'weight_scale'`` entry the weights are re-quantized with those scales,
    and when it carries an ``'act_scale'`` entry the validation pass goes
    through ``quant_evaluation`` instead of ``evaluation``. The validation
    metrics are printed.

    Args:
        exp_name: Name handed to ``load_config``.
        device: Device used for loading the checkpoint and for evaluation.
        epoch: Checkpoint selector forwarded to ``get_model_path``.

    Raises:
        AssertionError: If ``get_model_path`` reports that no model exists.
    """
    config = load_config(exp_name)
    config['augmentation'] = False
    num_bits = config['num_bits']
    net, loss_fn = build_model(config, device, train=False)

    model_path, exist_best_model = get_model_path(config, epoch)
    assert exist_best_model, "There is no model"
    checkpoint = torch.load(model_path, map_location=device)
    weights = checkpoint['model_state_dict']

    if 'weight_scale' in checkpoint.keys():
        stored_scales = checkpoint['weight_scale']
        print("weight scale loaded")
        weights, _ = weights_quant_with_scale(weights, stored_scales, num_bits)
        print("weight scale completed")

    use_act_scale = 'act_scale' in checkpoint.keys()
    if use_act_scale:
        act_scale_list = checkpoint['act_scale']
        print("act scale loaded")

    # With mGPUs enabled the weights go through ``net.module``.
    receiver = net.module if config['mGPUs'] else net
    receiver.load_state_dict(weights)

    # Only the validation loader is evaluated; the training-set report is
    # disabled.
    _, val_loader = get_data_loader(config)

    if use_act_scale:
        val_metrics = quant_evaluation(net, loss_fn, val_loader, device,
                                       act_scale_list, num_bits)
    else:
        val_metrics = evaluation(config, net, loss_fn, val_loader, device)

    print("------Validation Result------")
    for label, key in (("Prec@1 : ", 'top1'),
                       ("Prec@5 : ", 'top5'),
                       ("Forward Pass Time: ", 'Forward Pass Time'),
                       ("loss : ", 'loss')):
        print(label, val_metrics[key])
def experiment(exp_name, device, epoch):
    """Evaluate a checkpoint on the validation loader and print the metrics.

    The checkpoint may be either a dict with a ``'model_state_dict'`` entry
    or a bare state dict. In the latter case the file is rewritten afterwards
    as a dict that also records the measured ``'top1'`` value.

    Args:
        exp_name: Name handed to ``load_config``.
        device: Device used for loading the checkpoint and for evaluation.
        epoch: Checkpoint selector forwarded to ``get_model_path``.

    Raises:
        AssertionError: If ``get_model_path`` reports that no model exists.
    """
    config = load_config(exp_name)
    config['augmentation'] = False
    net, loss_fn = build_model(config, device, train=False)

    model_path, exist_best_model = get_model_path(config, epoch)
    assert exist_best_model, "There is no model"
    checkpoint = torch.load(model_path, map_location=device)

    has_wrapper = 'model_state_dict' in checkpoint.keys()
    weights = checkpoint['model_state_dict'] if has_wrapper else checkpoint

    # With mGPUs enabled the weights go through ``net.module``.
    receiver = net.module if config['mGPUs'] else net
    receiver.load_state_dict(weights)

    # Only the validation loader is evaluated; the training-set report is
    # disabled.
    _, val_loader = get_data_loader(config)
    val_metrics = evaluation(config, net, loss_fn, val_loader, device)

    print("------Validation Result------")
    for label, key in (("Prec@1 : ", 'top1'),
                       ("Prec@5 : ", 'top5'),
                       ("Forward Pass Time: ", 'Forward Pass Time'),
                       ("loss : ", 'loss')):
        print(label, val_metrics[key])

    if has_wrapper:
        return

    # Upgrade a bare state dict in place so later runs find the top-1 score.
    upgraded = {'model_state_dict': weights, 'top1': val_metrics['top1']}
    torch.save(upgraded, model_path)
    print("model saved at ", model_path)
def test(exp_name, device, image_id):
    """Evaluate the saved model on a single training sample and print stats.

    Runs ``eval_one`` (with plotting enabled) on sample ``image_id`` of the
    training loader and prints loss, precision, recall and timing.

    Args:
        exp_name: Name handed to ``load_config``.
        device: Device the checkpoint is mapped to and evaluation runs on.
        image_id: Sample index forwarded to ``eval_one``.
    """
    config, _, _, _ = load_config(exp_name)
    net, loss_fn = build_model(config, device, train=False)

    # The sibling experiment/train routines load and save weights through
    # ``net.module`` when mGPUs is set; do the same here so that checkpoints
    # written by training can be restored in that configuration too.
    model = net.module if config['mGPUs'] else net
    model.load_state_dict(
        torch.load(get_model_name(config), map_location=device))
    model.set_decode(True)

    train_loader, _ = get_data_loader(
        1,
        config['use_npy'],
        geometry=config['geometry'],
        frame_range=config['frame_range'])
    net.eval()

    with torch.no_grad():
        (num_gt, num_pred, scores, pred_image, pred_match, loss, t_forward,
         t_nms) = eval_one(net, loss_fn, config, train_loader, image_id,
                           device, plot=True)

    # Predictions whose match index is not -1 count as true positives.
    TP = (pred_match != -1).sum()
    # Report NaN rather than dividing by zero when there are no predictions
    # or no ground-truth boxes.
    precision = TP / num_pred if num_pred else float('nan')
    recall = TP / num_gt if num_gt else float('nan')

    print("Loss: {:.4f}".format(loss))
    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    print("forward pass time {:.3f}s".format(t_forward))
    print("nms time {:.3f}s".format(t_nms))
def train(exp_name, device):
    """Train the model for ``exp_name`` with logging and checkpoints.

    Averaged classification and localisation losses are logged every
    ``config['log_every']`` steps, the mean epoch loss is logged and printed
    after every epoch, and a checkpoint is written on the last epoch and
    every ``config['save_every']`` epochs.

    Args:
        exp_name: Name handed to ``load_config``.
        device: Device that batches and restored checkpoints are moved to.
    """
    # Hyperparameters
    config, learning_rate, batch_size, max_epochs = load_config(exp_name)

    # Data
    train_data_loader, test_data_loader = get_data_loader(
        batch_size,
        config['use_npy'],
        geometry=config['geometry'],
        frame_range=config['frame_range'])

    # Model
    net, loss_fn, optimizer, scheduler = build_model(config, device, train=True)

    # Loggers (the validation logger is created but validation is disabled).
    train_logger = get_logger(config, 'train')
    val_logger = get_logger(config, 'val')

    def bare_model():
        # With mGPUs enabled the underlying model is reached via ``.module``.
        return net.module if config['mGPUs'] else net

    st_epoch = 0
    if config['resume_training']:
        saved_ckpt_path = get_model_name(config)
        bare_model().load_state_dict(
            torch.load(saved_ckpt_path, map_location=device))
        print("Successfully loaded trained ckpt at {}".format(saved_ckpt_path))
        st_epoch = config['resume_from']

    batches_per_epoch = len(train_data_loader)
    step = 1 + st_epoch * batches_per_epoch
    cls_running = 0
    loc_running = 0

    for epoch in range(st_epoch, max_epochs):
        epoch_no = epoch + 1
        start_time = time.time()
        epoch_loss = 0

        net.train()
        bare_model().set_decode(False)
        scheduler.step()

        for batch, label_map, _ in train_data_loader:
            batch = batch.to(device)
            label_map = label_map.to(device)
            optimizer.zero_grad()

            # Forward, backward, update
            predictions = net(batch)
            loss, cls, loc = loss_fn(predictions, label_map)
            loss.backward()
            optimizer.step()

            cls_running += cls
            loc_running += loc
            epoch_loss += loss.item()

            if step % config['log_every'] == 0:
                # Log the mean over the window, then start a new window.
                cls_running = cls_running / config['log_every']
                loc_running = loc_running / config['log_every']
                train_logger.scalar_summary('cls_loss', cls_running, step)
                train_logger.scalar_summary('loc_loss', loc_running, step)
                cls_running = 0
                loc_running = 0
            step += 1

        # Mean training loss of the epoch
        epoch_loss = epoch_loss / len(train_data_loader)
        train_logger.scalar_summary('loss', epoch_loss, epoch_no)
        print("Epoch {}|Time {:.3f}|Training Loss: {:.5f}".format(
            epoch_no, time.time() - start_time, epoch_loss))

        # Checkpoint on the final epoch and on every save_every-th epoch.
        if epoch_no == max_epochs or epoch_no % config['save_every'] == 0:
            model_path = get_model_name(config, epoch_no)
            torch.save(bare_model().state_dict(), model_path)
            print("Checkpoint saved at {}".format(model_path))

    print('Finished Training')
def quant_train(exp_name, device, epoch):
    """Fine-tune a trained model while re-quantizing its weights each epoch.

    The checkpoint selected by ``epoch`` is loaded and its weights are
    quantized to 8 bits with ``weights_quant``. Every epoch then trains the
    network, validates it, re-quantizes the weights, validates again and
    saves the quantized model (with its weight scales) next to the source
    checkpoint whenever the post-quantization top-1 beats the best so far.
    The initial bar is 90% of the ``'top1'`` stored in the checkpoint.

    Args:
        exp_name: Name handed to ``load_config``.
        device: Device batches and the checkpoint are moved to.
        epoch: Checkpoint selector forwarded to ``get_model_path``.

    Raises:
        FileNotFoundError: If ``get_model_path`` reports no checkpoint.
    """
    num_bits = 8
    config = load_config(exp_name)
    config['resume_training'] = True
    config['resume_from'] = 0
    max_epochs = config['max_epochs']

    print("make data loader")
    train_data_loader, val_data_loader = get_data_loader(config)
    net, loss_fn, optimizer, scheduler = build_model(config, device, train=True)
    # Checkpoint state dicts are loaded and saved through ``net.module`` when
    # mGPUs is set; use one handle for every state-dict operation so the key
    # names always agree.
    model = net.module if config['mGPUs'] else net

    ckpt_path, exist_model = get_model_path(config, epoch)
    if not exist_model:
        raise FileNotFoundError(
            "No checkpoint to quantize at {}".format(ckpt_path))
    # Output path; assumes ckpt_path ends in a 4-character suffix like ".pth".
    quant_ckpt_path = ckpt_path[:-4] + '_quant.pth'

    # Load once, mapped onto the requested device.
    checkpoint = torch.load(ckpt_path, map_location=device)

    best_top1 = 0
    if 'top1' in checkpoint:
        best_top1 = checkpoint['top1']
        print("best top1 score is {:.3f}".format(best_top1))
    # A quantized model only has to reach 90% of the stored accuracy.
    best_top1 = best_top1 * 0.9

    if 'model_state_dict' in checkpoint:
        weights = checkpoint['model_state_dict']
    else:
        weights = checkpoint
    model.load_state_dict(weights)
    print("Successfully loaded trained ckpt at {}".format(ckpt_path))

    for group in optimizer.param_groups:
        group['lr'] = config['learning_rate']

    # Start training from quantized weights.
    quant_weights, weight_scale_list = weights_quant(weights, num_bits=num_bits)
    model.load_state_dict(quant_weights)

    # (std, mean, offset) per channel used to remap inputs for the tf models.
    tf_channel_remap = (
        (0.229, 0.485, 123.68),
        (0.224, 0.456, 116.78),
        (0.225, 0.406, 103.94),
    )
    remap_inputs = config['model'] in ('tf', 'tf_fused')

    for epoch_idx in range(max_epochs):
        start_time = time.time()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        net.train()
        print("Epoch {}, learning rate : {}".format(
            epoch_idx + 1, scheduler.optimizer.param_groups[0]['lr']))

        # train
        start = time.time()
        for images, target in tqdm(train_data_loader):
            # measure data loading time
            data_time.update(time.time() - start)

            input_var = images.to(device)
            target_var = target.to(device)
            if remap_inputs:
                for ch, (std, mean, offset) in enumerate(tf_channel_remap):
                    input_var[:, ch, :, :] = (
                        input_var[:, ch, :, :] * std + mean) * 255 - offset

            # compute output
            output = net(input_var)
            if output.shape[1] == 1001:
                # 1001-way outputs: shift the labels by one.
                target_var += 1
            loss = loss_fn(output, target_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target_var, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(prec1[0], images.size(0))
            top5.update(prec5[0], images.size(0))

            # compute gradient and take an optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - start)
            start = time.time()

        scheduler.step()
        print("Epoch {}|Time {:.3f}|Training Loss: {:.5f}".format(
            epoch_idx + 1, time.time() - start_time, losses.avg))
        print(
            "\033[32m training || Prec@1: {:.3f} | Prec@5: {:.3f} \033[37m".
            format(top1.avg, top5.avg))

        # validation before quantization
        tic_val = time.time()
        val_metrics = evaluation(config, net, loss_fn, val_data_loader, device)
        print("Epoch {}|Time {:.3f}|Validation Loss: {:.5f}".format(
            epoch_idx + 1, time.time() - tic_val, val_metrics['loss']))
        print(
            "\033[32m validation || Prec@1: {:.3f} | Prec@5: {:.3f} \033[37m".
            format(val_metrics['top1'], val_metrics['top5']))

        # Re-quantize the freshly trained weights (same step every epoch).
        weights = model.state_dict()
        quant_weights, weight_scale_list = weights_quant(
            weights, num_bits=num_bits)
        model.load_state_dict(quant_weights)

        # validation after quantization
        tic_val = time.time()
        val_metrics = evaluation(config, net, loss_fn, val_data_loader, device)
        print("Epoch {}|Time {:.3f}|Validation Loss: {:.5f}".format(
            epoch_idx + 1, time.time() - tic_val, val_metrics['loss']))
        print(
            "\033[32m validation || Prec@1: {:.3f} | Prec@5: {:.3f} \033[37m".
            format(val_metrics['top1'], val_metrics['top5']))

        # save best model
        if val_metrics['top1'] > best_top1:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    'top1': val_metrics['top1'],
                    'weight_scale': weight_scale_list
                }, quant_ckpt_path)
            print("\033[32m Best model saved at {}. Prec@1 is {} \033[37m".
                  format(quant_ckpt_path, val_metrics['top1']))
            best_top1 = val_metrics['top1']
def train(exp_name, device, epoch):
    """Train the classifier for ``exp_name`` and keep the best checkpoint.

    After every epoch the model is validated with ``evaluation``; whenever
    the validation top-1 exceeds the best value seen so far (seeded from an
    existing "best" checkpoint when there is one) the model is saved to the
    best-checkpoint path together with that score.

    Args:
        exp_name: Name handed to ``load_config``.
        device: Device batches and checkpoints are moved to.
        epoch: Unused by this routine; kept for interface compatibility.

    Raises:
        FileNotFoundError: If resuming is requested but ``get_model_path``
            reports that the checkpoint does not exist.
    """
    config = load_config(exp_name)
    max_epochs = config['max_epochs']

    print("make data loader")
    train_data_loader, val_data_loader = get_data_loader(config)
    net, loss_fn, optimizer, scheduler = build_model(config, device, train=True)
    # With mGPUs enabled state dicts are exchanged through ``net.module``.
    model = net.module if config['mGPUs'] else net

    best_ckpt_path, exist_best_model = get_model_path(config, "best")
    best_top1 = 0
    if exist_best_model:
        # map_location keeps this working when the checkpoint was written on
        # a device that is not available here.
        best_checkpoint = torch.load(best_ckpt_path, map_location=device)
        best_top1 = best_checkpoint['top1']
        print("best top1 score is {:.3f}".format(best_top1))

    if config['resume_training']:
        # get_model_path returns (path, exists); the original handed the
        # whole tuple to torch.load.
        # NOTE(review): the literal "epoch" selector is kept as written;
        # confirm it is not meant to be the ``epoch`` argument.
        saved_ckpt_path, exist_saved_model = get_model_path(config, "epoch")
        if not exist_saved_model:
            raise FileNotFoundError(
                "No checkpoint to resume from at {}".format(saved_ckpt_path))
        checkpoint = torch.load(saved_ckpt_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        print("Successfully loaded trained ckpt at {}".format(saved_ckpt_path))
        st_epoch = config['resume_from']
    else:
        st_epoch = 0

    for group in optimizer.param_groups:
        group['lr'] = config['learning_rate']

    for epoch_idx in range(st_epoch, max_epochs):
        start_time = time.time()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        net.train()
        print("Epoch {}, learning rate : {}".format(
            epoch_idx + 1, scheduler.optimizer.param_groups[0]['lr']))

        # train
        start = time.time()
        for images, target in tqdm(train_data_loader):
            # measure data loading time
            data_time.update(time.time() - start)

            input_var = images.to(device)
            target_var = target.to(device)

            # compute output
            output = net(input_var)
            loss = loss_fn(output, target_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target_var, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(prec1[0], images.size(0))
            top5.update(prec5[0], images.size(0))

            # compute gradient and take an optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - start)
            start = time.time()

        scheduler.step()
        print("Epoch {}|Time {:.3f}|Training Loss: {:.5f}".format(
            epoch_idx + 1, time.time() - start_time, losses.avg))
        print(
            "\033[32m training || Prec@1: {:.3f} | Prec@5: {:.3f} \033[37m".
            format(top1.avg, top5.avg))

        # validation
        tic_val = time.time()
        val_metrics = evaluation(config, net, loss_fn, val_data_loader, device)
        print("Epoch {}|Time {:.3f}|Validation Loss: {:.5f}".format(
            epoch_idx + 1, time.time() - tic_val, val_metrics['loss']))
        print(
            "\033[32m validation || Prec@1: {:.3f} | Prec@5: {:.3f} \033[37m".
            format(val_metrics['top1'], val_metrics['top5']))

        # save best model
        if val_metrics['top1'] > best_top1:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    'top1': val_metrics['top1']
                }, best_ckpt_path)
            print("\033[32m Best model saved at {}. Prec@1 is {} \033[37m".
                  format(best_ckpt_path, val_metrics['top1']))
            best_top1 = val_metrics['top1']