def main():
    # Get args from command line
    args = get_args_from_command_line()

    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights

    # Print config
    print('Use config:')
    pprint(cfg)

    # Set GPU to use
    os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CONST.DEVICE

    # Start train/test process
    if not args.test and not args.inference:
        train_net(cfg)
    else:
        if 'WEIGHTS' not in cfg.CONST or not os.path.exists(cfg.CONST.WEIGHTS):
            logging.error('Please specify the file path of checkpoint.')
            sys.exit(2)

        if args.test:
            test_net(cfg)
        else:
            inference_net(cfg)
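# ---------------------------------------------------------------------------
# Hedged sketch: get_args_from_command_line() is not shown in this file. A
# minimal argparse-based version consistent with the flags used in main()
# above could look like this; the flag names and defaults are assumptions,
# not the project's actual parser.
# ---------------------------------------------------------------------------
import argparse

def get_args_from_command_line():
    parser = argparse.ArgumentParser(description='Runner for train/test/inference')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use', default=None, type=str)
    parser.add_argument('--weights', dest='weights', help='Path of the checkpoint file', default=None, type=str)
    parser.add_argument('--test', dest='test', help='Run testing', action='store_true')
    parser.add_argument('--inference', dest='inference', help='Run inference', action='store_true')
    return parser.parse_args()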
def main():
    # Get args from command line
    args = get_args_from_command_line()

    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if not args.randomize:
        np.random.seed(cfg.CONST.RNG_SEED)
    if args.batch_size is not None:
        cfg.CONST.BATCH_SIZE = args.batch_size
    if args.iter is not None:
        cfg.TRAIN.NUM_ITERATION = args.iter
    if args.out_path is not None:
        cfg.DIR.OUT_PATH = args.out_path
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights
        cfg.TRAIN.RESUME_TRAIN = True
        cfg.TRAIN.INITIAL_ITERATION = int(args.init_iter)

    # Print config
    print('Use config:')
    pprint(cfg)

    # Set GPU to use
    theano.gpuarray.use(cfg.CONST.DEVICE)

    # Start train/test process
    if not args.test:
        train_net(cfg)
    else:
        test_net(cfg)
def main():
    # Get args from command line
    args = get_args_from_command_line()

    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights

    # Print config
    print('Use config:')
    pprint(cfg)
    # f_runner.write(str(cfg))

    # Set GPU to use
    os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CONST.DEVICE

    # Start train/test process
    if not args.test and not args.inference:
        train_net(cfg)
    else:
        if 'WEIGHTS' not in cfg.CONST:
            logging.error('Please specify the file path of checkpoint.')
            sys.exit(2)
        if not os.path.exists(cfg.CONST.WEIGHTS):
            logging.error('The checkpoint file %s does not exist.' % cfg.CONST.WEIGHTS)
            sys.exit(2)

        if args.test:
            path = '/raid/wuruihai/GRNet_FILES/tb_log'
            test_writer = SummaryWriter(path)
            test_net(cfg, test_writer=test_writer)
        elif args.test_KITTI:
            path = '/raid/wuruihai/GRNet_FILES/tb_log'
            test_writer = SummaryWriter(path)
            test_net_KITTI(cfg, test_writer=test_writer)
        else:
            inference_net(cfg)
def main():
    # Get args from command line
    args = get_args_from_command_line()

    # Read the experimental config
    exec(compile(open(args.cfg_file, "rb").read(), args.cfg_file, 'exec'))
    cfg = locals()['__C']
    pprint(cfg)

    # Parse runtime arguments
    if args.gpu_id is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    if not args.randomize:
        random.seed(cfg.CONST.RNG_SEED)
        np.random.seed(cfg.CONST.RNG_SEED)
        torch.manual_seed(cfg.CONST.RNG_SEED)
        torch.cuda.manual_seed(cfg.CONST.RNG_SEED)
        torch.cuda.manual_seed_all(cfg.CONST.RNG_SEED)
        # References: https://pytorch.org/docs/stable/notes/randomness.html
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    if args.exp_name is not None:
        cfg.CONST.EXP_NAME = args.exp_name
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights

    # Start train/test process
    if not args.test and not args.inference:
        # Make sure cfg.TRAIN.NETWORK in ['RMNet', 'TinyFlowNet']
        if cfg.TRAIN.NETWORK not in ['RMNet', 'TinyFlowNet']:
            logging.error('Please make sure cfg.TRAIN.NETWORK in ["RMNet", "TinyFlowNet"].')
            sys.exit(1)

        train_net(cfg)
    else:
        if 'WEIGHTS' not in cfg.CONST or not os.path.exists(cfg.CONST.WEIGHTS):
            logging.error('Please specify the file path of checkpoint.')
            sys.exit(2)

        if args.test:
            test_net(cfg)
        else:
            inference_net(cfg)
def main():
    # Get args from command line
    args = get_args_from_command_line()

    if args.gpu_id is not None:
        cfg.CONST.DEVICE = args.gpu_id
    if not args.randomize:
        np.random.seed(cfg.CONST.RNG_SEED)
    if args.batch_size is not None:
        cfg.CONST.BATCH_SIZE = args.batch_size
    if args.epoch is not None:
        cfg.TRAIN.NUM_EPOCHES = args.epoch
    if args.dataset is not None:
        cfg.DATASET.DATASET_NAME = args.dataset
        if cfg.DATASET.DATASET_NAME not in cfg.DATASETS:
            cfg.DATASET.CENTER_BIAS = cfg.DATASETS.TEST.CENTER_BIAS
        else:
            cfg.DATASET.CENTER_BIAS = cfg.DATASETS.SALICON.CENTER_BIAS
    if args.out_path is not None:
        cfg.DIR.OUT_PATH = args.out_path
    if args.weights is not None:
        cfg.CONST.WEIGHTS = args.weights
        if not args.test:
            cfg.TRAIN.RESUME_TRAIN = True

    # Print config
    print('Use config:')
    pprint(cfg)

    # Set GPU to use
    if type(cfg.CONST.DEVICE) == str:
        os.environ["CUDA_VISIBLE_DEVICES"] = cfg.CONST.DEVICE

    # Start train/test process
    if not args.test:
        train_net(cfg)
    else:
        if 'WEIGHTS' in cfg.CONST and os.path.exists(cfg.CONST.WEIGHTS):
            test_net(cfg)
        else:
            print('[FATAL] %s Please specify the file path of checkpoint.' % (dt.now()))
            sys.exit(2)
def train_net(cfg):
    # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use
    torch.backends.cudnn.benchmark = True

    # Set up data augmentation
    IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
    CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
    train_transforms = utils.data_transforms.Compose([
        utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TRAIN.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS, cfg.TRAIN.CONTRAST, cfg.TRAIN.SATURATION),
        utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.RandomFlip(),
        utils.data_transforms.RandomPermuteRGB(),
        utils.data_transforms.ToTensor(),
    ])
    val_transforms = utils.data_transforms.Compose([
        utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.ToTensor(),
    ])

    # Set up data loader
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TRAIN_DATASET](cfg)
    val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TEST_DATASET](cfg)
    train_data_loader = torch.utils.data.DataLoader(
        dataset=train_dataset_loader.get_dataset(utils.data_loaders.DatasetType.TRAIN,
                                                 cfg.CONST.N_VIEWS_RENDERING, train_transforms),
        batch_size=cfg.CONST.BATCH_SIZE,
        num_workers=cfg.TRAIN.NUM_WORKER,
        pin_memory=True,
        shuffle=True,
        drop_last=True)
    val_data_loader = torch.utils.data.DataLoader(
        dataset=val_dataset_loader.get_dataset(utils.data_loaders.DatasetType.VAL,
                                               cfg.CONST.N_VIEWS_RENDERING, val_transforms),
        batch_size=1,
        num_workers=1,
        pin_memory=True,
        shuffle=False)

    # Set up networks
    encoder = Encoder(cfg)
    decoder = Decoder(cfg)
    refiner = Refiner(cfg)
    merger = Merger(cfg)
    print('[DEBUG] %s Parameters in Encoder: %d.' % (dt.now(), utils.network_utils.count_parameters(encoder)))
    print('[DEBUG] %s Parameters in Decoder: %d.' % (dt.now(), utils.network_utils.count_parameters(decoder)))
    print('[DEBUG] %s Parameters in Refiner: %d.' % (dt.now(), utils.network_utils.count_parameters(refiner)))
    print('[DEBUG] %s Parameters in Merger: %d.' % (dt.now(), utils.network_utils.count_parameters(merger)))

    # Initialize weights of networks
    encoder.apply(utils.network_utils.init_weights)
    decoder.apply(utils.network_utils.init_weights)
    refiner.apply(utils.network_utils.init_weights)
    merger.apply(utils.network_utils.init_weights)

    # Set up solver
    if cfg.TRAIN.POLICY == 'adam':
        encoder_solver = torch.optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()),
                                          lr=cfg.TRAIN.ENCODER_LEARNING_RATE, betas=cfg.TRAIN.BETAS)
        decoder_solver = torch.optim.Adam(decoder.parameters(),
                                          lr=cfg.TRAIN.DECODER_LEARNING_RATE, betas=cfg.TRAIN.BETAS)
        refiner_solver = torch.optim.Adam(refiner.parameters(),
                                          lr=cfg.TRAIN.REFINER_LEARNING_RATE, betas=cfg.TRAIN.BETAS)
        merger_solver = torch.optim.Adam(merger.parameters(),
                                         lr=cfg.TRAIN.MERGER_LEARNING_RATE, betas=cfg.TRAIN.BETAS)
    elif cfg.TRAIN.POLICY == 'sgd':
        encoder_solver = torch.optim.SGD(filter(lambda p: p.requires_grad, encoder.parameters()),
                                         lr=cfg.TRAIN.ENCODER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM)
        decoder_solver = torch.optim.SGD(decoder.parameters(),
                                         lr=cfg.TRAIN.DECODER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM)
        refiner_solver = torch.optim.SGD(refiner.parameters(),
                                         lr=cfg.TRAIN.REFINER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM)
        merger_solver = torch.optim.SGD(merger.parameters(),
                                        lr=cfg.TRAIN.MERGER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM)
    else:
        raise Exception('[FATAL] %s Unknown optimizer %s.' % (dt.now(), cfg.TRAIN.POLICY))

    # Set up learning rate scheduler to decay learning rates dynamically
    encoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(encoder_solver,
                                                                milestones=cfg.TRAIN.ENCODER_LR_MILESTONES,
                                                                gamma=cfg.TRAIN.GAMMA)
    decoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(decoder_solver,
                                                                milestones=cfg.TRAIN.DECODER_LR_MILESTONES,
                                                                gamma=cfg.TRAIN.GAMMA)
    refiner_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(refiner_solver,
                                                                milestones=cfg.TRAIN.REFINER_LR_MILESTONES,
                                                                gamma=cfg.TRAIN.GAMMA)
    merger_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(merger_solver,
                                                               milestones=cfg.TRAIN.MERGER_LR_MILESTONES,
                                                               gamma=cfg.TRAIN.GAMMA)

    if torch.cuda.is_available():
        encoder = torch.nn.DataParallel(encoder).cuda()
        decoder = torch.nn.DataParallel(decoder).cuda()
        refiner = torch.nn.DataParallel(refiner).cuda()
        merger = torch.nn.DataParallel(merger).cuda()

    # Set up loss functions
    bce_loss = torch.nn.BCELoss()

    # Load pretrained model if exists
    init_epoch = 0
    best_iou = -1
    best_epoch = -1
    if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN:
        print('[INFO] %s Recovering from %s ...' % (dt.now(), cfg.CONST.WEIGHTS))
        checkpoint = torch.load(cfg.CONST.WEIGHTS)
        init_epoch = checkpoint['epoch_idx']
        best_iou = checkpoint['best_iou']
        best_epoch = checkpoint['best_epoch']

        encoder.load_state_dict(checkpoint['encoder_state_dict'])
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        if cfg.NETWORK.USE_REFINER:
            refiner.load_state_dict(checkpoint['refiner_state_dict'])
        if cfg.NETWORK.USE_MERGER:
            merger.load_state_dict(checkpoint['merger_state_dict'])

        print('[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' %
              (dt.now(), init_epoch, best_iou, best_epoch))

    # Summary writer for TensorBoard
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat())
    log_dir = output_dir % 'logs'
    ckpt_dir = output_dir % 'checkpoints'
    train_writer = SummaryWriter(os.path.join(log_dir, 'train'))
    val_writer = SummaryWriter(os.path.join(log_dir, 'test'))

    # Training loop
    for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
        # Tick / tock
        epoch_start_time = time()

        # Batch average metrics
        batch_time = utils.network_utils.AverageMeter()
        data_time = utils.network_utils.AverageMeter()
        encoder_losses = utils.network_utils.AverageMeter()
        refiner_losses = utils.network_utils.AverageMeter()

        # Adjust learning rate
        encoder_lr_scheduler.step()
        decoder_lr_scheduler.step()
        refiner_lr_scheduler.step()
        merger_lr_scheduler.step()

        # Switch models to training mode
        encoder.train()
        decoder.train()
        merger.train()
        refiner.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)
        for batch_idx, (taxonomy_names, sample_names, rendering_images,
                        ground_truth_volumes) in enumerate(train_data_loader):
            # Measure data time
            data_time.update(time() - batch_end_time)

            # Get data from data loader
            rendering_images = utils.network_utils.var_or_cuda(rendering_images)
            ground_truth_volumes = utils.network_utils.var_or_cuda(ground_truth_volumes)

            # Train the encoder, decoder, refiner, and merger
            image_features = encoder(rendering_images)
            raw_features, generated_volumes = decoder(image_features)

            if cfg.NETWORK.USE_MERGER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_MERGER:
                generated_volumes = merger(raw_features, generated_volumes)
            else:
                generated_volumes = torch.mean(generated_volumes, dim=1)
            encoder_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                generated_volumes = refiner(generated_volumes)
                refiner_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10
            else:
                refiner_loss = encoder_loss

            # Gradient descent
            encoder.zero_grad()
            decoder.zero_grad()
            refiner.zero_grad()
            merger.zero_grad()

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                encoder_loss.backward(retain_graph=True)
                refiner_loss.backward()
            else:
                encoder_loss.backward()

            encoder_solver.step()
            decoder_solver.step()
            refiner_solver.step()
            merger_solver.step()

            # Append loss to average metrics
            encoder_losses.update(encoder_loss.item())
            refiner_losses.update(refiner_loss.item())
            # Append loss to TensorBoard
            n_itr = epoch_idx * n_batches + batch_idx
            train_writer.add_scalar('EncoderDecoder/BatchLoss', encoder_loss.item(), n_itr)
            train_writer.add_scalar('Refiner/BatchLoss', refiner_loss.item(), n_itr)

            # Tick / tock
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            print('[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f RLoss = %.4f' %
                  (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, batch_idx + 1, n_batches,
                   batch_time.val, data_time.val, encoder_loss.item(), refiner_loss.item()))

        # Append epoch loss to TensorBoard
        train_writer.add_scalar('EncoderDecoder/EpochLoss', encoder_losses.avg, epoch_idx + 1)
        train_writer.add_scalar('Refiner/EpochLoss', refiner_losses.avg, epoch_idx + 1)

        # Tick / tock
        epoch_end_time = time()
        print('[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f RLoss = %.4f' %
              (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, epoch_end_time - epoch_start_time,
               encoder_losses.avg, refiner_losses.avg))

        # Update rendering views
        if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
            n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
            train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
            print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' %
                  (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering))

        # Validate the training models
        iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader, val_writer,
                       encoder, decoder, refiner, merger)

        # Save weights to file
        if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            utils.network_utils.save_checkpoints(cfg,
                                                 os.path.join(ckpt_dir, 'ckpt-epoch-%04d.pth' % (epoch_idx + 1)),
                                                 epoch_idx + 1, encoder, encoder_solver, decoder, decoder_solver,
                                                 refiner, refiner_solver, merger, merger_solver, best_iou, best_epoch)
        if iou > best_iou:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)

            best_iou = iou
            best_epoch = epoch_idx + 1
            utils.network_utils.save_checkpoints(cfg,
                                                 os.path.join(ckpt_dir, 'best-ckpt.pth'),
                                                 epoch_idx + 1, encoder, encoder_solver, decoder, decoder_solver,
                                                 refiner, refiner_solver, merger, merger_solver, best_iou, best_epoch)

    # Close SummaryWriter for TensorBoard
    train_writer.close()
    val_writer.close()
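# ---------------------------------------------------------------------------
# Hedged sketch: utils.network_utils.AverageMeter is not shown here. A minimal
# implementation consistent with the .update()/.val/.avg usage in the training
# loop above could look like this; the project's real helper may differ.
# ---------------------------------------------------------------------------
class AverageMeter(object):
    """Tracks the most recent value and the running average of a metric."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # Record the latest value and update the running average over n samples
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count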
    return args


if __name__ == '__main__':
    args = parse_args()

    print('Called with args:')
    print(args)

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    if args.exp_dir is not None:
        cfg.EXP_DIR = args.exp_dir
    cfg.GPU_ID = args.gpu_id

    print('Using config:')
    pprint.pprint(cfg)

    caffe.set_mode_gpu()
    caffe.set_device(args.gpu_id)

    output_dir_name = 'test'
    if args.datasets:
        output_dir_name += '_' + '_'.join(args.datasets)
    output_dir_name += '_' + datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
    output_dir = get_output_dir(output_dir_name, None)
    test_net(args.caffemodel, output_dir, args.datasets)
def train_net(cfg):
    # Set up data augmentation
    IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
    CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
    train_transforms = utils.data_transforms.Compose([
        utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TRAIN.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS, cfg.TRAIN.CONTRAST, cfg.TRAIN.SATURATION),
        utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.RandomFlip(),
        utils.data_transforms.RandomPermuteRGB(),
        utils.data_transforms.ToTensor(),
    ])
    val_transforms = utils.data_transforms.Compose([
        utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.ToTensor(),
    ])

    # Set up data loader
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TRAIN_DATASET](cfg)
    val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TEST_DATASET](cfg)
    train_data_loader = paddle.io.DataLoader(
        dataset=train_dataset_loader.get_dataset(utils.data_loaders.DatasetType.TRAIN,
                                                 cfg.CONST.N_VIEWS_RENDERING, train_transforms),
        batch_size=cfg.CONST.BATCH_SIZE,
        # num_workers=0,  # cfg.TRAIN.NUM_WORKER > 0 raises an error because /dev/shm/ is too small,
        #                 # see https://blog.csdn.net/ctypyb2002/article/details/107914643
        # pin_memory=True,
        use_shared_memory=False,
        shuffle=True,
        drop_last=True)
    val_data_loader = paddle.io.DataLoader(
        dataset=val_dataset_loader.get_dataset(utils.data_loaders.DatasetType.VAL,
                                               cfg.CONST.N_VIEWS_RENDERING, val_transforms),
        batch_size=1,
        # num_workers=1,
        # pin_memory=True,
        shuffle=False)

    # Set up networks
    # paddle.Model: prepare / fit / save
    encoder = Encoder(cfg)
    decoder = Decoder(cfg)
    merger = Merger(cfg)
    refiner = Refiner(cfg)
    print('[DEBUG] %s Parameters in Encoder: %d.' % (dt.now(), utils.network_utils.count_parameters(encoder)))
    print('[DEBUG] %s Parameters in Decoder: %d.' % (dt.now(), utils.network_utils.count_parameters(decoder)))
    print('[DEBUG] %s Parameters in Merger: %d.' % (dt.now(), utils.network_utils.count_parameters(merger)))
    print('[DEBUG] %s Parameters in Refiner: %d.' % (dt.now(), utils.network_utils.count_parameters(refiner)))

    # Initialize weights of networks
    # Parameter initialization works differently in Paddle; see the Paddle API docs.
    # encoder.apply(utils.network_utils.init_weights)
    # decoder.apply(utils.network_utils.init_weights)
    # merger.apply(utils.network_utils.init_weights)

    # Set up learning rate scheduler to decay learning rates dynamically
    encoder_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=cfg.TRAIN.ENCODER_LEARNING_RATE,
                                                              milestones=cfg.TRAIN.ENCODER_LR_MILESTONES,
                                                              gamma=cfg.TRAIN.GAMMA, verbose=True)
    decoder_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=cfg.TRAIN.DECODER_LEARNING_RATE,
                                                              milestones=cfg.TRAIN.DECODER_LR_MILESTONES,
                                                              gamma=cfg.TRAIN.GAMMA, verbose=True)
    merger_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=cfg.TRAIN.MERGER_LEARNING_RATE,
                                                             milestones=cfg.TRAIN.MERGER_LR_MILESTONES,
                                                             gamma=cfg.TRAIN.GAMMA, verbose=True)
    refiner_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=cfg.TRAIN.REFINER_LEARNING_RATE,
                                                              milestones=cfg.TRAIN.REFINER_LR_MILESTONES,
                                                              gamma=cfg.TRAIN.GAMMA, verbose=True)

    # Set up solver
    # if cfg.TRAIN.POLICY == 'adam':
    encoder_solver = paddle.optimizer.Adam(learning_rate=encoder_lr_scheduler, parameters=encoder.parameters())
    decoder_solver = paddle.optimizer.Adam(learning_rate=decoder_lr_scheduler, parameters=decoder.parameters())
    merger_solver = paddle.optimizer.Adam(learning_rate=merger_lr_scheduler, parameters=merger.parameters())
    refiner_solver = paddle.optimizer.Adam(learning_rate=refiner_lr_scheduler, parameters=refiner.parameters())

    # if torch.cuda.is_available():
    #     encoder = torch.nn.DataParallel(encoder).cuda()
    #     decoder = torch.nn.DataParallel(decoder).cuda()
    #     merger = torch.nn.DataParallel(merger).cuda()

    # Set up loss functions
    bce_loss = paddle.nn.BCELoss()

    # Load pretrained model if exists
    init_epoch = 0
    best_iou = -1
    best_epoch = -1
    if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN:
        print('[INFO] %s Recovering from %s ...' % (dt.now(), cfg.CONST.WEIGHTS))
        encoder_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "encoder.pdparams"))
        encoder_solver_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "encoder_solver.pdopt"))
        encoder.set_state_dict(encoder_state_dict)
        encoder_solver.set_state_dict(encoder_solver_state_dict)

        decoder_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "decoder.pdparams"))
        decoder_solver_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "decoder_solver.pdopt"))
        decoder.set_state_dict(decoder_state_dict)
        decoder_solver.set_state_dict(decoder_solver_state_dict)

        if cfg.NETWORK.USE_MERGER:
            merger_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "merger.pdparams"))
            merger_solver_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "merger_solver.pdopt"))
            merger.set_state_dict(merger_state_dict)
            merger_solver.set_state_dict(merger_solver_state_dict)
        if cfg.NETWORK.USE_REFINER:
            refiner_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "refiner.pdparams"))
            refiner_solver_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "refiner_solver.pdopt"))
            refiner.set_state_dict(refiner_state_dict)
            refiner_solver.set_state_dict(refiner_solver_state_dict)

        print('[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' %
              (dt.now(), init_epoch, best_iou, best_epoch))

    # Summary writer for TensorBoard
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat())
    log_dir = output_dir % 'logs'
    ckpt_dir = output_dir % 'checkpoints'
    # train_writer = SummaryWriter()
    # val_writer = SummaryWriter(os.path.join(log_dir, 'test'))
    train_writer = LogWriter(os.path.join(log_dir, 'train'))
    val_writer = LogWriter(os.path.join(log_dir, 'val'))

    # Training loop
    for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
        # Tick / tock
        epoch_start_time = time()

        # Batch average metrics
        batch_time = utils.network_utils.AverageMeter()
        data_time = utils.network_utils.AverageMeter()
        encoder_losses = utils.network_utils.AverageMeter()
        refiner_losses = utils.network_utils.AverageMeter()

        # Switch models to training mode
        encoder.train()
        decoder.train()
        merger.train()
        refiner.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)
        for batch_idx, (rendering_images, ground_truth_volumes) in enumerate(train_data_loader()):
            # Measure data time
            data_time.update(time() - batch_end_time)

            # Get data from data loader
            rendering_images = utils.network_utils.var_or_cuda(rendering_images)
            ground_truth_volumes = utils.network_utils.var_or_cuda(ground_truth_volumes)

            # Train the encoder, decoder, and merger
            image_features = encoder(rendering_images)
            raw_features, generated_volumes = decoder(image_features)

            if cfg.NETWORK.USE_MERGER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_MERGER:
                generated_volumes = merger(raw_features, generated_volumes)
            # else:
            #     generated_volumes = paddle.mean(generated_volumes, axis=1)
            encoder_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                generated_volumes = refiner(generated_volumes)
                refiner_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10
            # else:
            #     refiner_loss = encoder_loss

            # Gradient descent
            encoder_solver.clear_grad()
            decoder_solver.clear_grad()
            merger_solver.clear_grad()
            refiner_solver.clear_grad()

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                encoder_loss.backward(retain_graph=True)
                refiner_loss.backward()
            # else:
            #     encoder_loss.backward()

            encoder_solver.step()
            decoder_solver.step()
            merger_solver.step()
            refiner_solver.step()

            # Append loss to average metrics
            encoder_losses.update(encoder_loss.numpy())
            refiner_losses.update(refiner_loss.numpy())
            # Append loss to TensorBoard
            n_itr = epoch_idx * n_batches + batch_idx
            train_writer.add_scalar(tag='EncoderDecoder/BatchLoss', step=n_itr, value=encoder_loss.numpy())
            train_writer.add_scalar('Refiner/BatchLoss', value=refiner_loss.numpy(), step=n_itr)

            # Tick / tock
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            if (batch_idx % int(cfg.CONST.INFO_BATCH)) == 0:
                print('[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f RLoss = %.4f' %
                      (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, batch_idx + 1, n_batches,
                       batch_time.val, data_time.val, encoder_loss.numpy(), refiner_loss.numpy()))

        # Append epoch loss to TensorBoard
        train_writer.add_scalar(tag='EncoderDecoder/EpochLoss', step=epoch_idx + 1, value=encoder_losses.avg)
        train_writer.add_scalar('Refiner/EpochLoss', value=refiner_losses.avg, step=epoch_idx + 1)

        # Update schedulers once per epoch
        encoder_lr_scheduler.step()
        decoder_lr_scheduler.step()
        merger_lr_scheduler.step()
        refiner_lr_scheduler.step()

        # Tick / tock
        epoch_end_time = time()
        print('[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f RLoss = %.4f' %
              (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, epoch_end_time - epoch_start_time,
               encoder_losses.avg, refiner_losses.avg))

        # Update rendering views
        if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
            n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
            train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
            print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' %
                  (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering))

        # Validate the training models
        iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader, val_writer,
                       encoder, decoder, merger, refiner)

        # Save weights to file
        if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)
            utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'ckpt-epoch-%04d' % (epoch_idx + 1)),
                                                 epoch_idx + 1, encoder, encoder_solver, decoder, decoder_solver,
                                                 merger, merger_solver, refiner, refiner_solver, best_iou, best_epoch)
        if iou > best_iou:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)
            best_iou = iou
            best_epoch = epoch_idx + 1
            utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'best-ckpt'),
                                                 epoch_idx + 1, encoder, encoder_solver, decoder, decoder_solver,
                                                 merger, merger_solver, refiner, refiner_solver, best_iou, best_epoch)
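# ---------------------------------------------------------------------------
# Hedged sketch (assumption, not the project's actual helper): the
# count_parameters() utility referenced in the Paddle train_net() above can be
# implemented as a sum over the trainable parameters of a paddle.nn.Layer.
# ---------------------------------------------------------------------------
import numpy as np

def count_parameters(network):
    # Sum the element counts of all trainable parameters (stop_gradient == False)
    return sum(np.prod(p.shape) for p in network.parameters() if not p.stop_gradient)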
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()

    print('Called with args:')
    print(args)

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    if args.exp_dir is not None:
        cfg.EXP_DIR = args.exp_dir
    cfg.GPU_ID = args.gpu_id

    print('Using config:')
    pprint.pprint(cfg)

    caffe.set_mode_gpu()
    caffe.set_device(args.gpu_id)

    output_dir_name = 'test'
    if args.datasets:
        output_dir_name += '_' + '_'.join(args.datasets)
    output_dir_name += '_' + datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
    output_dir = get_output_dir(output_dir_name, None)
    test_net(args.caffemodel, output_dir, args.datasets)
def train_net(cfg):
    # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use
    torch.backends.cudnn.benchmark = True

    # Set up data loader (ShapeNet)
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TRAIN_DATASET](cfg)
    test_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TEST_DATASET](cfg)
    # get_dataset() takes the dataset subset: TRAIN (0), TEST (1), or VAL (2)
    train_data_loader = torch.utils.data.DataLoader(
        dataset=train_dataset_loader.get_dataset(utils.data_loaders.DatasetSubset.TRAIN),
        batch_size=cfg.TRAIN.BATCH_SIZE,
        num_workers=cfg.CONST.NUM_WORKERS,
        collate_fn=utils.data_loaders.collate_fn,
        pin_memory=True,
        shuffle=True,
        drop_last=True)
    val_data_loader = torch.utils.data.DataLoader(
        dataset=test_dataset_loader.get_dataset(utils.data_loaders.DatasetSubset.VAL),
        batch_size=1,
        num_workers=cfg.CONST.NUM_WORKERS,
        collate_fn=utils.data_loaders.collate_fn,
        pin_memory=True,
        shuffle=False)

    # Set up folders for logs and checkpoints
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', datetime.now().isoformat())
    cfg.DIR.CHECKPOINTS = output_dir % 'checkpoints'
    cfg.DIR.LOGS = output_dir % 'logs'
    txt_dir = output_dir % 'txt'
    if not os.path.exists(txt_dir):
        os.makedirs(txt_dir)
    f_record = open(txt_dir + '/record.txt', 'w')
    if not os.path.exists(cfg.DIR.CHECKPOINTS):
        os.makedirs(cfg.DIR.CHECKPOINTS)

    # Create tensorboard writers
    train_writer = SummaryWriter(os.path.join(cfg.DIR.LOGS, 'train'))
    val_writer = SummaryWriter(os.path.join(cfg.DIR.LOGS, 'test'))

    # Create the networks
    grnet = GRNet(cfg)
    grnet.apply(utils.helpers.init_weights)
    logging.debug('Parameters in GRNet: %d.' % utils.helpers.count_parameters(grnet))

    # Move the network to GPU if possible
    if torch.cuda.is_available():
        grnet = torch.nn.DataParallel(grnet).cuda()

    # Create the optimizers
    grnet_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, grnet.parameters()),
                                       lr=cfg.TRAIN.LEARNING_RATE,
                                       weight_decay=cfg.TRAIN.WEIGHT_DECAY,
                                       betas=cfg.TRAIN.BETAS)
    grnet_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(grnet_optimizer,
                                                              milestones=cfg.TRAIN.LR_MILESTONES,
                                                              gamma=cfg.TRAIN.GAMMA)

    # Set up loss functions
    chamfer_dist = ChamferDistance()
    gridding_loss = GriddingLoss(    # lgtm [py/unused-local-variable]
        scales=cfg.NETWORK.GRIDDING_LOSS_SCALES,
        alphas=cfg.NETWORK.GRIDDING_LOSS_ALPHAS)

    # Load pretrained model if exists (resume training from a checkpoint)
    init_epoch = 0
    best_metrics = None
    if 'WEIGHTS' in cfg.CONST:
        logging.info('Recovering from %s ...' % (cfg.CONST.WEIGHTS))
        checkpoint = torch.load(cfg.CONST.WEIGHTS)
        best_metrics = Metrics(cfg.TEST.METRIC_NAME, checkpoint['best_metrics'])
        grnet.load_state_dict(checkpoint['grnet'])
        logging.info('Recover complete. Current epoch = #%d; best metrics = %s.' % (init_epoch, best_metrics))

    # Training/Testing the network
    first_epoch = True
    for epoch_idx in range(init_epoch + 1, cfg.TRAIN.N_EPOCHS + 1):
        epoch_start_time = time()

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter(['SparseLoss', 'DenseLoss'])
        # losses = AverageMeter(['GridLoss', 'DenseLoss'])

        grnet.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)
        for batch_idx, (taxonomy_ids, model_ids, data) in enumerate(train_data_loader):
            # print('batch_size: ', data['partial_cloud'].shape)
            data_time.update(time() - batch_end_time)
            for k, v in data.items():
                data[k] = utils.helpers.var_or_cuda(v)

            sparse_ptcloud, dense_ptcloud = grnet(data)
            sparse_loss = chamfer_dist(sparse_ptcloud, data['gtcloud'])
            # grid_loss = gridding_loss(dense_ptcloud, data['gtcloud'])
            dense_loss = chamfer_dist(dense_ptcloud, data['gtcloud'])
            _loss = sparse_loss + dense_loss
            losses.update([sparse_loss.item() * 1000, dense_loss.item() * 1000])
            # _loss = grid_loss + dense_loss
            # losses.update([grid_loss.item() * 1000, dense_loss.item() * 1000])

            grnet.zero_grad()
            _loss.backward()
            grnet_optimizer.step()

            n_itr = (epoch_idx - 1) * n_batches + batch_idx
            train_writer.add_scalar('Loss/Batch/Sparse', sparse_loss.item() * 1000, n_itr)
            # train_writer.add_scalar('Loss/Batch/Grid', grid_loss.item() * 1000, n_itr)
            train_writer.add_scalar('Loss/Batch/Dense', dense_loss.item() * 1000, n_itr)

            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            f_record.write('\n[Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) Losses = %s' %
                           (epoch_idx, cfg.TRAIN.N_EPOCHS, batch_idx + 1, n_batches, batch_time.val(),
                            data_time.val(), ['%.4f' % l for l in losses.val()]))
            logging.info('[Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) Losses = %s' %
                         (epoch_idx, cfg.TRAIN.N_EPOCHS, batch_idx + 1, n_batches, batch_time.val(),
                          data_time.val(), ['%.4f' % l for l in losses.val()]))

        grnet_lr_scheduler.step()

        epoch_end_time = time()
        train_writer.add_scalar('Loss/Epoch/Sparse', losses.avg(0), epoch_idx)
        # train_writer.add_scalar('Loss/Epoch/Grid', losses.avg(0), epoch_idx)
        train_writer.add_scalar('Loss/Epoch/Dense', losses.avg(1), epoch_idx)
        f_record.write('\n[Epoch %d/%d] EpochTime = %.3f (s) Losses = %s' %
                       (epoch_idx, cfg.TRAIN.N_EPOCHS, epoch_end_time - epoch_start_time,
                        ['%.4f' % l for l in losses.avg()]))
        logging.info('[Epoch %d/%d] EpochTime = %.3f (s) Losses = %s' %
                     (epoch_idx, cfg.TRAIN.N_EPOCHS, epoch_end_time - epoch_start_time,
                      ['%.4f' % l for l in losses.avg()]))

        # Validate the current model
        # if epoch_idx % cfg.TRAIN.SAVE_FREQ == 0:
        #     metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer, grnet)

        # Save checkpoints
        # if epoch_idx % cfg.TRAIN.SAVE_FREQ == 0 or metrics.better_than(best_metrics):
        if first_epoch:
            metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer, grnet)
            best_metrics = metrics
            first_epoch = False
        if epoch_idx % cfg.TRAIN.SAVE_FREQ == 0:
            metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer, grnet)
            file_name = 'best-ckpt.pth' if metrics.better_than(best_metrics) else 'epoch-%03d.pth' % (epoch_idx + 1)
            output_path = os.path.join(cfg.DIR.CHECKPOINTS, file_name)
            torch.save({
                'epoch_index': epoch_idx,
                'best_metrics': metrics.state_dict(),
                'grnet': grnet.state_dict()
            }, output_path)  # yapf: disable
            logging.info('Saved checkpoint to %s ...' % output_path)
            if metrics.better_than(best_metrics):
                best_metrics = metrics

    train_writer.close()
    val_writer.close()
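# ---------------------------------------------------------------------------
# Hedged sketch (assumption): a typical init_weights() callback passed to
# Module.apply(), as used by grnet.apply(utils.helpers.init_weights) above.
# The project's real initializer may use different schemes per layer type.
# ---------------------------------------------------------------------------
import torch

def init_weights(m):
    # Kaiming init for conv layers, constant init for batch norm, small normal for linear
    if isinstance(m, (torch.nn.Conv2d, torch.nn.Conv3d)):
        torch.nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, (torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)):
        torch.nn.init.constant_(m.weight, 1)
        torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, torch.nn.Linear):
        torch.nn.init.normal_(m.weight, 0, 0.01)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)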
# Written by Ting Pan
# --------------------------------------------------------

import dragon.vm.caffe as caffe
from datasets.factory import get_imdb
from core.test import test_net
from config import cfg
import time, os

cfg.DATA_DIR = '/home/workspace/datasets/UA-DETRAC'

imdb_name = 'detrac_2017_test'
gpu_id = 1
prototxt = 'models/detrac/AirNet/deploy.prototxt'
caffemodel = 'checkpoints/airnet_final.caffemodel'
vis = False

if __name__ == '__main__':
    while not os.path.exists(caffemodel):
        print('Waiting for {} to exist...'.format(caffemodel))
        time.sleep(10)

    caffe.set_mode_gpu()
    caffe.set_device(gpu_id)
    net = caffe.Net(prototxt, caffemodel, caffe.TEST)
    net.name = os.path.splitext(os.path.basename(caffemodel))[0]

    imdb = get_imdb(imdb_name)
    test_net(net, imdb, thresh=cfg.TEST.THRESH, vis=vis)
print("[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f RLoss = %.4f" %
      (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, time.time() - epoch_start_time,
       encoder_losses.avg, refiner_losses.avg))

# Update rendering views
if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
    # Use a random number of views to train the net
    n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
    train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
    print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' %
          (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering))

# Validate the training models
iou = test_net(cfg, epoch_idx + 1, None, val_data_loader, None, encoder, decoder, refiner, merger)

if epoch_idx == 0:
    best_iou = 0.0
    best_epoch = 1
if iou > best_iou:
    best_iou = iou
    best_epoch = epoch_idx + 1
if iou > best_iou * 0.85:
    # If the current IoU is larger than 85% of the best IoU, generate the 3D model
    # and save it in the folder generated_models_with_refiner.
    volume, _ = forward(encoder, decoder, merger, refiner,
                        val_dataset[0][1].expand_dims(axis=0).as_in_context(ctx))
    utils.binvox_visualization.get_volume_views(
        volume, "/home/hzx/my pix2vox model3/generated_models_with_refiner", epoch_idx)
print('Called with args:')
print(args)

if args.cfg_file is not None:
    cfg_from_file(args.cfg_file)
if args.set_cfgs is not None:
    cfg_from_list(args.set_cfgs)
cfg.GPU_ID = args.gpu_id

print('Using config:')
pprint.pprint(cfg)

while not os.path.exists(args.caffemodel) and args.wait:
    print('Waiting for {} to exist...'.format(args.caffemodel))
    time.sleep(10)

caffe.set_mode_gpu()
caffe.set_device(args.gpu_id)
net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST)
net.name = os.path.splitext(os.path.basename(args.caffemodel))[0]

print(args.imdb_name)
imdb = get_repo_imdb(args.imdb_name)
imdb.competition_mode(args.comp_mode)
if not cfg.TEST.OBJ_DET.HAS_RPN:
    imdb.set_proposal_method(cfg.TEST.PROPOSAL_METHOD)

test_net(net, imdb, max_per_image=args.max_per_image, vis=args.vis)
print(args.imdb_name)
imdb = get_repo_imdb(args.imdb_name)
imdb.competition_mode(args.comp_mode)
if not cfg.TEST.OBJ_DET.HAS_RPN and cfg.TASK == 'object_detection':
    imdb.set_proposal_method(cfg.TEST.PROPOSAL_METHOD)

al_net = None
if args.al_net is not None and args.al_def is not None:
    al_net = caffe.Net(args.al_def, caffe.TEST, weights=args.al_net)
    al_net.name = "al_" + os.path.splitext(os.path.basename(args.al_def))[0]

test_net(net, imdb, max_per_image=args.max_per_image, vis=args.vis, al_net=al_net)

'''
argparse.ArgumentParser
    Input:  (description='Test an Object Detection network')
    Output: parser

caffe.Net
    Input:  (args.prototxt, args.caffemodel, caffe.TEST)
    Output: net

os.path.splitext
    Input:  (os.path.basename(args.caffemodel))
    Output: net.name

get_repo_imdb
    Input:  (args.imdb_name)
    Output: imdb
'''
def train_net(cfg):
    # Set up data loader
    train_data_loader = torch.utils.data.DataLoader(
        dataset=utils.data_loaders.DatasetCollector.get_dataset(cfg, cfg.DATASET.TRAIN_DATASET,
                                                                utils.data_loaders.DatasetSubset.TRAIN),
        batch_size=cfg.TRAIN.BATCH_SIZE,
        num_workers=cfg.CONST.N_WORKERS,
        pin_memory=True,
        shuffle=True,
        drop_last=True)
    val_data_loader = torch.utils.data.DataLoader(
        dataset=utils.data_loaders.DatasetCollector.get_dataset(cfg, cfg.DATASET.TEST_DATASET,
                                                                utils.data_loaders.DatasetSubset.VAL),
        batch_size=1,
        num_workers=cfg.CONST.N_WORKERS,
        pin_memory=True,
        shuffle=False)

    # Set up networks
    tflownet = TinyFlowNet(cfg)
    rmnet = RMNet(cfg)
    tflownet.apply(utils.helpers.init_weights)
    rmnet.kv_memory.apply(utils.helpers.init_weights)
    rmnet.kv_query.apply(utils.helpers.init_weights)
    rmnet.decoder.apply(utils.helpers.init_weights)
    logging.info('Parameters in TinyFlowNet: %d.' % (utils.helpers.count_parameters(tflownet)))
    logging.info('Parameters in RMNet: %d.' % (utils.helpers.count_parameters(rmnet)))

    # Move the network to GPU if possible
    if torch.cuda.is_available():
        if torch.__version__ >= '1.2.0' and cfg.TRAIN.USE_BATCH_NORM:
            torch.distributed.init_process_group('nccl',
                                                 init_method='file:///tmp/rmnet-%s' % uuid.uuid4().hex,
                                                 world_size=1,
                                                 rank=0)
            tflownet = torch.nn.SyncBatchNorm.convert_sync_batchnorm(tflownet)
            rmnet = torch.nn.SyncBatchNorm.convert_sync_batchnorm(rmnet)

        tflownet = torch.nn.DataParallel(tflownet).cuda()
        rmnet = torch.nn.DataParallel(rmnet).cuda()

    # Create the optimizers
    network = rmnet if cfg.TRAIN.NETWORK == 'RMNet' else tflownet
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, network.parameters()),
                                 lr=cfg.TRAIN.LEARNING_RATE,
                                 weight_decay=cfg.TRAIN.WEIGHT_DECAY,
                                 betas=cfg.TRAIN.BETAS)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, cfg.TRAIN.N_EPOCHS)

    # Set up loss functions
    l1_loss = torch.nn.L1Loss()
    nll_loss = torch.nn.NLLLoss(ignore_index=cfg.CONST.IGNORE_IDX)
    lovasz_loss = LovaszLoss(ignore_index=cfg.CONST.IGNORE_IDX)

    # Load the pretrained model if exists
    init_epoch = 0
    best_metrics = None
    METRICS_THRESHOLD = Metrics(cfg.TEST.MAIN_METRIC_NAME,
                                [cfg.TRAIN.CKPT_SAVE_THRESHOLD for i in range(len(Metrics.names()))])
    if 'WEIGHTS' in cfg.CONST:
        logging.info('Recovering from %s ...' % (cfg.CONST.WEIGHTS))
        checkpoint = torch.load(cfg.CONST.WEIGHTS)
        best_metrics = Metrics(cfg.TEST.MAIN_METRIC_NAME, checkpoint['best_metrics'])
        tflownet.load_state_dict(checkpoint['tflownet'])
        rmnet.load_state_dict(checkpoint['rmnet'])
        logging.info('Recover completed. Current epoch = #%d; best metrics = %s.' % (init_epoch, best_metrics))

    # Set up folders for logs, snapshot and checkpoints
    output_dir = os.path.join(cfg.DIR.OUTPUT_DIR, '%s', cfg.CONST.EXP_NAME)
    cfg.DIR.CHECKPOINTS = output_dir % 'checkpoints'
    cfg.DIR.LOGS = output_dir % 'logs'
    if not os.path.exists(cfg.DIR.CHECKPOINTS):
        os.makedirs(cfg.DIR.CHECKPOINTS)

    # Create tensorboard writers
    train_writer = SummaryWriter(cfg, 'train')
    val_writer = SummaryWriter(cfg, 'test')

    # Backup current code snapshot
    cfg.DIR.SNAPSHOTS = os.path.join(cfg.DIR.OUTPUT_DIR, 'snapshots')
    if not os.path.exists(cfg.DIR.SNAPSHOTS):
        os.makedirs(cfg.DIR.SNAPSHOTS)

    with zipfile.ZipFile(os.path.join(cfg.DIR.SNAPSHOTS, '%s.zip' % cfg.CONST.EXP_NAME), 'w') as zf:
        root_dir = os.getcwd()
        for dirname, subdirs, files in os.walk(root_dir):
            if os.path.normpath(dirname).find(os.path.normpath(cfg.DIR.OUTPUT_DIR)) != -1:
                continue

            _dirname = os.path.relpath(dirname, root_dir)
            zf.write(_dirname)
            for filename in files:
                zf.write(os.path.join(_dirname, filename))

    # Training/Testing the network
    n_batches = len(train_data_loader)
    last_epoch_idx_keep_frame_steps = -cfg.TRAIN.N_EPOCHS
    for epoch_idx in range(init_epoch + 1, cfg.TRAIN.N_EPOCHS + 1):
        epoch_start_time = time()

        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()

        if cfg.TRAIN.USE_BATCH_NORM:
            tflownet.train()
            rmnet.train()
        else:
            tflownet.eval()
            rmnet.eval()

        # Update frame step
        if cfg.TRAIN.USE_RANDOM_FRAME_STEPS:
            if epoch_idx >= cfg.TRAIN.EPOCH_INDEX_FIXING_FRAME_STEPS and \
               epoch_idx <= last_epoch_idx_keep_frame_steps + cfg.TRAIN.N_EPOCHS_KEEP_FRAME_STEPS:
                # Keep the frame step == 1 when JF Mean exceeds a threshold for several epochs
                max_frame_steps = 1
            else:
                max_frame_steps = random.randint(1, min(cfg.TRAIN.MAX_FRAME_STEPS, epoch_idx // 5 + 2))

            train_data_loader.dataset.set_frame_step(random.randint(1, max_frame_steps))
            logging.info('[Epoch %d/%d] Set frame step to %d' %
                         (epoch_idx, cfg.TRAIN.N_EPOCHS, train_data_loader.dataset.frame_step))

        batch_end_time = time()
        for batch_idx, (video_name, n_objects, frames, masks, optical_flows) in enumerate(train_data_loader):
            n_itr = (epoch_idx - 1) * n_batches + batch_idx
            data_time.update(time() - batch_end_time)

            try:
                frames = utils.helpers.var_or_cuda(frames)
                masks = utils.helpers.var_or_cuda(masks)
                optical_flows = utils.helpers.var_or_cuda(optical_flows)

                est_flows = tflownet(frames)
                est_flows = utils.helpers.var_or_cuda(est_flows)
                est_probs = rmnet(frames, masks, optical_flows, n_objects, cfg.TRAIN.MEMORIZE_EVERY)
                est_probs = utils.helpers.var_or_cuda(est_probs[:, 1:]).permute(0, 2, 1, 3, 4)
                masks = torch.argmax(masks[:, 1:], dim=2)

                if cfg.TRAIN.NETWORK == 'TinyFlowNet':
                    loss = l1_loss(est_flows, optical_flows)
                else:    # RMNet
                    loss = lovasz_loss(est_probs, masks) + nll_loss(torch.log(est_probs), masks)

                losses.update(loss.item())
                tflownet.zero_grad()
                rmnet.zero_grad()
                loss.backward()
                optimizer.step()
            except Exception as ex:
                logging.exception(ex)
                continue

            train_writer.add_scalar('Loss/Batch', loss.item(), n_itr)
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            logging.info('[Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) Loss = %.4f' %
                         (epoch_idx, cfg.TRAIN.N_EPOCHS, batch_idx + 1, n_batches, batch_time.val(),
                          data_time.val(), losses.val()))

        lr_scheduler.step()
        epoch_end_time = time()
        train_writer.add_scalar('Loss/Epoch', losses.avg(), epoch_idx)
        logging.info('[Epoch %d/%d] EpochTime = %.3f (s) Loss = %.4f' %
                     (epoch_idx, cfg.TRAIN.N_EPOCHS, epoch_end_time - epoch_start_time, losses.avg()))

        # Evaluate the current model
        metrics = test_net(cfg, epoch_idx, val_data_loader, val_writer, tflownet, rmnet)
        if metrics.state_dict()[cfg.TEST.MAIN_METRIC_NAME] > cfg.TRAIN.KEEP_FRAME_STEPS_THRESHOLD:
            last_epoch_idx_keep_frame_steps = epoch_idx

        # Save checkpoints
        if epoch_idx % cfg.TRAIN.CKPT_SAVE_FREQ == 0 and metrics.better_than(METRICS_THRESHOLD):
            output_path = os.path.join(cfg.DIR.CHECKPOINTS, 'ckpt-epoch-%03d.pth' % epoch_idx)
            torch.save({
                'epoch_index': epoch_idx,
                'best_metrics': metrics.state_dict(),
                'tflownet': tflownet.state_dict(),
                'rmnet': rmnet.state_dict()
            }, output_path)  # yapf: disable
            logging.info('Saved checkpoint to %s ...' % output_path)

        if metrics.better_than(best_metrics):
            output_path = os.path.join(cfg.DIR.CHECKPOINTS, 'ckpt-best.pth')
            best_metrics = metrics
            torch.save({
                'epoch_index': epoch_idx,
                'best_metrics': metrics.state_dict(),
                'tflownet': tflownet.state_dict(),
                'rmnet': rmnet.state_dict()
            }, output_path)  # yapf: disable
            logging.info('Saved checkpoint to %s ...' % output_path)

    train_writer.close()
    val_writer.close()
def train_net(cfg):
    # Set up data augmentation
    IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
    CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
    train_transforms = utils.data_transforms.Compose([
        utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TRAIN.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS, cfg.TRAIN.CONTRAST, cfg.TRAIN.SATURATION),
        utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.RandomFlip(),
        utils.data_transforms.RandomPermuteRGB(),
        utils.data_transforms.ToTensor(),
    ])
    val_transforms = utils.data_transforms.Compose([
        utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.ToTensor(),
    ])

    # Set up data loader
    train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TRAIN_DATASET](cfg)
    val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TEST_DATASET](cfg)
    train_data_loader = paddle.io.DataLoader(
        dataset=train_dataset_loader.get_dataset(utils.data_loaders.DatasetType.TRAIN,
                                                 cfg.CONST.N_VIEWS_RENDERING, train_transforms),
        batch_size=cfg.CONST.BATCH_SIZE,
        # num_workers=0,  # cfg.TRAIN.NUM_WORKER > 0 raises an error because /dev/shm/ is too small,
        #                 # see https://blog.csdn.net/ctypyb2002/article/details/107914643
        # pin_memory=True,
        use_shared_memory=False,
        shuffle=True,
        drop_last=True)
    val_data_loader = paddle.io.DataLoader(
        dataset=val_dataset_loader.get_dataset(utils.data_loaders.DatasetType.VAL,
                                               cfg.CONST.N_VIEWS_RENDERING, val_transforms),
        batch_size=1,
        # num_workers=1,
        # pin_memory=True,
        shuffle=False)

    # Set up networks
    # paddle.Model: prepare / fit / save
    res_gru_net = Res_Gru_Net(cfg)
    print('[DEBUG] %s Parameters in Res_Gru_Net: %d.' % (dt.now(), utils.network_utils.count_parameters(res_gru_net)))

    # Set up learning rate scheduler to decay learning rates dynamically
    res_gru_net_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=cfg.TRAIN.RES_GRU_NET_LEARNING_RATE,
                                                                  milestones=cfg.TRAIN.RES_GRU_NET_LR_MILESTONES,
                                                                  gamma=cfg.TRAIN.GAMMA, verbose=True)

    # Set up solver
    # if cfg.TRAIN.POLICY == 'adam':
    res_gru_net_solver = paddle.optimizer.Adam(learning_rate=res_gru_net_lr_scheduler,
                                               parameters=res_gru_net.parameters())

    # Set up loss functions
    bce_loss = paddle.nn.BCELoss()

    # Load pretrained model if exists
    init_epoch = 0
    best_iou = -1
    best_epoch = -1
    if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN:
        print('[INFO] %s Recovering from %s ...' % (dt.now(), cfg.CONST.WEIGHTS))
        res_gru_net_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "res_gru_net.pdparams"))
        res_gru_net_solver_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "res_gru_net_solver.pdopt"))
        res_gru_net.set_state_dict(res_gru_net_state_dict)
        res_gru_net_solver.set_state_dict(res_gru_net_solver_state_dict)
        print('[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' %
              (dt.now(), init_epoch, best_iou, best_epoch))

    # Summary writer for TensorBoard
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat())
    log_dir = output_dir % 'logs'
    ckpt_dir = output_dir % 'checkpoints'
    # train_writer = SummaryWriter()
    # val_writer = SummaryWriter(os.path.join(log_dir, 'test'))
    train_writer = LogWriter(os.path.join(log_dir, 'train'))
    val_writer = LogWriter(os.path.join(log_dir, 'val'))

    # Training loop
    for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
        # Tick / tock
        epoch_start_time = time()

        # Batch average metrics
        batch_time = utils.network_utils.AverageMeter()
        data_time = utils.network_utils.AverageMeter()
        res_gru_net_losses = utils.network_utils.AverageMeter()

        # Switch models to training mode
        res_gru_net.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)
        for batch_idx, (rendering_images, ground_truth_volumes) in enumerate(train_data_loader()):
            # Measure data time
            data_time.update(time() - batch_end_time)

            rendering_images = rendering_images.cuda()
            ground_truth_volumes = ground_truth_volumes.cuda()
            # rendering_images:     [64, 5, 3, 224, 224]
            # ground_truth_volumes: [64, 32, 32, 32]

            # Train the res_gru_net
            generated_volumes = res_gru_net(rendering_images)
            res_gru_net_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10

            # Gradient descent
            res_gru_net_loss.backward()
            res_gru_net_solver.step()
            res_gru_net_solver.clear_grad()

            # Append loss to average metrics
            res_gru_net_losses.update(res_gru_net_loss)
            # Append loss to TensorBoard
            n_itr = epoch_idx * n_batches + batch_idx
            train_writer.add_scalar(tag='Res_Gru_Net/BatchLoss', step=n_itr, value=res_gru_net_loss)

            # Tick / tock
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            if (batch_idx % int(cfg.CONST.INFO_BATCH)) == 0:
                print('[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f' %
                      (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, batch_idx + 1, n_batches,
                       batch_time.val, data_time.val, res_gru_net_loss))

        # Append epoch loss to TensorBoard
        train_writer.add_scalar(tag='Res_Gru_Net/EpochLoss', step=epoch_idx + 1, value=res_gru_net_losses.avg)

        # Tick / tock
        epoch_end_time = time()
        print('[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f' %
              (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, epoch_end_time - epoch_start_time,
               res_gru_net_losses.avg))

        # Update rendering views
        if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
            n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
            train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
            print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' %
                  (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering))

        # Validate the training models
        iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader, val_writer, res_gru_net)

        # Save weights to file
        if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)
            utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'ckpt-epoch-%04d' % (epoch_idx + 1)),
                                                 epoch_idx + 1, res_gru_net, res_gru_net_solver, best_iou, best_epoch)
        if iou > best_iou:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)
            best_iou = iou
            best_epoch = epoch_idx + 1
            utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'best-ckpt'), epoch_idx + 1,
                                                 res_gru_net, res_gru_net_solver, best_iou, best_epoch)
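# ---------------------------------------------------------------------------
# Hedged sketch (assumption): a save_checkpoints() compatible with the calls
# in the Paddle train_net() above, writing the model and optimizer state under
# the given path prefix. The project's actual helper may store extra metadata.
# ---------------------------------------------------------------------------
import os
import paddle

def save_checkpoints(cfg, path_prefix, epoch_idx, res_gru_net, res_gru_net_solver, best_iou, best_epoch):
    # One directory per checkpoint, mirroring the *.pdparams / *.pdopt layout
    # expected by the resume logic in train_net().
    if not os.path.exists(path_prefix):
        os.makedirs(path_prefix)
    paddle.save(res_gru_net.state_dict(), os.path.join(path_prefix, 'res_gru_net.pdparams'))
    paddle.save(res_gru_net_solver.state_dict(), os.path.join(path_prefix, 'res_gru_net_solver.pdopt'))
    print('[INFO] Saved checkpoint for epoch %d (best IoU %.4f at epoch %d)' % (epoch_idx, best_iou, best_epoch))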