def train_model():
    """Model training loop."""
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    setup_model_for_training(model, weights_file, output_dir)
    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Save train loss and metrics
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
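

# handle_critical_error() is called above but not defined in this snippet; the
# sketch below is only an assumption about its behavior (log the message, stop
# the data loader if present, and abort), not necessarily the definition used
# elsewhere in this project.
def handle_critical_error(model, msg):
    logging.getLogger(__name__).critical(msg)
    if hasattr(model, 'roi_data_loader'):
        model.roi_data_loader.shutdown()
    raise Exception(msg)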


def main():
    # Initialize C2
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1'])
    # Set up logging and load config options
    logger = setup_logging(__name__)
    logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.opts is not None:
        merge_cfg_from_list(args.opts)
    assert_and_infer_cfg()

    smi_output, cuda_ver, cudnn_ver = c2_utils.get_nvidia_info()
    logger.info("cuda version : {}".format(cuda_ver))
    logger.info("cudnn version: {}".format(cudnn_ver))
    logger.info("nvidia-smi output:\n{}".format(smi_output))
    logger.info('Training with config:')
    logger.info(pprint.pformat(cfg))
    # Note that while we set the numpy random seed, network training will not
    # be deterministic in general. There are sources of non-determinism that
    # cannot be removed with a reasonable execution-speed tradeoff (such as
    # certain non-deterministic cudnn functions).
    np.random.seed(cfg.RNG_SEED)

    # Test model
    logger.info("create test model ...")
    test_model = test_engine.initialize_model_from_cfg(cfg.TEST.WEIGHTS, gpu_id=0)
    logger.info("created test model ...")
    train_data = DataLoader(root, "train_id.txt", cfg, test_model, is_train=True)

    # Create the training model
    model, weights_file, start_iter, checkpoints = create_model(
        True, cfg, output_dir)
    # Print the blobs currently in the workspace (debugging)
    print(workspace.Blobs())
    # Create the input blobs
    blob_names = ['data_stage2', 'gt_label_stage2']
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob_name in blob_names:
                workspace.CreateBlob(core.ScopedName(blob_name))
    # Override random weight initialization with weights from a saved model
    if weights_file:
        nu.initialize_gpu_from_weights_file(model, weights_file, gpu_id=0)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)

    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    logger.info("start train ...")
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        # Feed data
        # print("{} iter starting feed data...".format(cur_iter))
        data_stage2, gt_label = train_data.next_batch()
        # NOTE: gpu_id still holds the last value from the blob-creation loop
        # above, so the batch is fed under that GPU's name scope.
        with c2_utils.NamedCudaScope(gpu_id):
            workspace.FeedBlob(core.ScopedName('data_stage2'), data_stage2)
            workspace.FeedBlob(core.ScopedName('gt_label_stage2'), gt_label)

        # print("workspace.RunNet(model.net.Proto().name)")
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)
        # print("end of RunNet")

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Save train loss and metrics
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)

    # Execute the training run
    # NOTE: this re-runs detectron.utils.train.train_model() and overwrites the
    # checkpoints produced by the loop above.
    checkpoints = detectron.utils.train.train_model()
    # Test the trained model
    if not args.skip_test:
        test_model(checkpoints['final'], args.multi_gpu_testing, args.opts)
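

# Script entry point (assumed; not shown in this snippet): run main() when the
# file is executed directly.
if __name__ == '__main__':
    main()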