def test_restore_checkpoint(): # Create Model model = model_builder.create(cfg.MODEL.TYPE, train=True) add_momentum_init_ops(model) init_weights(model) # Fill input blobs roidb = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES ) model_builder.add_training_inputs(model, roidb=roidb) workspace.CreateNet(model.net) # Bookkeeping for checkpoint creation iter_num = 0 checkpoints = {} output_dir = get_output_dir(cfg.TRAIN.DATASETS, training=True) chk_file_path = os.path.join(output_dir, 'model_iter{}.pkl'.format(iter_num)) checkpoints[iter_num] = chk_file_path # Save model weights nu.save_model_to_weights_file(checkpoints[iter_num], model) orig_gpu_0_params, orig_all_params = get_params(model) # Change the model weights init_weights(model) # Reload the weights in the model nu.initialize_gpu_from_weights_file(model, chk_file_path, gpu_id=0) nu.broadcast_parameters(model) shutil.rmtree(cfg.OUTPUT_DIR) _, restored_all_params = get_params(model) # Check if all params are loaded correctly for scoped_name, blob in orig_all_params.items(): np.testing.assert_array_equal(blob, restored_all_params[scoped_name]) # Check if broadcast_parameters works for scoped_name, blob in restored_all_params.items(): unscoped_name = c2_utils.UnscopeName(scoped_name) np.testing.assert_array_equal(blob, orig_gpu_0_params[unscoped_name])
def main(opts): logger = logging.getLogger(__name__) roidb = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) logger.info('{:d} roidb entries'.format(len(roidb))) roi_data_loader = RoIDataLoader( roidb, num_loaders=cfg.DATA_LOADER.NUM_THREADS, minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE, blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY ) blob_names = roi_data_loader.get_output_names() net = core.Net('dequeue_net') net.type = 'dag' all_blobs = [] for gpu_id in range(cfg.NUM_GPUS): with core.NameScope('gpu_{}'.format(gpu_id)): with core.DeviceScope(muji.OnGPU(gpu_id)): for blob_name in blob_names: blob = core.ScopedName(blob_name) all_blobs.append(blob) workspace.CreateBlob(blob) logger.info('Creating blob: {}'.format(blob)) net.DequeueBlobs( roi_data_loader._blobs_queue_name, blob_names) logger.info("Protobuf:\n" + str(net.Proto())) if opts.profiler: import cProfile cProfile.runctx( 'loader_loop(roi_data_loader)', globals(), locals(), sort='cumulative') else: loader_loop(roi_data_loader) roi_data_loader.register_sigint_handler() roi_data_loader.start(prefill=True) total_time = 0 for i in range(opts.num_batches): start_t = time.time() for _ in range(opts.x_factor): workspace.RunNetOnce(net) total_time += (time.time() - start_t) / opts.x_factor logger.info( '{:d}/{:d}: Averge dequeue time: {:.3f}s [{:d}/{:d}]'.format( i + 1, opts.num_batches, total_time / (i + 1), roi_data_loader._minibatch_queue.qsize(), cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE ) ) # Sleep to simulate the time taken by running a little network time.sleep(opts.sleep_time) # To inspect: # blobs = workspace.FetchBlobs(all_blobs) # from IPython import embed; embed() logger.info('Shutting down data loader...') roi_data_loader.shutdown()
def add_model_training_inputs(model): """Load the training dataset and attach the training inputs to the model.""" logger = logging.getLogger(__name__) logger.info('Loading dataset: {}'.format(cfg.TRAIN.DATASETS)) roidb = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES ) logger.info('{:d} roidb entries'.format(len(roidb))) model_builder.add_training_inputs(model, roidb=roidb)