def main(args):
    device_opts = deviceOpts()
    workspace.CreateBlob('conv4_norm')
    workspace.CreateBlob('data')
    init_def = initNet(args.init_net, device_opts)
    net_def = createNet(args.pred_net, device_opts, use_cudnn=args.cudnn)
    print(net_def)
def testSparse(self):
    # to test duplicated indices we assign two indices to each weight and
    # thus each weight might count once or twice
    DUPLICATION = 2
    perfect_model = np.array([2, 6, 5, 0, 1]).astype(np.float32)
    np.random.seed(123)  # make test deterministic
    data = np.random.randint(
        2, size=(20, perfect_model.size * DUPLICATION)).astype(np.float32)
    label = np.dot(data, np.repeat(perfect_model, DUPLICATION))

    model = cnn.CNNModelHelper("NCHW", name="test")
    # imitate what model wrapper does
    w = model.param_init_net.ConstantFill(
        [], 'w', shape=[perfect_model.size], value=0.0)
    model.params.append(w)
    picked = model.net.Gather([w, 'indices'], 'gather')
    out = model.ReduceFrontSum(picked, 'sum')
    sq = model.SquaredL2Distance([out, 'label'])
    loss = model.AveragedLoss(sq, "avg_loss")
    grad_map = model.AddGradientOperators([loss])
    self.assertIsInstance(grad_map['w'], core.GradientSlice)
    optimizer = self.build_optimizer(model)

    workspace.CreateBlob('indices')
    workspace.CreateBlob('label')

    for indices_type in [np.int32, np.int64]:
        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net, True)
        for _ in range(2000):
            idx = np.random.randint(data.shape[0])
            # transform into indices of binary features
            indices = np.repeat(np.arange(perfect_model.size),
                                DUPLICATION)[data[idx] == 1]
            if indices.size == 0:
                continue
            workspace.FeedBlob(
                'indices',
                indices.reshape((indices.size,)).astype(indices_type)
            )
            workspace.FeedBlob('label',
                               np.array(label[idx]).astype(np.float32))
            workspace.RunNet(model.net.Proto().name)

        np.testing.assert_allclose(
            perfect_model,
            workspace.FetchBlob('w'),
            atol=1e-2
        )
    self.check_optimizer(optimizer)
def create_enqueue_blobs(self):
    """Create enqueue blobs."""
    blob_names = self.get_output_names()
    enqueue_blob_names = [
        '{}_enqueue_{}'.format(b, self._loader_id) for b in blob_names
    ]
    for gpu_id in range(self._num_gpus):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob in enqueue_blob_names:
                workspace.CreateBlob(core.ScopedName(blob))
    if self._num_gpus == 0:
        for blob in enqueue_blob_names:
            workspace.CreateBlob(core.ScopedName(blob))
    return enqueue_blob_names
def test_last_n_window_ops_shape_inference_4d_input(self):
    input_shape = [3, 2, 4, 5]
    collect_net = core.Net("collect_net")
    collect_net.GivenTensorFill(
        [],
        "input",
        shape=input_shape,
        values=[
            float(val)
            for val in range(functools.reduce(operator.mul, input_shape))
        ],
    )
    workspace.CreateBlob("output")
    workspace.FeedBlob("next", np.array(0, dtype=np.int32))
    collect_net.LastNWindowCollector(
        ["output", "next", "input"],
        ["output", "next"],
        num_to_collect=7,
    )
    (shapes, types) = workspace.InferShapesAndTypes([collect_net])
    workspace.RunNetOnce(collect_net)
    self.assertTrue(
        np.array_equal(
            shapes["output"],
            np.array([7, *list(workspace.blobs["output"].shape[1:])])
        )
    )
def add_training_inputs(model, roidb=None):
    """Create network input ops and blobs used for training. To be called
    *after* model_builder.create().
    """
    # Implementation notes:
    #   Typically, one would create the input ops and then the rest of the net.
    #   However, creating the input ops depends on loading the dataset, which
    #   can take a few minutes for COCO.
    #   We prefer to avoid waiting so debugging can fail fast.
    #   Thus, we create the net *without input ops* prior to loading the
    #   dataset, and then add the input ops after loading the dataset.
    #   Since we defer input op creation, we need to do a little bit of surgery
    #   to place the input ops at the start of the network op list.
    assert model.train, 'Training inputs can only be added to a trainable model'
    if roidb is not None:
        # To make debugging easier you can set cfg.DATA_LOADER.NUM_THREADS = 1
        model.roi_data_loader = RoIDataLoader(
            roidb,
            num_loaders=cfg.DATA_LOADER.NUM_THREADS,
            minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE,
            blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY)
    orig_num_op = len(model.net._net.op)
    blob_names = roi_data_minibatch.get_minibatch_blob_names(is_training=True)
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob_name in blob_names:
                workspace.CreateBlob(core.ScopedName(blob_name))
            model.net.DequeueBlobs(
                model.roi_data_loader._blobs_queue_name, blob_names)
    # A little op surgery to move input ops to the start of the net
    diff = len(model.net._net.op) - orig_num_op
    new_op = model.net._net.op[-diff:] + model.net._net.op[:-diff]
    del model.net._net.op[:]
    model.net._net.op.extend(new_op)
def test_last_n_window_ops(self):
    collect_net = core.Net("collect_net")
    collect_net.GivenTensorFill(
        [],
        "input",
        shape=[3, 2],
        values=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    )
    input_array = np.array(list(range(1, 7)), dtype=np.float32).reshape(3, 2)

    workspace.CreateBlob("output")
    workspace.FeedBlob("next", np.array(0, dtype=np.int32))
    collect_net.LastNWindowCollector(
        ["output", "next", "input"],
        ["output", "next"],
        num_to_collect=7,
    )

    plan = core.Plan("collect_data")
    plan.AddStep(core.execution_step("collect_data", [collect_net], num_iter=1))
    workspace.RunPlan(plan)
    reference_result = workspace.FetchBlob("output")
    npt.assert_array_equal(input_array, reference_result)

    plan = core.Plan("collect_data")
    plan.AddStep(core.execution_step("collect_data", [collect_net], num_iter=2))
    workspace.RunPlan(plan)
    reference_result = workspace.FetchBlob("output")
    npt.assert_array_equal(input_array[[1, 2, 2, 0, 1, 2, 0]], reference_result)

    plan = core.Plan("collect_data")
    plan.AddStep(core.execution_step("collect_data", [collect_net], num_iter=3))
    workspace.RunPlan(plan)
    reference_result = workspace.FetchBlob("output")
    npt.assert_array_equal(input_array[[2, 0, 1, 2, 2, 0, 1]], reference_result)
def main(opts):
    logger = logging.getLogger(__name__)
    roidb = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    logger.info('{:d} roidb entries'.format(len(roidb)))
    roi_data_loader = RoIDataLoader(
        roidb,
        num_loaders=cfg.DATA_LOADER.NUM_THREADS,
        minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE,
        blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY
    )
    blob_names = roi_data_loader.get_output_names()

    net = core.Net('dequeue_net')
    net.type = 'dag'
    all_blobs = []
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                for blob_name in blob_names:
                    blob = core.ScopedName(blob_name)
                    all_blobs.append(blob)
                    workspace.CreateBlob(blob)
                    logger.info('Creating blob: {}'.format(blob))
                net.DequeueBlobs(
                    roi_data_loader._blobs_queue_name, blob_names)
    logger.info("Protobuf:\n" + str(net.Proto()))

    if opts.profiler:
        import cProfile
        cProfile.runctx(
            'loader_loop(roi_data_loader)', globals(), locals(),
            sort='cumulative')
    else:
        loader_loop(roi_data_loader)

    roi_data_loader.register_sigint_handler()
    roi_data_loader.start(prefill=True)
    total_time = 0
    for i in range(opts.num_batches):
        start_t = time.time()
        for _ in range(opts.x_factor):
            workspace.RunNetOnce(net)
        total_time += (time.time() - start_t) / opts.x_factor
        logger.info(
            '{:d}/{:d}: Average dequeue time: {:.3f}s [{:d}/{:d}]'.format(
                i + 1, opts.num_batches, total_time / (i + 1),
                roi_data_loader._minibatch_queue.qsize(),
                cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE
            )
        )
        # Sleep to simulate the time taken by running a little network
        time.sleep(opts.sleep_time)

    # To inspect:
    # blobs = workspace.FetchBlobs(all_blobs)
    # from IPython import embed; embed()

    logger.info('Shutting down data loader...')
    roi_data_loader.shutdown()
def add_image_blob(image_blob_name='image'):
    if image_blob_name in workspace.Blobs():
        return image_blob_name
    image = Image.open(sample_image_path)
    image = preproc_image(image)
    device_opt = core.scope.CurrentDeviceScope()
    scoped_image_blob = core.ScopedName(image_blob_name)
    if device_opt is None:
        workspace.CreateBlob(scoped_image_blob)
        workspace.FeedBlob(scoped_image_blob, image)
    else:
        workspace.CreateBlob(scoped_image_blob, device_option=device_opt)
        workspace.FeedBlob(scoped_image_blob, image, device_option=device_opt)
    return image_blob_name, image
def gen_param_update_builder_fun(self, model, dataset, is_train):
    if not is_train:
        return None
    else:
        # from sherlok
        for idx in range(
                self.opts['distributed']['first_xpu_id'],
                self.opts['distributed']['first_xpu_id'] +
                self.opts['distributed']['num_xpus']):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, idx)):
                workspace.CreateBlob('{}_{}/lr'.format(
                    self.opts['distributed']['device'], idx))

        def add_parameter_update_ops(model):
            model.Iter("ITER")
            weight_decay = model.param_init_net.ConstantFill(
                [], 'weight_decay', shape=[1],
                value=self.opts['model_param']['weight_decay'])
            weight_decay_bn = model.param_init_net.ConstantFill(
                [], 'weight_decay_bn', shape=[1],
                value=self.opts['model_param']['weight_decay_bn'])
            one = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0)

            '''
            Add the momentum-SGD update.
            '''
            params = model.GetParams()
            assert len(params) > 0

            for param in params:
                param_grad = model.param_to_grad[param]
                param_momentum = model.param_init_net.ConstantFill(
                    [param], param + '_momentum', value=0.0)

                if '_bn' in str(param):
                    model.WeightedSum(
                        [param_grad, one, param, weight_decay_bn], param_grad)
                else:
                    model.WeightedSum(
                        [param_grad, one, param, weight_decay], param_grad)

                # Update param_grad and param_momentum in place
                model.net.MomentumSGDUpdate(
                    [param_grad, param_momentum, 'lr', param],
                    [param_grad, param_momentum, param],
                    momentum=0.9,
                    nesterov=1)

        return add_parameter_update_ops
def create_model(weights_file):
    """Adapted from utils.train.setup_model_for_training."""
    model = model_builder.create(cfg.MODEL.TYPE, train=True)
    if cfg.MEMONGER:
        optimize_memory(model)
    # Performs random weight initialization as defined by the model
    workspace.RunNetOnce(model.param_init_net)

    roidb = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES
    )
    # To make debugging easier you can set cfg.DATA_LOADER.NUM_THREADS = 1
    model.roi_data_loader = RoIDataLoaderSimple(
        roidb,
        num_loaders=cfg.DATA_LOADER.NUM_THREADS,
        minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE,
        blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY
    )
    orig_num_op = len(model.net._net.op)
    blob_names = roi_data_minibatch.get_minibatch_blob_names(is_training=True)
    with c2_utils.NamedCudaScope(0):
        for blob_name in blob_names:
            workspace.CreateBlob(core.ScopedName(blob_name))
        model.net.DequeueBlobs(
            model.roi_data_loader._blobs_queue_name, blob_names
        )
    # A little op surgery to move input ops to the start of the net
    diff = len(model.net._net.op) - orig_num_op
    new_op = model.net._net.op[-diff:] + model.net._net.op[:-diff]
    del model.net._net.op[:]
    model.net._net.op.extend(new_op)

    nu.initialize_gpu_from_weights_file(model, weights_file, gpu_id=0)
    nu.broadcast_parameters(model)

    workspace.CreateBlob("gpu_0/track_n_rois_two")
    workspace.CreateNet(model.net)

    # Start loading mini-batches and enqueuing blobs
    model.roi_data_loader.register_sigint_handler()
    model.roi_data_loader.start(prefill=True)
    return model
def add_inputs(model, roidb=None, landb=None, proposals=None, split='train'):
    """Create network input ops and blobs used for training. To be called
    *after* model_builder.create().
    """
    # Implementation notes:
    #   Typically, one would create the input ops and then the rest of the net.
    #   However, creating the input ops depends on loading the dataset, which
    #   can take a few minutes for COCO.
    #   We prefer to avoid waiting so debugging can fail fast.
    #   Thus, we create the net *without input ops* prior to loading the
    #   dataset, and then add the input ops after loading the dataset.
    #   Since we defer input op creation, we need to do a little bit of surgery
    #   to place the input ops at the start of the network op list.
    if roidb is not None:
        # To make debugging easier you can set cfg.DATA_LOADER.NUM_THREADS = 1
        model.roi_data_loader = RoIDataLoader(
            split=split,
            roidb=roidb,
            landb=landb,
            proposals=proposals,
            num_loaders=cfg.DATA_LOADER.NUM_THREADS)
    orig_num_op = len(model.net._net.op)
    blob_names = roi_data.minibatch_rel.get_minibatch_blob_names(split)
    for gpu_id in range(cfg.NUM_DEVICES):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob_name in blob_names:
                workspace.CreateBlob(core.ScopedName(blob_name))
            model.net.DequeueBlobs(
                model.roi_data_loader._blobs_queue_name, blob_names)
            workspace.CreateBlob(core.ScopedName('all_obj_word_vecs'))
            workspace.CreateBlob(core.ScopedName('all_prd_word_vecs'))
    # A little op surgery to move input ops to the start of the net
    diff = len(model.net._net.op) - orig_num_op
    new_op = model.net._net.op[-diff:] + model.net._net.op[:-diff]
    del model.net._net.op[:]
    model.net._net.op.extend(new_op)
def init_net():
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    np.random.seed(cfg.RNG_SEED)
    cfg.TEST.DATA_TYPE = 'test'
    if cfg.TEST.TEST_FULLY_CONV is True:
        cfg.TRAIN.CROP_SIZE = cfg.TRAIN.JITTER_SCALES[0]
        cfg.TEST.USE_MULTI_CROP = 1
    elif cfg.TEST.TEST_FULLY_CONV_FLIP is True:
        cfg.TRAIN.CROP_SIZE = cfg.TRAIN.JITTER_SCALES[0]
        cfg.TEST.USE_MULTI_CROP = 2
    else:
        cfg.TRAIN.CROP_SIZE = 224
    workspace.ResetWorkspace()

    test_model = model_builder_video.ModelBuilder(
        name='{}_test'.format(cfg.MODEL.MODEL_NAME),
        train=False,
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        split=cfg.TEST.DATA_TYPE)
    test_model.build_model()

    if cfg.PROF_DAG:
        test_model.net.Proto().type = 'prof_dag'
    else:
        test_model.net.Proto().type = 'dag'
    workspace.RunNetOnce(test_model.param_init_net)
    net = test_model.net
    checkpoints.load_model_from_params_file_for_test(
        test_model, cfg.TEST.PARAMS_FILE)

    # Revise the input blob from `reader_val/reader_test` to a new blob that
    # enables frame-sequence input. Inserting an op at the head of the network
    # would require rebuilding it; adding an external input blob is enough.
    clip_blob = core.BlobReference('gpu_0/data')
    net.AddExternalInput(clip_blob)

    # Delete the original video input op: blob 'gpu_0/data' used to be produced
    # by that op and is now fed by hand.
    ops = net.Proto().op
    # assert 'reader' in ops[0].name
    assert ops[0].type == 'CustomizedVideoInput'
    del ops[0]

    workspace.CreateBlob('gpu_0/data')
    workspace.CreateNet(net)
    return net
def get_net(data_loader, name):
    logger = logging.getLogger(__name__)
    blob_names = data_loader.get_output_names()
    net = core.Net(name)
    net.type = 'dag'
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                for blob_name in blob_names:
                    blob = core.ScopedName(blob_name)
                    workspace.CreateBlob(blob)
                net.DequeueBlobs(data_loader._blobs_queue_name, blob_names)
    logger.info("Protobuf:\n" + str(net.Proto()))
    return net
def create_threads(self): # "worker" threads to construct (partial) minibatches and put them on # minibatch CPU queue in CPU memory (limited by queue size). self._worker_ids = self.get_worker_ids() self._workers = [ threading.Thread( target=self.minibatch_loader, name='worker_{}'.format(worker_id), args=[worker_id], ) for worker_id in self._worker_ids ] # create one BlobsQueue per DEVICE which holds the training data in GPU # memory and feeds to the net prefix, device = helpers.get_prefix_and_device() # the root device id = 0 for device_id in range(0, self._num_devices): with core.NameScope('{}{}'.format(prefix, device_id)): self.create_blobs_queue( queue_name=self._blobs_queue_name, num_blobs=len(self._blobs_idx_map), capacity=self._device_blobs_queue_capacity) # launch enqueuer threads # Create one blob for each (blob_name, enqueuer_thread_id) pair: # <train/test>_<blob_name>_enqueue_<enqueuer_thread_id> # The distinction between train/test here is important since when we use # EnqueueBlobs op, we need to distinguish otherwise data can get mixed. blob_names = self._blobs_idx_map.keys() enqueue_blobs_names = [[ '{}_{}_enqueue_{}'.format(self._split, blob_name, idx) for blob_name in blob_names ] for idx in range(self._num_enqueuers)] for device_id in range(0, self._num_devices): # NameScope is prepended to all the blobs in the workspace with core.NameScope('{}{}'.format(prefix, device_id)): with core.DeviceScope(core.DeviceOption(device, device_id)): for blob_list in enqueue_blobs_names: for blob in blob_list: scoped_blob_name = scope.CurrentNameScope() + blob workspace.CreateBlob(scoped_blob_name) # create the enqueuer threads self._enqueuers = [ threading.Thread(target=self.enqueue_blobs_thread, args=(device_id, enqueue_blobs_names[idx])) for device_id in range(0, self._num_devices) for idx in range(self._num_enqueuers) ]
def create_threads(self):
    # Create mini-batch loader threads, each of which builds mini-batches
    # and places them into a queue in CPU memory
    threading_fn = multiprocessing.Process
    self._workers = [
        threading_fn(
            target=RoIDataLoader.minibatch_loader2,
            args=(self.shared_readonly_dict, self._minibatch_queue,
                  self._lock, self.mp_cur, self.mp_perm, self.coordinator))
        for _ in range(self._num_workers)
    ]

    # Create one BlobsQueue per GPU, each of which feeds a blob in GPU
    # memory to a net
    for gpu_id in range(self._num_gpus):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            self.create_blobs_queue()

    # An enqueuer thread moves mini-batches from the shared CPU memory queue
    # to a GPU blobs queue. Each GPU has its own pool of enqueuer threads.
    # Create one blob for each
    # (loader output, enqueuer thread, RoIDataLoader instance) triple:
    #   <loader_output>_enqueue_<enqueuer_thread_id>_<loader_id>
    blob_names = self.get_output_names()
    enqueue_blob_names = [
        [
            '{}_enqueue_{}_{}'.format(blob_name, i, self._loader_id)
            for blob_name in blob_names
        ]
        for i in range(self._num_enqueuers)
    ]
    for gpu_id in range(self._num_gpus):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(
                    core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
                for blob_list in enqueue_blob_names:
                    for blob in blob_list:
                        workspace.CreateBlob(core.ScopedName(blob))

    # Create enqueuer threads
    self._enqueuers = [
        # This enqueues into C2, which can't be done by multiple processes,
        # so it needs to be done with the threading module
        threading.Thread(
            target=self.enqueue_blobs_thread,
            args=(gpu_id, enqueue_blob_names[i]))
        for gpu_id in range(self._num_gpus)
        for i in range(self._num_enqueuers)
    ]
def create_threads(self): # "worker" threads to construct (partial) minibatches and put them on # minibatch queue in CPU memory (limited by queue size). self._worker_ids = self.get_worker_ids() self._workers = [ threading.Thread( target=self.minibatch_loader, name='worker_{}'.format(worker_id), args=[worker_id], ) for worker_id in self._worker_ids ] # Create one BlobsQueue per GPU which holds the training data in GPU # memory and feeds to the net. root_gpu_id = cfg.ROOT_GPU_ID for gpu_id in range(root_gpu_id, root_gpu_id + self._num_gpus): with core.NameScope('gpu_{}'.format(gpu_id)): self.create_blobs_queue( queue_name=self._blobs_queue_name, num_blobs=len(self._blobs_idx_map), capacity=self._gpu_blobs_queue_capacity) # Launch enqueuer threads. blob_names = self._blobs_idx_map.keys() enqueue_blobs_names = [ '{}_{}_enqueue'.format(self._split, blob_name) for blob_name in blob_names ] for gpu_id in range(root_gpu_id, root_gpu_id + self._num_gpus): with core.NameScope('gpu_{}'.format(gpu_id)): with core.DeviceScope( core.DeviceOption(caffe2_pb2.CUDA, gpu_id)): for blob_list in enqueue_blobs_names: for blob in blob_list: scoped_blob_name = scope.CurrentNameScope() + blob workspace.CreateBlob(scoped_blob_name) self._enqueuer = threading.Thread(target=self.enqueue_blobs_thread, args=(0, enqueue_blobs_names))
def test_last_n_window_ops_shape_inference(self):
    collect_net = core.Net("collect_net")
    collect_net.GivenTensorFill(
        [],
        "input",
        shape=[3, 2],
        values=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    )
    workspace.CreateBlob("output")
    workspace.FeedBlob("next", np.array(0, dtype=np.int32))
    collect_net.LastNWindowCollector(
        ["output", "next", "input"],
        ["output", "next"],
        num_to_collect=7,
    )
    (shapes, types) = workspace.InferShapesAndTypes([collect_net])
    workspace.RunNetOnce(collect_net)
    self.assertTrue(
        np.array_equal(
            shapes["output"],
            np.array([7, workspace.blobs["output"].shape[1]])
        )
    )
def create_multi_gpu_blob(blob_name):
    prefix, device = helpers.get_prefix_and_device()
    for idx in range(0, cfg.NUM_DEVICES):
        with core.DeviceScope(core.DeviceOption(device, idx)):
            workspace.CreateBlob('{}{}/{}'.format(prefix, idx, blob_name))
def create_input_blobs_for_net(net_def):
    for op in net_def.op:
        for blob_in in op.input:
            if not workspace.HasBlob(blob_in):
                workspace.CreateBlob(blob_in)
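# Usage sketch for create_input_blobs_for_net above (an assumption added for
# illustration, not part of the original source): load a serialized predict
# net, make sure every blob its operators read exists in the workspace, then
# instantiate the net. "model.pb" is a hypothetical path.
from caffe2.proto import caffe2_pb2
from caffe2.python import workspace

predict_net = caffe2_pb2.NetDef()
with open('model.pb', 'rb') as f:
    predict_net.ParseFromString(f.read())

create_input_blobs_for_net(predict_net)  # create any missing input blobs
workspace.CreateNet(predict_net)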
def input_fn(model):
    for blob_name in blob_names:
        workspace.CreateBlob(scope.CurrentNameScope() + blob_name)
    model.DequeueBlobs(queue_name, blob_names)
    model.StopGradient('data{}'.format(suffix), 'data{}'.format(suffix))
#-----------------------------------------------------------------------------------------------#
from caffe2.python import workspace
from models import model_builder_video, resnet_video_org

workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
workspace.ResetWorkspace()

c2_net = model_builder_video.ModelBuilder(
    name='test',
    train=False,
    use_cudnn=False,
    cudnn_exhaustive_search=False,
    split='val')
c2_net.net.Proto().type = 'dag'

workspace.CreateBlob('data')
workspace.CreateBlob('labels')
c2_net, out_blob = resnet_video_org.create_model(
    model=c2_net,
    data='data',
    labels='labels',
    split='val',
    use_nl=args.model == 'r50_nl')
workspace.RunNetOnce(c2_net.param_init_net)
workspace.CreateNet(c2_net.net)

# load pretrained weights
if args.model == 'r50':
    wt_file = 'pretrained/i3d_baseline_32x2_IN_pretrain_400k.pkl'
elif args.model == 'r50_nl':
    wt_file = 'pretrained/i3d_nonlocal_32x2_IN_pretrain_400k.pkl'
wts = pickle.load(open(wt_file, 'rb'), encoding='latin')['blobs']
for key in wts:
def add_train_inputs(model):
    blob_names = model.roi_data_loader.get_output_names()
    for blob_name in blob_names:
        workspace.CreateBlob(core.ScopedName(blob_name))
    model.net.DequeueBlobs(
        model.roi_data_loader._blobs_queue_name, blob_names)
def run_inference(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(', ')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }
    model = cnn.CNNModelHelper(
        name="Extract Features",
        **my_arg_scope
    )

    # gpu?
    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
        model._device_type = caffe2_pb2.CUDA
        model._cuda_gpu_id = 0
        model._devices = [0]
    # cpu
    else:
        log.info("Running on CPU")
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]

    # create the scope
    device_opt = core.DeviceOption(model._device_type, 0)
    with core.DeviceScope(device_opt):
        with core.NameScope("{}_{}".format("gpu", 0)):
            create_model_ops(model, 1.0, args)

    # gather parameters
    batch = 1
    channels_rgb = args.num_channels
    frames_per_clip = args.clip_length_rgb
    crop_size = args.crop_size
    width = args.scale_w
    height = args.scale_h
    input_video = args.input

    # configuration for the input
    # data = np.empty((1, channels_rgb, frames_per_clip, crop_size, crop_size))
    # label = np.empty((1, 1))

    # initialize the network
    workspace.CreateBlob("gpu_0/data")
    workspace.CreateBlob("gpu_0/label")
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if args.db_type == 'minidb':
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(
            model,
            args.load_model_path,
            use_gpu=False,
        )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    outputs = [name.strip() for name in args.features.split(', ')]
    assert len(outputs) > 0

    input_video = cv2.VideoCapture(input_video)
    with open(args.labels) as f:
        matching_labels = np.array(json.load(f))

    clip_list = []
    label = np.empty((1)).astype('int32')

    # create windows for opencv
    cv2.namedWindow('frame', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('frame', 800, 600)
    cv2.namedWindow('processed', cv2.WINDOW_NORMAL)
    cv2.moveWindow('processed', 800, 0)

    while True:
        # get a frame from the video
        video_available, frame = input_video.read()
        if not video_available:
            break
        pre_processed_frame = put_in_shape(
            frame, resize_to=(width, height), crop_to=(crop_size, crop_size))
        clip_list.append(pre_processed_frame)
        if len(clip_list) != frames_per_clip:
            continue

        print('sending one set of images to the network!')
        # put the list of frames in the shape for the network
        input_clip = pre_process(clip_list, crop_size, crop_size)
        # remove the first frame
        del clip_list[0]

        # send the data to the network
        workspace.FeedBlob("gpu_0/data", input_clip)
        workspace.FeedBlob("gpu_0/label", label)

        # fetch the outputs
        activations = fetch_activations(model, outputs)

        # get the score for each class
        softmax = activations['softmax']

        cv2.imshow('frame', frame)
        cv2.imshow('processed', pre_processed_frame)
        for i in range(len(softmax)):
            sorted_preds = np.argsort(softmax[i])
            sorted_preds[:] = sorted_preds[::-1]
            put_text_on_image(frame, matching_labels[sorted_preds[0:5]])

        cv2.imshow('frame', frame)
        if cv2.waitKey(1) & 0xff == ord('q'):
            break

    cv2.destroyAllWindows()
[softmax, loss] = resnet.create_resnet50(
    test_model,
    "data",
    num_input_channels=3,
    num_labels=1000,
    label="label",
    no_bias=True)

device_opts = caffe2_pb2.DeviceOption()
device_opts.device_type = caffe2_pb2.CUDA
device_opts.cuda_gpu_id = 0

net_def = test_model.net.Proto()
net_def.device_option.CopyFrom(device_opts)
test_model.param_init_net.RunAllOnGPU(gpu_id=0, use_cudnn=True)

workspace.CreateBlob("data")
workspace.CreateBlob("label")
workspace.RunNetOnce(test_model.param_init_net)
workspace.CreateNet(net_def)

workspace.FeedBlob(
    'data',
    np.random.rand(100, 3, 224, 224).astype(np.float32),
    device_opts)
workspace.FeedBlob(
    'label',
    np.ones([100, ], dtype=np.int32),
    device_opts)

# start = time.time()
# for i in range(1000):
#     workspace.RunNet(net_def.name, 1)
def add_test_inputs(model):
    blob_names = roi_data.minibatch.get_minibatch_blob_names()
    for blob_name in blob_names:
        workspace.CreateBlob(core.ScopedName(blob_name))
def input_fn(model):
    for blob_name in blob_names:
        workspace.CreateBlob(scope.CurrentNameScope() + blob_name)
    model.net.DequeueBlobs(queue_name, blob_names)
    model.StopGradient('data', 'data')
def create_blobs_if_not_existed(blob_names):
    existing_names = set(workspace.Blobs())
    for blob_name in blob_names:
        if blob_name not in existing_names:
            workspace.CreateBlob(str(blob_name))
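# Usage sketch for create_blobs_if_not_existed above (an assumption, not from
# the original source): the blob names are hypothetical placeholders for
# externally fed inputs; repeated calls leave existing blobs untouched.
from caffe2.python import workspace

external_inputs = ['gpu_0/data', 'gpu_0/label']
create_blobs_if_not_existed(external_inputs)  # creates both blobs
create_blobs_if_not_existed(external_inputs)  # second call is a no-op
print(workspace.Blobs())  # both names are now present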
def main():
    # Initialize C2
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1'])
    # Set up logging and load config options
    logger = setup_logging(__name__)
    logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.opts is not None:
        merge_cfg_from_list(args.opts)
    assert_and_infer_cfg()
    smi_output, cuda_ver, cudnn_ver = c2_utils.get_nvidia_info()
    logger.info("cuda version : {}".format(cuda_ver))
    logger.info("cudnn version: {}".format(cudnn_ver))
    logger.info("nvidia-smi output:\n{}".format(smi_output))
    logger.info('Training with config:')
    logger.info(pprint.pformat(cfg))
    # Note that while we set the numpy random seed network training will not be
    # deterministic in general. There are sources of non-determinism that cannot
    # be removed with a reasonable execution-speed tradeoff (such as certain
    # non-deterministic cudnn functions).
    np.random.seed(cfg.RNG_SEED)

    # test model
    logger.info("creating test model ...")
    test_model = test_engine.initialize_model_from_cfg(
        cfg.TEST.WEIGHTS, gpu_id=0)
    logger.info("created test model ...")
    train_data = DataLoader(
        root, "train_id.txt", cfg, test_model, is_train=True)

    # create model
    model, weights_file, start_iter, checkpoints = create_model(
        True, cfg, output_dir)

    # test blobs
    print(workspace.Blobs())

    # create input blobs
    blob_names = ['data_stage2', 'gt_label_stage2']
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob_name in blob_names:
                workspace.CreateBlob(core.ScopedName(blob_name))

    # Override random weight initialization with weights from a saved model
    if weights_file:
        nu.initialize_gpu_from_weights_file(model, weights_file, gpu_id=0)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)

    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    logger.info("start train ...")

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        # feed data
        # print("{} iter starting feed data...".format(cur_iter))
        data_stage2, gt_label = train_data.next_batch()
        with c2_utils.NamedCudaScope(gpu_id):
            workspace.FeedBlob(core.ScopedName('data_stage2'), data_stage2)
            workspace.FeedBlob(core.ScopedName('gt_label_stage2'), gt_label)

        # print("workspace.RunNet(model.net.Proto().name)")
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)

        # print("end of RunNet")
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # save train loss and metric
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)

    # Execute the training run
    checkpoints = detectron.utils.train.train_model()
    # Test the trained model
    if not args.skip_test:
        test_model(checkpoints['final'], args.multi_gpu_testing, args.opts)
            use_cudnn=False)
        print(
            'WARNING: This alexnet implementation cannot use CUDNN for some '
            'LRN-layer-related reason. If you can solve this problem, a PR '
            'is welcomed.'
        )
        softmax = create_alexnet(
            model, 'data', num_labels=1000, label=None, no_loss=True)
    else:
        raise NotImplementedError

    net_def = model.net.Proto()
    net_def.device_option.CopyFrom(device_opts)
    model.param_init_net.RunAllOnGPU(gpu_id=args.gpu, use_cudnn=True)
    workspace.CreateBlob('data')
    # workspace.CreateBlob('label')
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(net_def)
else:
    raise NotImplementedError('%s is not supported yet' % args.network)

t2 = time.time()
print('Finish loading model in %.4fs' % (t2 - t1))

t1 = time.time()
data_list = [
    np.random.uniform(
        -1, 1, (args.batch_size, 3, im_size, im_size)).astype(np.float32)
    for i in range(int(np.ceil(1.0 * args.n_sample / args.batch_size)))
]
def main(args):
    logger = logging.getLogger(__name__)
    merge_cfg_from_file(args.cfg)
    cfg.NUM_GPUS = 1
    assert_and_infer_cfg(cache_urls=False)
    import_detectron_ops()

    init_net = caffe2_pb2.NetDef()
    predict_net = caffe2_pb2.NetDef()
    with open(os.path.join(args.model_dir, "model_init.pb"), 'rb') as f:
        init_net.ParseFromString(f.read())
    with open(os.path.join(args.model_dir, "model.pb"), 'rb') as f:
        predict_net.ParseFromString(f.read())

    workspace.ResetWorkspace()
    workspace.RunNetOnce(init_net)
    for op in predict_net.op:
        for blob_in in op.input:
            if not workspace.HasBlob(blob_in):
                workspace.CreateBlob(blob_in)
    logger.info('Operators Are Loaded')
    workspace.CreateNet(predict_net)
    logger.info('Predictor Net Created')

    assert not cfg.MODEL.RPN_ONLY, \
        'RPN models are not supported'
    assert not cfg.TEST.PRECOMPUTED_PROPOSALS, \
        'Models that require precomputed proposals are not supported'
    # model = infer_engine.initialize_model_from_cfg(args.weights)

    if os.path.isdir(args.im_or_folder):
        im_list = glob.iglob(args.im_or_folder + '/*.' + args.image_ext)
    else:
        im_list = [args.im_or_folder]

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    for i, im_name in enumerate(im_list):
        image_file_name = os.path.basename(im_name)
        if args.output_type == 'coco-json':
            ext = 'json'
        else:
            ext = 'xml'
        out_name = os.path.join(
            args.output_dir,
            '{}'.format(os.path.splitext(image_file_name)[0] + '.' + ext)
        )
        logger.info('Processing {} -> {}'.format(im_name, out_name))
        im = cv2.imread(im_name)
        im_h, im_w, img_c = im.shape
        scale_factor = 1.0
        if im_h < im_w:
            if im_h > MAX_DIMENSION_SHORT_SIDE:
                scale_factor = float(MAX_DIMENSION_SHORT_SIDE) / float(im_h)
        else:
            if im_w > MAX_DIMENSION_SHORT_SIDE:
                scale_factor = float(MAX_DIMENSION_SHORT_SIDE) / float(im_w)
        if scale_factor != 1.0:
            im = cv2.resize(im, (int(round(float(im_w) * scale_factor)),
                                 int(round(float(im_h) * scale_factor))))

        timers = defaultdict(Timer)
        t = time.time()
        cls_boxes, cls_segms, cls_keyps = im_detect_all(
            workspace, predict_net, im, None, timers=timers)
        logger.info('Inference time: {:.3f}s'.format(time.time() - t))
        for k, v in timers.items():
            logger.info(' | {}: {:.3f}s'.format(k, v.average_time))
        if i == 0:
            logger.info(
                ' \ Note: inference on the first image will be slower than the '
                'rest (caches and auto-tuning need to warm up)'
            )

        if isinstance(cls_boxes, list):
            (boxes, segms, keyps, classes) = convert_from_cls_format(
                cls_boxes, cls_segms, cls_keyps)

            m = re.match('([0-9]{9})_([0-9]{5})', image_file_name)
            dgs_str = m.group(1)
            img_num_str = m.group(2)
            image_url = ('https://das.familysearch.org/das/v2/dgs:' + dgs_str
                         + '.' + dgs_str + '_' + img_num_str + '/$dist')

            bboxes = make_bboxes(boxes)
            logger.info('{} lines found'.format(len(bboxes)))

            if args.output_type == 'coco-json':
                data = new_json()
                image = {
                    'license': 1,
                    'file_name': image_file_name,
                    'coco_url': image_url,
                    'height': im_h,
                    'width': im_w,
                    'date_captured': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'flickr_url': image_url,
                    'id': 1
                }
                data['images'].append(image)
            else:
                data = new_xml()
                page = SubElement(data, 'Page')
                page.set('imageFilename', image_file_name)
                page.set('imageWidth', str(im_w))
                page.set('imageHeight', str(im_h))
                tr = SubElement(page, 'TextRegion')
                tr.set('id', 'region0')
                coords = SubElement(tr, 'Coords')
                coords.set('points',
                           '0,0 {},0 {},{} 0,{}'.format(im_w, im_w, im_h, im_h))

            next_annotation_id = 1
            next_line_id = 1
            next_sep_id = 1
            next_ld_id = 1
            next_gra_id = 1
            j = 0
            if segms is not None:
                for segm in segms:
                    mask = mask_util.decode(segm)
                    contours = measure.find_contours(mask, 0.5)
                    segmentation = []
                    (bbox, score1) = bboxes[j]
                    score2 = boxes[j, -1]
                    logging.debug('score1: {} score2: {}'.format(score1, score2))
                    if score2 >= args.thresh:
                        bbox = [x / scale_factor for x in bbox]
                        for contour in contours:
                            contour = np.flip(contour, axis=1)
                            seg = contour.ravel().tolist()
                            seg = [x / scale_factor for x in seg]
                            segmentation.append(seg)
                        if args.output_type == 'coco-json':
                            area = calc_area(segmentation)
                            annotation = {
                                'segmentation': segmentation,
                                'score': float(score2),
                                'area': area,
                                'iscrowd': 0,
                                'image_id': 1,
                                'bbox': bbox,
                                'category_id': classes[j],
                                'id': next_annotation_id
                            }
                            data['annotations'].append(annotation)
                            next_annotation_id += 1
                        else:
                            if (_r_category_map[classes[j]] == 'handwritten-cursive'
                                    or _r_category_map[classes[j]] == 'printed'):
                                elem = SubElement(tr, 'TextLine')
                                elem.set('production', _r_category_map[classes[j]])
                                elem.set('id', 'tl' + str(next_line_id))
                                next_line_id += 1
                            elif _r_category_map[classes[j]] == 'separator':
                                elem = SubElement(page, 'SeparatorRegion')
                                elem.set('id', 'sr' + str(next_sep_id))
                                next_sep_id += 1
                            elif _r_category_map[classes[j]] == 'line-drawing':
                                elem = SubElement(page, 'LineDrawingRegion')
                                elem.set('id', 'ldr' + str(next_ld_id))
                                next_ld_id += 1
                            else:  # graphic
                                elem = SubElement(page, 'GraphicRegion')
                                elem.set('id', 'gr' + str(next_gra_id))
                                next_gra_id += 1
                            coord_string = convert_to_xml_coords(segmentation)
                            coords = SubElement(elem, 'Coords')
                            coords.set('points', coord_string)
                    else:
                        logging.info(
                            'Not keeping line with confidence {} below '
                            'threshold of {}'.format(score2, args.thresh))
                    j += 1

            with open(out_name, 'w') as outfile:
                if args.output_type == 'coco-json':
                    # pdb.set_trace()
                    json.dump(data, outfile, indent=4)
                else:
                    outfile.write(prettify(data))
        else:
            logger.info('Nothing found in image {}'.format(image_file_name))