def loadCheckpoint(self):
    opts = self.opts
    previous_checkpoint = opts['temp_var']['checkpoint_model']
    pretrained_model = opts['temp_var']['pretrained_model']
    num_xpus = opts['distributed']['num_xpus']
    if previous_checkpoint is not None:
        if os.path.exists(previous_checkpoint):
            log.info(
                'Load previous checkpoint:{}'.format(previous_checkpoint))
            start_epoch, prev_checkpointed_lr, _best_metric = \
                checkpoint.initialize_params_from_file(
                    model=self.train_model,
                    weights_file=previous_checkpoint,
                    num_xpus=num_xpus,
                    opts=opts,
                    broadcast_computed_param=True,
                    reset_epoch=False,
                )
    elif pretrained_model is not None and os.path.exists(pretrained_model):
        log.info("Load pretrained model: {}".format(pretrained_model))
        start_epoch, prev_checkpointed_lr, best_metric = \
            checkpoint.initialize_params_from_file(
                model=self.train_model,
                weights_file=pretrained_model,
                num_xpus=num_xpus,
                opts=opts,
                broadcast_computed_param=True,
                reset_epoch=opts['model_param']['reset_epoch'],
            )
    data_parallel_model.FinalizeAfterCheckpoint(self.train_model)
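# Note on the structure read by loadCheckpoint above: only a handful of fields
# of `opts` are consulted. The sketch below is illustrative only -- the key
# names are taken from the function itself, the values are made-up assumptions.
example_opts = {
    'temp_var': {
        'checkpoint_model': None,           # path to a previous checkpoint, or None
        'pretrained_model': 'model_0.mdl',  # fallback weights when no checkpoint exists
    },
    'distributed': {
        'num_xpus': 8,                      # devices the loaded params are broadcast to
    },
    'model_param': {
        'reset_epoch': True,                # restart epoch counting for pretrained weights
    },
}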
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) assert \ epoch_iters > 0, \ "Epoch size must be larger than batch size times shard count" args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( name="ban-pc-resnet50", arg_scope=train_arg_scope ) num_shards = args.num_shards shard_id = args.shard_id # Expect interfaces to be comma separated. # Use of multiple network interfaces is not yet complete, # so simply use the first one in the list. interfaces = args.distributed_interfaces.split(",") # Rendezvous using MPI when run with mpirun if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict( kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, ) ) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, ) ) rendezvous = dict( kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None # Model configs for constructing model with open(args.model_config) as f: model_config = yaml.load(f) # Model building functions def create_target_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = add_se_model(model, model_config, "data", is_test=False) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') loss = add_softmax_loss(model, pred, 'label') brew.accuracy(model, ['softmax', 'label'], 'accuracy') return [loss] def add_optimizer(model): ''' stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1 ) ''' 
optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd( model, base_learning_rate = args.base_learning_rate, momentum = model_config['solver']['momentum'], nesterov = model_config['solver']['nesterov'], policy = model_config['solver']['lr_policy'], power = model_config['solver']['power'], max_iter = model_config['solver']['max_iter'], ) return opt # Define add_image_input function. # Depends on the "train_data" argument. # Note that the reader will be shared with between all GPUS. reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=False, ) def add_post_sync_ops(model): """Add ops applied after initial parameter sync.""" for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT] ) # Create parallelized model data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_target_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=False, cpu_device=args.use_cpu, shared_model=args.use_cpu, combine_spatial_bn=args.use_cpu, ) if args.model_parallel: # Shift half of the activations to another GPU assert workspace.NumCudaDevices() >= 2 * args.num_gpus activations = data_parallel_model_utils.GetActivationBlobs(train_model) data_parallel_model_utils.ShiftActivationDevices( train_model, activations=activations[len(activations) // 2:], shifts={g: args.num_gpus + g for g in range(args.num_gpus)}, ) data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( name="ban-pc-resnet50_test", arg_scope=test_arg_scope, init_params=False ) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_target_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. 
load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "log/{}/resnet50_gpu{}_b{}_L{}_lr{:.2f}_v2".format( args.dataset_name, args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Load pretrained param_init_net load_init_net_multigpu(args) # Run the training one epoch a time best_accuracy = 0 while epoch < args.num_epochs: epoch, best_accuracy = RunEpoch( args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog, best_accuracy, ) # Save the model for each epoch SaveModel(args, train_model, epoch) model_path = "%s/%s_" % ( args.file_store_path, args.save_model_name ) # remove the saved model from the previous epoch if it exists if os.path.isfile(model_path + str(epoch - 1) + ".mdl"): os.remove(model_path + str(epoch - 1) + ".mdl")
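# A minimal, self-contained sketch of the epoch-size rounding used in Train()
# above: the epoch size is rounded down to a whole number of global batches so
# every shard runs the same number of iterations. The helper name is hypothetical.
def round_epoch_size(epoch_size, total_batch_size, num_shards):
    global_batch_size = total_batch_size * num_shards
    epoch_iters = epoch_size // global_batch_size
    assert epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"
    return epoch_iters * global_batch_size

# Example: round_epoch_size(1281167, 256, 2) == 1281024 (2502 iterations per shard)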
def Train(args): if args.model == "resnext": model_name = "resnext" + str(args.num_layers) elif args.model == "shufflenet": model_name = "shufflenet" # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Verify valid image mean/std per channel if args.image_mean_per_channel: assert \ len(args.image_mean_per_channel) == args.num_channels, \ "The number of channels of image mean doesn't match input" if args.image_std_per_channel: assert \ len(args.image_std_per_channel) == args.num_channels, \ "The number of channels of image std doesn't match input" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) assert \ epoch_iters > 0, \ "Epoch size must be larger than batch size times shard count" args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object if args.use_ideep: train_arg_scope = { 'use_cudnn': False, 'cudnn_exhaustive_search': False, 'training_mode': 1 } else: train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( name=model_name, arg_scope=train_arg_scope ) num_shards = args.num_shards shard_id = args.shard_id # Expect interfaces to be comma separated. # Use of multiple network interfaces is not yet complete, # so simply use the first one in the list. 
interfaces = args.distributed_interfaces.split(",") # Rendezvous using MPI when run with mpirun if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict( kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, ) ) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, ) ) rendezvous = dict( kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None # Model building functions def create_resnext_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = resnet.create_resnext( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, num_layers=args.num_layers, num_groups=args.resnext_num_groups, num_width_per_group=args.resnext_width_per_group, no_bias=True, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def create_shufflenet_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = shufflenet.create_shufflenet( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) if args.float16_compute: # TODO: merge with multi-precision optimizer opt = optimizer.build_fp16_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, weight_decay=args.weight_decay, # weight decay included policy="step", stepsize=stepsz, gamma=0.1 ) else: optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1 ) return opt # Define add_image_input function. 
# Depends on the "train_data" argument. # Note that the reader will be shared with between all GPUS. if args.train_data == "null": def add_image_input(model): AddNullInput( model, None, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) else: reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=False, mean_per_channel=args.image_mean_per_channel, std_per_channel=args.image_std_per_channel, ) def add_post_sync_ops(model): """Add ops applied after initial parameter sync.""" for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT] ) data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnext_model_ops if args.model == "resnext" else create_shufflenet_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=False, use_nccl=args.use_nccl, cpu_device=args.use_cpu, ideep=args.use_ideep, shared_model=args.use_cpu, combine_spatial_bn=args.use_cpu, ) data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") if args.use_ideep: test_arg_scope = { 'use_cudnn': False, 'cudnn_exhaustive_search': False, } else: test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( name=model_name + "_test", arg_scope=test_arg_scope, init_params=False, ) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, mean_per_channel=args.image_mean_per_channel, std_per_channel=args.image_std_per_channel, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_resnext_model_ops if args.model == "resnext" else create_shufflenet_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, use_nccl=args.use_nccl, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model, args.use_ideep) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. 
load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % ( model_name, args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Run the training one epoch a time while epoch < args.num_epochs: epoch = RunEpoch( args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog ) # Save the model for each epoch SaveModel(args, train_model, epoch, args.use_ideep) model_path = "%s/%s_" % ( args.file_store_path, args.save_model_name ) # remove the saved model from the previous epoch if it exists if os.path.isfile(model_path + str(epoch - 1) + ".mdl"): os.remove(model_path + str(epoch - 1) + ".mdl")
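# Sketch of the schedule produced by add_optimizer above: SGD with a "step"
# policy that multiplies the learning rate by gamma every `stepsz` iterations,
# where stepsz corresponds to 30 epochs. This helper is illustrative only and
# not part of the trainer; the max(1, ...) guard is an addition for safety.
def step_lr(base_lr, cur_iter, epoch_size, total_batch_size, num_shards,
            gamma=0.1, epochs_per_step=30):
    stepsz = max(1, int(epochs_per_step * epoch_size / total_batch_size / num_shards))
    return base_lr * gamma ** (cur_iter // stepsz)

# With base_lr=0.1 the rate drops to 0.01 after 30 epochs and 0.001 after 60.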
def Train(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Modify to make it consistent with the distributed trainer
    total_batch_size = args.batch_size * num_gpus
    batch_per_device = args.batch_size

    # Round down epoch size to closest multiple of batch size across machines
    epoch_iters = int(args.epoch_size / total_batch_size)
    args.epoch_size = epoch_iters * total_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create CNNModelHelper object
    train_model = cnn.CNNModelHelper(
        order="NCHW",
        name='{}_train'.format(args.model_name),
        use_cudnn=(True if args.use_cudnn == 1 else False),
        cudnn_exhaustive_search=True,
        ws_nbytes_limit=(args.cudnn_workspace_limit_mb * 1024 * 1024),
    )

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            pred_layer_name=args.pred_layer_name,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            conv1_temporal_stride=args.conv1_temporal_stride,
            conv1_temporal_kernel=args.conv1_temporal_kernel,
            use_pool1=args.use_pool1,
            audio_input_3d=args.audio_input_3d,
            g_blend=args.g_blend,
            audio_weight=args.audio_weight,
            visual_weight=args.visual_weight,
            av_weight=args.av_weight,
        )

    # SGD
    def add_parameter_update_ops(model):
        model.AddWeightDecay(args.weight_decay)
        ITER = model.Iter("ITER")
        stepsz = args.step_epoch * args.epoch_size / args.batch_size / num_gpus
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=args.base_learning_rate * num_gpus,
            policy="step",
            stepsize=int(stepsz),
            gamma=args.gamma,
        )
        AddMomentumParameterUpdate(model, LR)

    # Input. Note that the reader must be shared with all GPUs.
    train_reader, train_examples = reader_utils.create_data_reader(
        train_model,
        name="train_reader",
        input_data=args.train_data,
    )
    log.info("Training set has {} examples".format(train_examples))

    def add_video_input(model):
        model_helper.AddVideoInput(
            model,
            train_reader,
            batch_size=batch_per_device,
            length_rgb=args.clip_length_rgb,
            clip_per_video=1,
            random_mirror=True,
            decode_type=0,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            video_res_type=args.video_res_type,
            short_edge=min(args.scale_h, args.scale_w),
            num_decode_threads=args.num_decode_threads,
            do_multi_label=args.multi_label,
            num_of_class=args.num_labels,
            random_crop=True,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0 or args.input_type >= 3),
            get_optical_flow=(args.input_type == 1 or args.input_type >= 4),
            get_logmels=(args.input_type >= 2),
            get_video_id=args.get_video_id,
            jitter_scales=[int(n) for n in args.jitter_scales.split(',')],
            use_local_file=args.use_local_file,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_video_input,
        forward_pass_builder_fun=create_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        rendezvous=None,
        net_type=('prof_dag' if args.profiling == 1 else 'dag'),
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net ----")
        test_model = cnn.CNNModelHelper(
            order="NCHW",
            name='{}_test'.format(args.model_name),
            use_cudnn=(True if args.use_cudnn == 1 else False),
            cudnn_exhaustive_search=True)
        test_reader, test_examples = reader_utils.create_data_reader(
            test_model,
            name="test_reader",
            input_data=args.test_data,
        )
        log.info("Testing set has {} examples".format(test_examples))

        def test_input_fn(model):
            model_helper.AddVideoInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                length_rgb=args.clip_length_rgb,
                clip_per_video=1,
                decode_type=0,
                random_mirror=False,
                random_crop=False,
                sampling_rate_rgb=args.sampling_rate_rgb,
                scale_h=args.scale_h,
                scale_w=args.scale_w,
                crop_size=args.crop_size,
                video_res_type=args.video_res_type,
                short_edge=min(args.scale_h, args.scale_w),
                num_decode_threads=args.num_decode_threads,
                do_multi_label=args.multi_label,
                num_of_class=args.num_labels,
                input_type=args.input_type,
                length_of=args.clip_length_of,
                sampling_rate_of=args.sampling_rate_of,
                frame_gap_of=args.frame_gap_of,
                do_flow_aggregation=args.do_flow_aggregation,
                flow_data_type=args.flow_data_type,
                get_rgb=(args.input_type == 0),
                get_optical_flow=(args.input_type == 1),
                get_video_id=args.get_video_id,
                use_local_file=args.use_local_file,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
            optimize_gradient_memory=True,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        if args.db_type == 'pickle':
            model_loader.LoadModelFromPickleFile(
                train_model,
                args.load_model_path,
                use_gpu=True,
                root_gpu_id=gpus[0]
            )
        else:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(
            train_model,
            GetCheckpointParams(train_model),
        )
        if args.is_checkpoint:
            # reset epoch. load_model_path should end with *_X.mdl,
            # where X is the epoch number
            last_str = args.load_model_path.split('_')[-1]
            if last_str.endswith('.mdl'):
                epoch = int(last_str[:-4])
                log.info("Reset epoch to {}".format(epoch))
            else:
                log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f" % (
        args.model_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, 1, expname, explog)
        # Save the model for each epoch
        SaveModel(args, train_model, epoch)
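# Minimal sketch of the "reset epoch" convention used above: the epoch number
# is parsed from a checkpoint named like <save_model_name>_<epoch>.mdl. The
# function is hypothetical; the original logs a warning instead of returning 0.
def epoch_from_model_path(load_model_path):
    last_str = load_model_path.split('_')[-1]
    if last_str.endswith('.mdl'):
        return int(last_str[:-4])
    return 0  # name does not match the *_X.mdl pattern

# epoch_from_model_path("checkpoints/r2plus1d_12.mdl") == 12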
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create CNNModelHelper object
    train_model = cnn.CNNModelHelper(
        order="NCHW",
        name="resnet50",
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        ws_nbytes_limit=(args.cudnn_workspace_limit_mb * 1024 * 1024),
    )

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                ))
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.Accuracy([softmax, "label"], "accuracy")
        return [loss]

    # SGD
    def add_parameter_update_ops(model):
        model.AddWeightDecay(args.weight_decay)
        ITER = model.Iter("ITER")
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=args.base_learning_rate,
            policy="step",
            stepsize=stepsz,
            gamma=0.1,
        )
        AddMomentumParameterUpdate(model, LR)

    # Input. Note that the reader must be shared with all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net ----")
        test_model = cnn.CNNModelHelper(
            order="NCHW",
            name="resnet50_test",
            use_cudnn=True,
            cudnn_exhaustive_search=True)
        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)
        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(
            train_model,
            GetCheckpointParams(train_model),
        )
        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)
        # Save the model for each epoch
        SaveModel(args, train_model, epoch)
        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
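# A sketch that factors out the rendezvous setup used by the trainers in this
# file. The operator names (RedisStoreHandlerCreate / FileStoreHandlerCreate)
# and the dict layout come from the code above; the helper itself and its
# defaults are assumptions, not part of the original scripts.
from caffe2.python import core, workspace

def make_rendezvous(num_shards, shard_id, redis_host=None, redis_port=6379,
                    file_store_path="/tmp", run_id="run0"):
    """Return the rendezvous dict expected by data_parallel_model, or None."""
    if num_shards <= 1:
        return None
    store_handler = "store_handler"
    if redis_host is not None:
        op = core.CreateOperator(
            "RedisStoreHandlerCreate", [], [store_handler],
            host=redis_host, port=redis_port, prefix=run_id)
    else:
        op = core.CreateOperator(
            "FileStoreHandlerCreate", [], [store_handler],
            path=file_store_path, prefix=run_id)
    workspace.RunOperatorOnce(op)
    return dict(kv_handler=store_handler, shard_id=shard_id,
                num_shards=num_shards, engine="GLOO", exit_nets=None)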
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="resnet101", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id interfaces = args.distributed_interfaces.split(",") if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict(kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: store_handler = "store_handler" if args.redis_host is not None: workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None def create_resnet101_model_ops(model, loss_scale): initializer = (pFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = resnet.create_resnet101( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, no_bias=True, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) if args.float16_compute: opt = optimizer.build_fp16_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, weight_decay=args.weight_decay, policy="step", stepsize=stepsz, gamma=0.1) else: optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1) return opt if args.train_data == "null": def add_image_input(model): AddNullInput( model, None, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) else: reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, 
dtype=args.dtype, is_test=False, ) def add_post_sync_ops(model): for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT]) data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnet101_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=False, cpu_device=args.use_cpu, shared_model=args.use_cpu, ) if args.model_parallel: activations = data_parallel_model_utils.GetActivationBlobs(train_model) data_parallel_model_utils.ShiftActivationDevices( train_model, activations=activations[len(activations) // 2:], shifts={g: args.num_gpus + g for g in range(args.num_gpus)}, ) data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="resnet101_test", arg_scope=test_arg_scope, init_params=False) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_resnet101_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) data_parallel_model.FinalizeAfterCheckpoint(train_model) last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "resnet101_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) while epoch < args.num_epochs: epoch = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # final save SaveModel(workspace, train_model)
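# Illustrative note on the model-parallel shift performed above: the second
# half of the activation blobs is moved to a partner GPU, and replica g maps
# to GPU num_gpus + g. The blob names and the 4-GPU setup below are made up.
num_gpus_example = 4
activations_example = ["act_%d" % i for i in range(8)]                 # hypothetical blobs
shifted_example = activations_example[len(activations_example) // 2:]  # second half moves
shifts_example = {g: num_gpus_example + g for g in range(num_gpus_example)}
# shifts_example == {0: 4, 1: 5, 2: 6, 3: 7}: replica g keeps its parameters on
# GPU g but computes the shifted activations on GPU num_gpus + g, which is why
# other Train variants in this file assert NumCudaDevices() >= 2 * num_gpus first.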
def extract_features(args): if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = range(args.num_gpus) num_gpus = args.num_gpus if num_gpus > 0: log.info("Running on GPUs: {}".format(gpus)) else: log.info("Running on CPU") my_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True } model = cnn.CNNModelHelper( name="Extract Features", **my_arg_scope ) video_input_args = dict( batch_size=args.batch_size, clip_per_video=args.clip_per_video, decode_type=args.decode_type, length_rgb=args.clip_length_rgb, sampling_rate_rgb=args.sampling_rate_rgb, scale_h=args.scale_h, scale_w=args.scale_w, crop_size=args.crop_size, video_res_type=args.video_res_type, short_edge=min(args.scale_h, args.scale_w), num_decode_threads=args.num_decode_threads, do_multi_label=args.multi_label, num_of_class=args.num_labels, random_mirror=False, random_crop=False, input_type=args.input_type, length_of=args.clip_length_of, sampling_rate_of=args.sampling_rate_of, frame_gap_of=args.frame_gap_of, do_flow_aggregation=args.do_flow_aggregation, flow_data_type=args.flow_data_type, get_rgb=args.input_type == 0, get_optical_flow=args.input_type == 1, get_video_id=args.get_video_id, get_start_frame=args.get_start_frame, use_local_file=args.use_local_file, crop_per_clip=args.crop_per_clip, ) reader_args = dict( name="extract_features" + '_reader', input_data=os.path.join(vmz_data.VMZ_DIR, args.test_data), ) reader, num_examples = reader_utils.create_data_reader( model, **reader_args ) def input_fn(model): model_helper.AddVideoInput( model, reader, **video_input_args) def create_model_ops(model, loss_scale): return model_builder.build_model( model=model, model_name=args.model_name, model_depth=args.model_depth, num_labels=args.num_labels, batch_size=args.batch_size, num_channels=args.num_channels, crop_size=args.crop_size, clip_length=( args.clip_length_of if args.input_type == 1 else args.clip_length_rgb ), loss_scale=loss_scale, is_test=1, multi_label=args.multi_label, channel_multiplier=args.channel_multiplier, bottleneck_multiplier=args.bottleneck_multiplier, use_dropout=args.use_dropout, use_convolutional_pred=args.use_convolutional_pred, use_pool1=args.use_pool1, ) if num_gpus > 0: data_parallel_model.Parallelize_GPU( model, input_builder_fun=input_fn, forward_pass_builder_fun=create_model_ops, param_update_builder_fun=None, # 'None' since we aren't training devices=gpus, optimize_gradient_memory=True, ) else: model._device_type = caffe2_pb2.CPU model._devices = [0] device_opt = core.DeviceOption(model._device_type, 0) with core.DeviceScope(device_opt): # Because our loaded models are named with "gpu_x", keep the naming for now. # TODO: Save model using `data_parallel_model.ExtractPredictorNet` # to extract the model for "gpu_0". 
It also renames # the input and output blobs by stripping the "gpu_x/" prefix with core.NameScope("{}_{}".format("gpu", 0)): input_fn(model) create_model_ops(model, 1.0) workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) if args.db_type == 'pickle': load_model_path = os.path.join(vmz_data.PRETRAINED_MODEL_DIR, args.load_model_path) model_loader.LoadModelFromPickleFile(model, load_model_path) elif args.db_type == 'minidb': if num_gpus > 0: model_helper.LoadModel(args.load_model_path, args.db_type) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): model_helper.LoadModel(args.load_model_path, args.db_type) else: log.warning("Unsupported db_type: {}".format(args.db_type)) data_parallel_model.FinalizeAfterCheckpoint(model) def fetchActivations(model, outputs, num_iterations): all_activations = {} for counter in range(num_iterations): workspace.RunNet(model.net.Proto().name) num_devices = 1 # default for cpu if num_gpus > 0: num_devices = num_gpus for g in range(num_devices): for output_name in outputs: blob_name = 'gpu_{}/'.format(g) + output_name activations = workspace.FetchBlob(blob_name) if output_name not in all_activations: all_activations[output_name] = [] all_activations[output_name].append(activations) if counter % 20 == 0: log.info('{}/{} iterations'.format(counter, num_iterations)) # each key holds a list of activations obtained from each minibatch. # we now concatenate these lists to get the final arrays. # concatenating during the loop requires a realloc and can get slow. for key in all_activations: all_activations[key] = np.concatenate(all_activations[key]) return all_activations outputs = [name.strip() for name in args.features.split(',')] assert len(outputs) > 0 if args.num_iterations > 0: num_iterations = args.num_iterations else: if num_gpus > 0: examples_per_iteration = args.batch_size * num_gpus else: examples_per_iteration = args.batch_size num_iterations = int(num_examples / examples_per_iteration) activations = fetchActivations(model, outputs, num_iterations) # saving extracted features for index in range(len(outputs)): log.info( "Read '{}' with shape {}".format( outputs[index], activations[outputs[index]].shape ) ) output_path = os.path.join(vmz_data.VMZ_FEATURE_DIR, args.output_path) if not os.path.exists(os.path.dirname(output_path)): os.makedirs(os.path.dirname(output_path)) log.info('Writing to {}'.format(output_path)) if args.save_h5: with h5py.File(output_path, 'w') as handle: for name, activation in activations.items(): handle.create_dataset(name, data=activation) else: with open(output_path, 'wb') as handle: pickle.dump(activations, handle) # perform sanity check if args.sanity_check == 1: # check clip accuracy assert args.multi_label == 0 clip_acc = 0 softmax = activations['softmax'] label = activations['label'] for i in range(len(softmax)): sorted_preds = \ np.argsort(softmax[i]) sorted_preds[:] = sorted_preds[::-1] if sorted_preds[0] == label[i]: clip_acc += 1 log.info('Sanity check --- clip accuracy: {}'.format( clip_acc / len(softmax)) ) elif args.sanity_check == 2: # check auc assert args.multi_label == 1 prob = activations['prob'] label = activations['label'] mean_auc, mean_ap, mean_wap, _ = metric.mean_ap_metric(prob, label) log.info('Sanity check --- AUC: {}, mAP: {}, mWAP: {}'.format( mean_auc, mean_ap, mean_wap) )
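# A self-contained sketch of the clip-accuracy sanity check above, with random
# data standing in for the fetched activations. argmax over the softmax row is
# equivalent to the argsort-and-reverse used in the original.
import numpy as np

softmax_demo = np.random.rand(16, 101)            # 16 clips, 101 classes (made up)
label_demo = np.random.randint(0, 101, size=16)
clip_acc_demo = float(np.sum(np.argmax(softmax_demo, axis=1) == label_demo)) / len(softmax_demo)
print("clip accuracy: {:.3f}".format(clip_acc_demo))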
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) assert \ epoch_iters > 0, \ "Epoch size must be larger than batch size times shard count" args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="resnet50", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id # Expect interfaces to be comma separated. # Use of multiple network interfaces is not yet complete, # so simply use the first one in the list. interfaces = args.distributed_interfaces.split(",") # Rendezvous using MPI when run with mpirun if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict(kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None # Model building functions # def create_resnet50_model_ops(model, loss_scale): # initializer = (PseudoFP16Initializer if args.dtype == 'float16' # else Initializer) # with brew.arg_scope([brew.conv, brew.fc], # WeightInitializer=initializer, # BiasInitializer=initializer, # enable_tensor_core=args.enable_tensor_core, # float16_compute=args.float16_compute): # pred = resnet.create_resnet50( # #args.layers, # model, # "data", # num_input_channels=args.num_channels, # num_labels=args.num_labels, # no_bias=True, # no_loss=True, # ) # if args.dtype == 'float16': # pred = model.net.HalfToFloat(pred, pred + '_fp32') # softmax, loss = model.SoftmaxWithLoss([pred, 'label'], # ['softmax', 'loss']) # loss = model.Scale(loss, scale=loss_scale) # brew.accuracy(model, [softmax, "label"], "accuracy") # return [loss] def create_model_ops(model, loss_scale): return create_model_ops_testable(model, loss_scale, is_test=False) def create_model_ops_test(model, loss_scale): return create_model_ops_testable(model, loss_scale, 
is_test=True) # Model building functions def create_model_ops_testable(model, loss_scale, is_test=False): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): if args.model == "cifar10": if args.image_size != 32: log.warn("Cifar10 expects a 32x32 image.") pred = models.cifar10.create_cifar10( model, "data", image_channels=args.num_channels, num_classes=args.num_labels, image_height=args.image_size, image_width=args.image_size, ) elif args.model == "resnet32x32": if args.image_size != 32: log.warn("ResNet32x32 expects a 32x32 image.") pred = models.resnet.create_resnet32x32( model, "data", num_layers=args.num_layers, num_input_channels=args.num_channels, num_labels=args.num_labels, is_test=is_test) elif args.model == "resnet": if args.image_size != 224: log.warn( "ResNet expects a 224x224 image. input image = %d" % args.image_size) pred = resnet.create_resnet50( #args.layers, model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, no_bias=True, no_loss=True, ) elif args.model == "vgg": if args.image_size != 224: log.warn("VGG expects a 224x224 image.") pred = vgg.create_vgg(model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, num_layers=args.num_layers, is_test=is_test) elif args.model == "googlenet": if args.image_size != 224: log.warn("GoogLeNet expects a 224x224 image.") pred = googlenet.create_googlenet( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, is_test=is_test) elif args.model == "alexnet": if args.image_size != 224: log.warn("Alexnet expects a 224x224 image.") pred = alexnet.create_alexnet( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, is_test=is_test) elif args.model == "alexnetv0": if args.image_size != 224: log.warn("Alexnet v0 expects a 224x224 image.") pred = alexnet.create_alexnetv0( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, is_test=is_test) else: raise NotImplementedError("Network {} not found.".format( args.model)) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) if args.float16_compute: # TODO: merge with multi-prceision optimizer opt = optimizer.build_fp16_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, weight_decay=args.weight_decay, # weight decay included policy="step", stepsize=stepsz, gamma=0.1) else: optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1) print("info:===============================" + str(opt)) return opt # Define add_image_input function. # Depends on the "train_data" argument. # Note that the reader will be shared with between all GPUS. 
if args.train_data == "null": def add_image_input(model): AddNullInput( model, None, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) else: reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=False, ) def add_post_sync_ops(model): """Add ops applied after initial parameter sync.""" for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT]) # Create parallelized model data_parallel_model.Parallelize(train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=False, cpu_device=args.use_cpu, shared_model=args.use_cpu, combine_spatial_bn=args.use_cpu, use_nccl=args.use_nccl) if args.model_parallel: # Shift half of the activations to another GPU assert workspace.NumCudaDevices() >= 2 * args.num_gpus activations = data_parallel_model_utils.GetActivationBlobs(train_model) data_parallel_model_utils.ShiftActivationDevices( train_model, activations=activations[len(activations) // 2:], shifts={g: args.num_gpus + g for g in range(args.num_gpus)}, ) data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) if "GLOO_ALGORITHM" in os.environ and os.environ[ "GLOO_ALGORITHM"] == "PHUB": #i need to communicate to PHub about the elements that need aggregation, #as well as their sizes. #at this stage, all i need is the name of keys and my key ID. grad_names = list(reversed(train_model._grad_names)) phubKeyNames = ["allreduce_{}_status".format(x) for x in grad_names] caffe2GradSizes = dict( zip([ data_parallel_model.stripBlobName(name) + "_grad" for name in train_model._parameters_info.keys() ], [x.size for x in train_model._parameters_info.values()])) phubKeySizes = [str(caffe2GradSizes[x]) for x in grad_names] if rendezvous["shard_id"] == 0: #only id 0 needs to send to rendezvous. 
r = redis.StrictRedis() #foreach key, I need to assign an ID joinedStr = ",".join(phubKeyNames) r.set("[PLink]IntegrationKeys", joinedStr) joinedStr = ",".join(phubKeySizes) r.set("[PLink]IntegrationKeySizes", joinedStr) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="resnet50_test", arg_scope=test_arg_scope, init_params=False) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_model_ops_test, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Run the training one epoch a time while epoch < args.num_epochs: epoch = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # Save the model for each epoch SaveModel(args, train_model, epoch) model_path = "%s/%s_" % (args.file_store_path, args.save_model_name) # remove the saved model from the previous epoch if it exists if os.path.isfile(model_path + str(epoch - 1) + ".mdl"): os.remove(model_path + str(epoch - 1) + ".mdl")
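# Sketch of the PHub key registration performed by shard 0 above. The key names
# and sizes below are placeholders, and a Redis server is assumed to be running
# at the default host/port; only the "[PLink]..." key names come from the code.
import redis

phub_key_names_demo = ["allreduce_conv1_w_grad_status", "allreduce_pred_w_grad_status"]
phub_key_sizes_demo = ["9408", "2048000"]  # per-gradient element counts, as strings

r_demo = redis.StrictRedis()
r_demo.set("[PLink]IntegrationKeys", ",".join(phub_key_names_demo))
r_demo.set("[PLink]IntegrationKeySizes", ",".join(phub_key_sizes_demo))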
def Train(args): subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S') save_dir = os.path.join(args.file_store_path, subdir) if not os.path.exists(save_dir): # Create the model directory if it doesn't exist os.mkdir(save_dir) # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( name="sphereface", arg_scope=train_arg_scope ) num_shards = args.num_shards shard_id = args.shard_id # Expect interfaces to be comma separated. # Use of multiple network interfaces is not yet complete, # so simply use the first one in the list. interfaces = args.distributed_interfaces.split(",") # Rendezvous using MPI when run with mpirun if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict( kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, ) ) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, ) ) rendezvous = dict( kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None # Model building functions def create_sphereface_model_ops(model, loss_scale): initializer = (pFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core): pred = sphereface.create_net( model, "data", "label", in_dim=args.num_channels, class_num=args.num_labels, feature_dim=args.feature_dim, is_test=False, no_loss=True, fp16_data=True if args.dtype == 'float16' else False, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): # stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) stepsz = 1 
if args.dtype == 'float16': opt = optimizer.build_fp16_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, weight_decay=args.weight_decay, # weight decay included policy="step", stepsize=stepsz, gamma=0.9999 ) else: optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.9999 ) return opt # Define add_image_input function. # Depends on the "train_data" argument. # Note that the reader will be shared with between all GPUS. if args.train_data == "null": def add_image_input(model): AddNullInput( model, None, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) else: reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=False, ) def add_post_sync_ops(model): """Add ops applied after initial parameter sync.""" for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT] ) # Create parallelized model data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_sphereface_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=True, cpu_device=args.use_cpu, shared_model=args.use_cpu, ) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( name="sphereface_test", arg_scope=test_arg_scope, init_params=False ) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_sphereface_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) graph = net_drawer2.GetPydotGraphMinimal(test_model.net.Proto(), "sphereface", rankdir="TB") graph.write(os.path.join(save_dir, "sphereface.pdf"), format='pdf') epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. 
load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "sphereface_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(os.path.join(save_dir, expname), args) kernel_fig, plt_kernel = plt.subplots(nrows=4, ncols=5, figsize=(14, 14)) loss_fig, plt_loss = plt.subplots(1) plt.tight_layout(h_pad=0, w_pad=0) plt.ion() iterations = 0 old_x = 0 old_loss = 0 old_acc = 0 while epoch < args.num_epochs or args.test_data_type != 'VAL': epoch, epoch_loss, epoch_accuracy = RunEpoch( args, epoch, train_model, test_model, total_batch_size, num_shards, explog, plt_kernel ) x = list(range(iterations, iterations + len(epoch_loss))) x.insert(0, old_x) epoch_loss.insert(0, old_loss) epoch_accuracy.insert(0, old_acc) plt_loss.plot(x, epoch_loss, 'b') plt_loss.plot(x, epoch_accuracy, 'r') iterations += len(epoch_loss) old_x = iterations - 2 old_loss = epoch_loss[-1] old_acc = epoch_accuracy[-1] log.info('Save checkpoint {}'.format(epoch)) model_path = '{:s}/{:s}_{:d}.mdl'.format(save_dir, args.save_model_name, epoch) SaveModel(args, train_model, model_path) if DEBUG_TRAINING: kernel_fig_path = '%s/%s_%d.jpg' % (save_dir, 'activation', epoch) loss_fig_path = '%s/%s_%d.jpg' % (save_dir, 'loss', epoch) kernel_fig.savefig(kernel_fig_path) loss_fig.savefig(loss_fig_path)
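# The live loss/accuracy plotting above relies on plt.ion(), which needs an
# interactive matplotlib backend.  On a headless training node one would
# typically force a non-interactive backend before pyplot is imported; a small
# sketch (the original import section is not shown in this excerpt):
import matplotlib
matplotlib.use('Agg')            # render figures to files only; no display required
import matplotlib.pyplot as plt  # savefig() still works for the per-epoch figures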
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="resnet50", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id if num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", exit_nets=None) else: rendezvous = None # Model building functions def create_resnet50_model_ops(model, loss_scale): initializer = (pFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core): pred = resnet.create_resnet50( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, no_bias=True, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1) return opt # Input. Note that the reader must be shared with all GPUS. 
reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) def add_post_sync_ops(model): """Add ops applied after initial parameter sync.""" for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT]) # Create parallelized model data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnet50_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=True, cpu_device=args.use_cpu, ) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="resnet50_test", arg_scope=test_arg_scope, init_params=False) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_resnet50_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Run the training one epoch at a time while epoch < args.num_epochs: epoch = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # Save the model for each epoch SaveModel(args, train_model, epoch) model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
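# Worked example of the epoch-size rounding performed at the top of Train():
# with a per-machine batch of 256, 4 shards and a nominal epoch of 1,281,167
# images, the epoch is rounded down to a whole number of global batches.
total_batch_size = 256
num_shards = 4
epoch_size = 1281167

global_batch_size = total_batch_size * num_shards    # 1024 images per iteration
epoch_iters = epoch_size // global_batch_size         # 1251 iterations per epoch
epoch_size = epoch_iters * global_batch_size          # 1281024 images actually used
print(epoch_iters, epoch_size)                        # -> 1251 1281024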
def Test(args): if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = range(args.num_gpus) num_gpus = args.num_gpus if num_gpus > 0: total_batch_size = args.batch_size * num_gpus log.info("Running on GPUs: {}".format(gpus)) log.info("total_batch_size: {}".format(total_batch_size)) else: total_batch_size = args.batch_size log.info("Running on CPU") log.info("total_batch_size: {}".format(total_batch_size)) # Model building functions def create_model_ops(model, loss_scale): return model_builder.build_model( model=model, model_name=args.model_name, model_depth=args.model_depth, num_labels=args.num_labels, num_channels=args.num_channels, crop_size=args.crop_size, clip_length=( args.clip_length_of if args.input_type == 1 else args.clip_length_rgb ), loss_scale=loss_scale, is_test=1, pred_layer_name=args.pred_layer_name, ) test_model = cnn.CNNModelHelper( order="NCHW", name="video_model_test", use_cudnn=(True if args.use_cudnn == 1 else False), cudnn_exhaustive_search=True, ) test_reader, number_of_examples = model_builder.create_data_reader( test_model, name="test_reader", input_data=args.test_data, ) if args.num_iter <= 0: num_iter = int(number_of_examples / total_batch_size) else: num_iter = args.num_iter def test_input_fn(model): model_helper.AddVideoInput( test_model, test_reader, batch_size=args.batch_size, clip_per_video=args.clip_per_video, decode_type=1, length_rgb=args.clip_length_rgb, sampling_rate_rgb=args.sampling_rate_rgb, scale_h=args.scale_h, scale_w=args.scale_w, crop_size=args.crop_size, num_decode_threads=4, num_of_class=args.num_labels, random_mirror=False, random_crop=False, input_type=args.input_type, length_of=args.clip_length_of, sampling_rate_of=args.sampling_rate_of, frame_gap_of=args.frame_gap_of, do_flow_aggregation=args.do_flow_aggregation, flow_data_type=args.flow_data_type, get_rgb=(args.input_type == 0), get_optical_flow=(args.input_type == 1), get_video_id=args.get_video_id, use_local_file=args.use_local_file, ) if num_gpus > 0: data_parallel_model.Parallelize_GPU( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_model_ops, param_update_builder_fun=None, devices=gpus ) else: test_model._device_type = caffe2_pb2.CPU test_model._devices = [0] device_opt = core.DeviceOption(test_model._device_type, 0) with core.DeviceScope(device_opt): # Because our loaded models are named with "gpu_x", keep the naming for now. # TODO: Save model using `data_parallel_model.ExtractPredictorNet` # to extract the model for "gpu_0". 
It also renames # the input and output blobs by stripping the "gpu_x/" prefix with core.NameScope("{}_{}".format("gpu", 0)): test_input_fn(test_model) create_model_ops(test_model, 1.0) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) if args.db_type == 'minidb': if num_gpus > 0: model_helper.LoadModel(args.load_model_path, args.db_type) data_parallel_model.FinalizeAfterCheckpoint(test_model) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): model_helper.LoadModel(args.load_model_path, args.db_type) elif args.db_type == 'pickle': if num_gpus > 0: model_loader.LoadModelFromPickleFile( test_model, args.load_model_path, use_gpu=True, root_gpu_id=gpus[0] ) data_parallel_model.FinalizeAfterCheckpoint(test_model) else: model_loader.LoadModelFromPickleFile( test_model, args.load_model_path, use_gpu=False ) else: log.warning("Unsupported db_type: {}".format(args.db_type)) # metric counters for classification clip_acc = 0 video_top1 = 0 video_topk = 0 video_count = 0 clip_count = 0 for i in range(num_iter): workspace.RunNet(test_model.net.Proto().name) num_devices = 1 # default for cpu if args.num_gpus > 0: num_devices = args.num_gpus for g in range(num_devices): # get labels label = workspace.FetchBlob( "gpu_{}".format(g) + '/label' ) # get predictions predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax') assert predicts.shape[0] == args.batch_size * args.clip_per_video for j in range(args.batch_size): # get label for one video sample_label = label[j * args.clip_per_video] # get clip accuracy for k in range(args.clip_per_video): c1, _ = metric.accuracy_metric( predicts[j * args.clip_per_video + k, :], label[j * args.clip_per_video + k]) clip_acc = clip_acc + c1 # get all clip predictions for one video all_clips = predicts[ j * args.clip_per_video:(j + 1) * args.clip_per_video, :] # aggregate predictions into one video_pred = PredictionAggregation(all_clips, args.aggregation) c1, ck = metric.accuracy_metric( video_pred, sample_label, args.top_k) video_top1 = video_top1 + c1 video_topk = video_topk + ck video_count = video_count + args.batch_size clip_count = clip_count + label.shape[0] if i > 0 and i % args.display_iter == 0: log.info('Iter {}/{}: clip: {}, top1: {}, top 5: {}'.format( i, num_iter, clip_acc / clip_count, video_top1 / video_count, video_topk / video_count)) log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format( clip_acc / clip_count, video_top1 / video_count, args.top_k, video_topk / video_count )) if num_gpus > 0: flops, params = model_helper.GetFlopsAndParams(test_model, gpus[0]) else: flops, params = model_helper.GetFlopsAndParams(test_model) log.info('FLOPs: {}, params: {}'.format(flops, params))
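# PredictionAggregation() is defined elsewhere in these scripts; it collapses
# the per-clip scores of one video into a single video-level prediction.  A
# plausible minimal sketch (the mapping of args.aggregation values to pooling
# methods below is an assumption, not taken from the original code):
import numpy as np

def prediction_aggregation_sketch(clip_preds, method=0):
    """clip_preds: (num_clips, num_classes) array of per-clip softmax scores."""
    if method == 0:      # average pooling over clips
        return np.mean(clip_preds, axis=0)
    elif method == 1:    # max pooling over clips
        return np.max(clip_preds, axis=0)
    raise ValueError("unknown aggregation method: {}".format(method))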
def ExtractFeatures(args): if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = range(args.num_gpus) num_gpus = args.num_gpus if num_gpus > 0: log.info("Running on GPUs: {}".format(gpus)) else: log.info("Running on CPU") my_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True } model = cnn.CNNModelHelper(name="Extract Features", **my_arg_scope) reader, num_examples = model_builder.create_data_reader( model, name="reader", input_data=args.test_data, ) def input_fn(model): model_helper.AddVideoInput( model, reader, batch_size=args.batch_size, clip_per_video=args.clip_per_video, decode_type=args.decode_type, length_rgb=args.clip_length_rgb, sampling_rate_rgb=args.sampling_rate_rgb, scale_h=args.scale_h, scale_w=args.scale_w, crop_size=args.crop_size, num_decode_threads=args.num_decode_threads, num_of_class=args.num_labels, random_mirror=False, random_crop=False, input_type=args.input_type, length_of=args.clip_length_of, sampling_rate_of=args.sampling_rate_of, frame_gap_of=args.frame_gap_of, do_flow_aggregation=args.do_flow_aggregation, flow_data_type=args.flow_data_type, get_rgb=(args.input_type == 0), get_optical_flow=(args.input_type == 1), get_video_id=args.get_video_id, use_local_file=args.use_local_file, ) def create_model_ops(model, loss_scale): return model_builder.build_model( model=model, model_name=args.model_name, model_depth=args.model_depth, num_labels=args.num_labels, num_channels=args.num_channels, crop_size=args.crop_size, clip_length=(args.clip_length_of if args.input_type == 1 else args.clip_length_rgb), loss_scale=loss_scale, is_test=1, ) if num_gpus > 0: data_parallel_model.Parallelize_GPU( model, input_builder_fun=input_fn, forward_pass_builder_fun=create_model_ops, param_update_builder_fun=None, # 'None' since we aren't training devices=gpus, ) else: model._device_type = caffe2_pb2.CPU model._devices = [0] device_opt = core.DeviceOption(model._device_type, 0) with core.DeviceScope(device_opt): with core.NameScope("{}_{}".format("gpu", 0)): input_fn(model) create_model_ops(model, 1.0) workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) if args.db_type == 'minidb': if num_gpus > 0: model_helper.LoadModel(args.load_model_path, args.db_type) data_parallel_model.FinalizeAfterCheckpoint(model) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): model_helper.LoadModel(args.load_model_path, args.db_type) elif args.db_type == 'pickle': if num_gpus > 0: model_loader.LoadModelFromPickleFile(model, args.load_model_path, use_gpu=True, root_gpu_id=gpus[0]) else: model_loader.LoadModelFromPickleFile( model, args.load_model_path, use_gpu=False, ) else: log.warning("Unsupported db_type: {}".format(args.db_type)) def fetchActivations(model, outputs, num_iterations): all_activations = {} for counter in range(num_iterations): workspace.RunNet(model.net.Proto().name) num_devices = 1 # default for cpu for g in gpus if num_gpus > 0 else range(num_devices): for output_name in outputs: blob_name = 'gpu_{}/'.format(g) + output_name activations = workspace.FetchBlob(blob_name) if output_name not in all_activations: all_activations[output_name] = [] all_activations[output_name].append(activations) if counter % 20 == 0: log.info('{}/{} iterations'.format(counter, num_iterations)) # each key holds a list of activations obtained from each minibatch. # we now concatenate these lists to get the final arrays.
# concatenating during the loop requires a realloc and can get slow. for key in all_activations: all_activations[key] = np.concatenate(all_activations[key]) return all_activations outputs = [name.strip() for name in args.features.split(',')] assert len(outputs) > 0 if args.num_iterations > 0: num_iterations = args.num_iterations else: if num_gpus > 0: examples_per_iteration = args.batch_size * num_gpus else: examples_per_iteration = args.batch_size num_iterations = int(num_examples / examples_per_iteration) activations = fetchActivations(model, outputs, num_iterations) # saving extracted features for index in range(len(outputs)): log.info("Read '{}' with shape {}".format( outputs[index], activations[outputs[index]].shape)) if args.output_path: output_path = args.output_path else: output_path = os.path.dirname(args.test_data) + '/features.pickle' log.info('Writing to {}'.format(output_path)) with open(output_path, 'wb') as handle: pickle.dump(activations, handle) # perform sanity check if args.sanity_check == 1: # check clip accuracy clip_acc = 0 softmax = activations['softmax'] label = activations['label'] for i in range(len(softmax)): sorted_preds = \ np.argsort(softmax[i]) sorted_preds[:] = sorted_preds[::-1] if sorted_preds[0] == label[i]: clip_acc += 1 log.info('Sanity check --- clip accuracy: {}'.format(clip_acc / len(softmax)))
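# The features written above can be inspected offline.  A small usage sketch;
# the path below is the default chosen when --output_path is not given, and is
# only a placeholder here:
import pickle

with open('/path/to/test_data_dir/features.pickle', 'rb') as handle:
    activations = pickle.load(handle)

for name, arr in activations.items():
    print(name, arr.shape)   # e.g. 'softmax' -> (num_clips, num_labels)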
def Test(args): if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = range(args.num_gpus) num_gpus = args.num_gpus if num_gpus > 0: total_batch_size = args.batch_size * num_gpus log.info("Running on GPUs: {}".format(gpus)) log.info("total_batch_size: {}".format(total_batch_size)) else: total_batch_size = args.batch_size log.info("Running on CPU") log.info("total_batch_size: {}".format(total_batch_size)) video_input_args = dict( batch_size=args.batch_size, clip_per_video=args.clip_per_video, decode_type=1, length_rgb=args.clip_length_rgb, sampling_rate_rgb=args.sampling_rate_rgb, scale_h=args.scale_h, scale_w=args.scale_w, crop_size=args.crop_size, video_res_type=args.video_res_type, short_edge=min(args.scale_h, args.scale_w), num_decode_threads=args.num_decode_threads, do_multi_label=args.multi_label, num_of_class=args.num_labels, random_mirror=False, random_crop=False, input_type=args.input_type, length_of=args.clip_length_of, sampling_rate_of=args.sampling_rate_of, frame_gap_of=args.frame_gap_of, do_flow_aggregation=args.do_flow_aggregation, flow_data_type=args.flow_data_type, get_rgb=(args.input_type == 0 or args.input_type >= 3), get_optical_flow=(args.input_type == 1 or args.input_type >= 4), use_local_file=args.use_local_file, crop_per_clip=args.crop_per_clip, ) reader_args = dict( name="test_reader", input_data=args.test_data, ) # Model building functions def create_model_ops(model, loss_scale): return model_builder.build_model( model=model, model_name=args.model_name, model_depth=args.model_depth, num_labels=args.num_labels, batch_size=args.batch_size * args.clip_per_video * args.crop_per_clip, num_channels=args.num_channels, crop_size=args.crop_size, clip_length=(args.clip_length_of if args.input_type == 1 else args.clip_length_rgb), loss_scale=loss_scale, is_test=1, pred_layer_name=args.pred_layer_name, multi_label=args.multi_label, channel_multiplier=args.channel_multiplier, bottleneck_multiplier=args.bottleneck_multiplier, use_dropout=args.use_dropout, conv1_temporal_stride=args.conv1_temporal_stride, conv1_temporal_kernel=args.conv1_temporal_kernel, use_convolutional_pred=args.use_convolutional_pred, use_pool1=args.use_pool1, ) test_model = cnn.CNNModelHelper( order="NCHW", name="video_model_test", use_cudnn=(True if args.use_cudnn == 1 else False), cudnn_exhaustive_search=True, ) test_reader, number_of_examples = reader_utils.create_data_reader( test_model, **reader_args) if args.num_iter <= 0: num_iter = int(math.ceil(number_of_examples / total_batch_size)) else: num_iter = args.num_iter def test_input_fn(model): model_helper.AddVideoInput(test_model, test_reader, **video_input_args) if num_gpus > 0: data_parallel_model.Parallelize_GPU( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_model_ops, param_update_builder_fun=None, devices=gpus, optimize_gradient_memory=True, ) else: test_model._device_type = caffe2_pb2.CPU test_model._devices = [0] device_opt = core.DeviceOption(test_model._device_type, 0) with core.DeviceScope(device_opt): # Because our loaded models are named with "gpu_x", keep the naming for now. # TODO: Save model using `data_parallel_model.ExtractPredictorNet` # to extract the model for "gpu_0". 
It also renames # the input and output blobs by stripping the "gpu_x/" prefix with core.NameScope("{}_{}".format("gpu", 0)): test_input_fn(test_model) create_model_ops(test_model, 1.0) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) if args.db_type == 'minidb': if num_gpus > 0: model_helper.LoadModel(args.load_model_path, args.db_type) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): model_helper.LoadModel(args.load_model_path, args.db_type) elif args.db_type == 'pickle': if num_gpus > 0: model_loader.LoadModelFromPickleFile(test_model, args.load_model_path, use_gpu=True, root_gpu_id=gpus[0]) else: model_loader.LoadModelFromPickleFile(test_model, args.load_model_path, use_gpu=False) else: log.warning("Unsupported db_type: {}".format(args.db_type)) data_parallel_model.FinalizeAfterCheckpoint(test_model) # metric couters for multilabel all_prob_for_map = np.empty(shape=[0, args.num_labels], dtype=np.float) all_label_for_map = np.empty(shape=[0, args.num_labels], dtype=np.int32) # metric counters for closed-world classification clip_acc = 0 video_top1 = 0 video_topk = 0 video_count = 0 clip_count = 0 crop_per_video = args.clip_per_video * args.crop_per_clip for i in range(num_iter): workspace.RunNet(test_model.net.Proto().name) num_devices = 1 # default for cpu if num_gpus > 0: num_devices = num_gpus for g in range(num_devices): # get labels label = workspace.FetchBlob("gpu_{}".format(g) + '/label') # get predictions if args.multi_label: predicts = workspace.FetchBlob("gpu_{}".format(g) + '/prob') else: predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax') assert predicts.shape[0] == args.batch_size * crop_per_video for j in range(args.batch_size): # get label for one video if args.multi_label: sample_label = label[j * crop_per_video, :] else: sample_label = label[j * crop_per_video] # get clip accuracy for k in range(crop_per_video): sorted_preds = \ np.argsort(predicts[j * crop_per_video + k, :]) sorted_preds[:] = sorted_preds[::-1] if sorted_preds[0] == label[j * crop_per_video + k]: clip_acc = clip_acc + 1 # get all clip predictions for one video all_clips = \ predicts[ j * crop_per_video:(j + 1) * crop_per_video, : ] # aggregate predictions into one video_pred = PredictionAggregation(all_clips, args.aggregation) if args.multi_label: video_pred = np.expand_dims(video_pred, axis=0) sample_label = np.expand_dims(sample_label, axis=0) all_prob_for_map = np.concatenate( (all_prob_for_map, video_pred), axis=0) all_label_for_map = np.concatenate( (all_label_for_map, sample_label), axis=0) else: sorted_video_pred = np.argsort(video_pred) sorted_video_pred[:] = sorted_video_pred[::-1] if sorted_video_pred[0] == sample_label: video_top1 = video_top1 + 1 if sample_label in sorted_video_pred[0:args.top_k]: video_topk = video_topk + 1 video_count = video_count + args.batch_size clip_count = clip_count + label.shape[0] if i > 0 and i % args.display_iter == 0: if args.multi_label: # mAP auc, ap, wap, aps = metric.mean_ap_metric( all_prob_for_map, all_label_for_map) log.info( 'Iter {}/{}: mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}'. 
format(i, num_iter, auc, ap, wap, np.mean(aps))) else: # accuracy log.info('Iter {}/{}: clip: {}, top1: {}, top 5: {}'.format( i, num_iter, clip_acc / clip_count, video_top1 / video_count, video_topk / video_count)) if args.multi_label: # mAP auc, ap, wap, aps = metric.mean_ap_metric(all_prob_for_map, all_label_for_map) log.info("Test mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}".format( auc, ap, wap, np.mean(aps))) if args.print_per_class_metrics: log.info("Test mAP per class: {}".format(aps)) else: # accuracy log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format( clip_acc / clip_count, video_top1 / video_count, args.top_k, video_topk / video_count)) if num_gpus > 0: flops, params, inters = model_helper.GetFlopsAndParams( test_model, gpus[0]) else: flops, params, inters = model_helper.GetFlopsAndParams(test_model) log.info('FLOPs: {}, params: {}, inters: {}'.format(flops, params, inters))
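# The per-video top-1 / top-k bookkeeping in the loop above can be written as a
# small helper.  This sketch mirrors the same argsort-based logic for a single
# aggregated prediction; video_topk_hits is a hypothetical name, not from the
# original code.
import numpy as np

def video_topk_hits(video_pred, sample_label, top_k=5):
    """Return (top1_hit, topk_hit) as 0/1 for one aggregated video prediction."""
    sorted_pred = np.argsort(video_pred)[::-1]   # class indices, best first
    top1 = int(sorted_pred[0] == sample_label)
    topk = int(sample_label in sorted_pred[:top_k])
    return top1, topk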
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="mobilenet", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id if num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", exit_nets=None) else: rendezvous = None # Model building functions def create_mobilenet_model_ops(model, loss_scale): [softmax, loss] = mobilenet.create_mobilenet(model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, label="label") loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) optimizer.add_weight_decay(model, args.weight_decay) optimizer.build_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1) # Input. Note that the reader must be shared with all GPUS.
reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput(model, reader, batch_size=batch_per_device, img_size=args.image_size, is_test=False) # Create parallelized model data_parallel_model.Parallelize_GPU( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_mobilenet_model_ops, optimizer_builder_fun=add_optimizer, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=True, ) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) # # # save network graph # graph = net_drawer.GetPydotGraphMinimal( # train_model.net.Proto().op, "mobilenet", rankdir="LR", minimal_dependency=True) # with open("mobilenet.png", 'wb') as fid: # fid.write(graph.create_png()) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="mobilenet_test", arg_scope=test_arg_scope) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput(model, test_reader, batch_size=batch_per_device, img_size=args.image_size, is_test=True) data_parallel_model.Parallelize_GPU( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_mobilenet_model_ops, param_update_builder_fun=None, devices=gpus, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "mobilenet_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Run the training one epoch at a time while epoch < args.num_epochs: epoch = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # Save the model for each epoch SaveModel(args, train_model, epoch) model_path = "%s/%s_" % (args.file_store_path, args.save_model_name) # remove the saved model from the previous epoch if it exists if os.path.isfile(model_path + str(epoch - 1) + ".mdl"): os.remove(model_path + str(epoch - 1) + ".mdl")
def feature_extractor(load_model_path=None, test_data=None, gpu_list=None, num_gpus=0, batch_size=4, clip_per_video=1, decode_type=2, clip_length_rgb=4, sampling_rate_rgb=1, scale_h=128, scale_w=171, crop_size=112, video_res_type=0, num_decode_threads=4, multi_label=0, num_labels=101, input_type=0, clip_length_of=8, sampling_rate_of=2, frame_gap_of=2, do_flow_aggregation=0, flow_data_type=0, get_video_id=1, get_start_frame=0, use_local_file=1, crop_per_clip=1, db_type='pickle', model_name='r2plus1d', model_depth=18, num_channels=3, output_path=None, use_cudnn=1, layers='final_avg', num_iterations=1, channel_multiplier=1.0, bottleneck_multiplier=1.0, use_pool1=0, use_convolutional_pred=0, use_dropout=0, **kwargs): """ :param gpu_list: list of gpu ids to use :param batch_size: batch size :param clip_per_video: When clip_per_video > 1, sample this many clips uniformly in time :param decode_type: 0: random, 1: uniform sampling, 2: use starting frame :param clip_length_rgb: Length of input clips :param sampling_rate_rgb: Frame sampling rate :param scale_h: Scale image height to :param scale_w: Scale image width to :param crop_size: Input image size (to crop to) :param video_res_type: Video frame scaling option, 0: scaled by height x width; 1: scaled by short edge :param num_decode_threads: number of decoding threads :param multi_label: Multiple label csv file input :param num_labels: Number of labels :param input_type: 0=rgb, 1=optical flow :param clip_length_of: Frames of optical flow data :param sampling_rate_of: Sampling rate for optical flows :param frame_gap_of: Frame gap of optical flows :param do_flow_aggregation: whether to aggregate optical flow across multiple frames :param flow_data_type: 0=Flow2C, 1=Flow3C, 2=FlowWithGray, 3=FlowWithRGB :param get_video_id: Output video id :param get_start_frame: Output clip start frame :param use_local_file: use local file :param crop_per_clip: number of spatial crops per clip :param db_type: Db type of the testing model :param model_name: Model name :param model_depth: Model depth :param num_channels: Number of channels :param load_model_path: Load saved model for testing :param test_data: Path to the test data to extract features from :param output_path: Path to output pickle; if not given, the activations dict is returned instead :param use_cudnn: Use CuDNN :param layers: Blob name, or list of blob names, to fetch :param num_iterations: Run only this many iterations :param channel_multiplier: Channel multiplier :param bottleneck_multiplier: Bottleneck multiplier :param use_pool1: use pool1 layer :param use_convolutional_pred: using convolutional predictions :param use_dropout: Use dropout at the prediction layer """ if load_model_path is None or test_data is None: raise Exception('Model path AND test data need to be specified') # Initialize Caffe2 workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) if gpu_list is None: if num_gpus == 0: raise Exception('Must specify GPUs') else: gpus = [i for i in range(num_gpus)] else: gpus = gpu_list num_gpus = len(gpus) my_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True } model = cnn.CNNModelHelper(name="Extract features", **my_arg_scope) video_input_args = dict( batch_size=batch_size, clip_per_video=clip_per_video, decode_type=decode_type, length_rgb=clip_length_rgb, sampling_rate_rgb=sampling_rate_rgb, scale_h=scale_h, scale_w=scale_w, crop_size=crop_size, video_res_type=video_res_type, short_edge=min(scale_h, scale_w),
num_decode_threads=num_decode_threads, do_multi_label=multi_label, num_of_class=num_labels, random_mirror=False, random_crop=False, input_type=input_type, length_of=clip_length_of, sampling_rate_of=sampling_rate_of, frame_gap_of=frame_gap_of, do_flow_aggregation=do_flow_aggregation, flow_data_type=flow_data_type, get_rgb=input_type == 0, get_optical_flow=input_type == 1, get_video_id=get_video_id, get_start_frame=get_start_frame, use_local_file=use_local_file, crop_per_clip=crop_per_clip, ) reader_args = dict( name="extract_features" + '_reader', input_data=test_data, ) reader, num_examples = reader_utils.create_data_reader( model, **reader_args) def input_fn(model): model_helper.AddVideoInput(model, reader, **video_input_args) def create_model_ops(model, loss_scale): return model_builder.build_model( model=model, model_name=model_name, model_depth=model_depth, num_labels=num_labels, batch_size=batch_size, num_channels=num_channels, crop_size=crop_size, clip_length=(clip_length_of if input_type == 1 else clip_length_rgb), loss_scale=loss_scale, is_test=1, multi_label=multi_label, channel_multiplier=channel_multiplier, bottleneck_multiplier=bottleneck_multiplier, use_dropout=use_dropout, use_convolutional_pred=use_convolutional_pred, use_pool1=use_pool1, ) ## if num_gpus > 0: data_parallel_model.Parallelize_GPU( model, input_builder_fun=input_fn, forward_pass_builder_fun=create_model_ops, param_update_builder_fun=None, # 'None' since we aren't training devices=gpus, optimize_gradient_memory=True, ) else: model._device_type = caffe2_pb2.CPU model._devices = [0] device_opt = core.DeviceOption(model._device_type, 0) with core.DeviceScope(device_opt): # Because our loaded models are named with "gpu_x", keep the naming for now. # TODO: Save model using `data_parallel_model.ExtractPredictorNet` # to extract the model for "gpu_0". It also renames # the input and output blobs by stripping the "gpu_x/" prefix with core.NameScope("{}_{}".format("gpu", 0)): input_fn(model) create_model_ops(model, 1.0) workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) if db_type == 'pickle': model_loader.LoadModelFromPickleFile(model, load_model_path) elif db_type == 'minidb': if num_gpus > 0: model_helper.LoadModel(load_model_path, db_type) else: with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): model_helper.LoadModel(load_model_path, db_type) else: log.warning("Unsupported db_type: {}".format(db_type)) data_parallel_model.FinalizeAfterCheckpoint(model) ## def fetchActivations(model, outputs, num_iterations): all_activations = {} for counter in range(num_iterations): workspace.RunNet(model.net.Proto().name) num_devices = 1 # default for cpu if num_gpus > 0: num_devices = num_gpus for g in range(num_devices): for output_name in outputs: blob_name = 'gpu_{}/'.format(g) + output_name activations = workspace.FetchBlob(blob_name) if output_name not in all_activations: all_activations[output_name] = [] all_activations[output_name].append(activations) # each key holds a list of activations obtained from each minibatch. # we now concatenate these lists to get the final arrays. # concatenating during the loop requires a realloc and can get slow. 
for key in all_activations: all_activations[key] = np.concatenate(all_activations[key]) return all_activations if not isinstance(layers, list): layers = [layers] if 'video_id' not in layers: layers.append('video_id') assert len(layers) > 0 examples_per_iteration = batch_size * num_gpus num_iterations = int(num_examples / examples_per_iteration) activations = fetchActivations(model, layers, num_iterations) # saving extracted layers for index in range(len(layers)): log.info("Read '{}' with shape {}".format( layers[index], activations[layers[index]].shape)) if output_path: log.info('Writing to {}'.format(output_path)) # 'save_h5' is not a declared parameter, so read it from **kwargs; writing HDF5 requires h5py save_h5 = kwargs.get('save_h5', False) if save_h5: with h5py.File(output_path, 'w') as handle: for name, activation in activations.items(): handle.create_dataset(name, data=activation) else: with open(output_path, 'wb') as handle: pickle.dump(activations, handle) else: return activations
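# Example invocation of feature_extractor(); the checkpoint and data paths are
# placeholders, and only a few of the many keyword arguments are shown:
features = feature_extractor(
    load_model_path='/path/to/r2plus1d_model.pkl',   # hypothetical pickle checkpoint
    test_data='/path/to/test_list.csv',              # hypothetical input list
    gpu_list=[0],
    batch_size=4,
    clip_per_video=10,
    layers=['final_avg', 'softmax'],  # blob names to fetch; 'video_id' is appended automatically
    output_path=None,                 # None -> the activations dict is returned
)
print(sorted(features.keys()))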
def Test(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) #======================================================== for db_name, db_size in data_base.items(): workspace.ResetWorkspace() def create_mobilenet_model_ops(model, loss_scale): [softmax, loss] = mobilenet.create_mobilenet( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, label="label") # loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") # return [loss] log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( name="mobilenet_test", arg_scope=test_arg_scope #, init_params=False ) test_data_db = os.path.join(data_folder, db_name) print(test_data_db, db_size) test_reader = test_model.CreateDB( "test_reader", db=test_data_db, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=args.batch_size, img_size=args.image_size, ) data_parallel_model.Parallelize_GPU( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_mobilenet_model_ops, param_update_builder_fun=None, devices=gpus, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) # load the pre-trained model LoadModel(args.load_model_path, test_model) data_parallel_model.FinalizeAfterCheckpoint(test_model) (test_max_softmax, test_max_index, test_label) = RunEpoch(args, db_size, test_model, args.batch_size) # flag_accuracy = (test_label == test_max_index).astype(np.int) # print(flag_accuracy.mean()) save_path = os.path.join(root_folder, db_name[:-5] + ".npz") print(save_path) np.savez(save_path, max_softmax=test_max_softmax, max_index=test_max_index, label=test_label) if os.path.exists(save_path): print("OK")
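# The arrays saved above can be checked offline; this mirrors the accuracy
# computation that is commented out inside the loop (the .npz path below is a
# placeholder):
import numpy as np

data = np.load('/path/to/saved_result.npz')
flag_accuracy = (data['label'] == data['max_index']).astype(np.int32)
print('top-1 accuracy: {:.4f}'.format(flag_accuracy.mean()))
print('mean max softmax: {:.4f}'.format(data['max_softmax'].mean()))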