def Train(args):
    if args.model == "resnext":
        model_name = "resnext" + str(args.num_layers)
    elif args.model == "shufflenet":
        model_name = "shufflenet"

    # Either use the specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Verify valid image mean/std per channel
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"
    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round down epoch size to the closest multiple of the global batch size
    # across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name=model_name, arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)
    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if a Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use the filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnext_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def create_shufflenet_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = shufflenet.create_shufflenet(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with the multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define the add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops
        if args.model == "resnext" else create_shufflenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name=model_name + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops
            if args.model == "resnext" else create_shufflenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # Load the pre-trained model and reset the epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # Reset the epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number.
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % (
        model_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # Remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
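# A standalone illustration of the epoch-size rounding Train() performs
# above. The numbers here are made up for the example and are not taken
# from the original script.
def _epoch_size_rounding_example():
    total_batch_size, num_shards, epoch_size = 32, 2, 1000
    global_batch_size = total_batch_size * num_shards  # 64
    epoch_iters = epoch_size // global_batch_size      # 1000 // 64 = 15
    # epoch_size is rounded down to a multiple of the global batch size.
    assert epoch_iters * global_batch_size == 960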
def test_sum_reduce(self, gc, dc):
    # Set broadcast and no axis, i.e. broadcasting the last dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(4, 5).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=0)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # Set broadcast with axis=0, i.e. broadcasting the first dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(2, 3).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=3)
    res = np.sum(res, axis=2)
    np.testing.assert_array_almost_equal(out, res, decimal=3)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # Broadcasting intermediate dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(3, 4).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=2)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # Broadcasting a single-element tensor, i.e. reducing over everything
    # (fp64 path).
    X = np.random.rand(2, 3, 4, 500).astype(np.float64)
    Y = np.random.rand(1).astype(np.float64)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.array(np.sum(X))
    np.testing.assert_array_almost_equal(out, res, decimal=0)

    # Broadcasting with single-element dimensions at both ends.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(1, 3, 4, 1).astype(np.float32)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1)
    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.RunOperatorOnce(op)
    out = workspace.FetchBlob("out")
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=2).reshape(Y.shape)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # fp64 is not supported with the CUDA op
    dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA]
    self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
def test_int8_fc_4_dims(self, n, m, k, gc, dc):
    X = np.random.rand(m, k, m, m).astype(np.float32) - 0.5
    w = np.random.rand(n, k, m, m).astype(np.float32) - 0.5
    b = np.random.rand(n).astype(np.float32) - 0.5

    fc_fp32 = core.CreateOperator('FC', ['X', 'w', 'b'], ["Y"])

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)

    workspace.FeedBlob('X', X, dc[0])
    workspace.FeedBlob('w', w, dc[0])
    workspace.FeedBlob('b', b, dc[0])
    workspace.RunOperatorOnce(fc_fp32)
    Y = workspace.FetchBlob('Y')

    workspace.ResetWorkspace()

    Y_absmax = np.array([np.absolute(Y).max()]).astype(np.float32)
    if Y.min() >= 0:
        Y_scale = Y_absmax / 0xFF
        Y_zero_point = 0
    else:
        Y_scale = Y_absmax / 0x7F
        Y_zero_point = 128

    X_absmax = np.array([np.absolute(X).max()]).astype(np.float32)
    if X.min() >= 0:
        X_scale = X_absmax / 0xFF
        X_zero_point = 0
    else:
        X_scale = X_absmax / 0x7F
        X_zero_point = 128

    w_absmax = np.array([
        np.absolute(w[i, ...]).max() for i in range(w.shape[0])
    ]).astype(np.float32)
    w_scale = w_absmax / 0x7F
    w_zero_point = 128
    w = np.transpose(w, (0, 2, 3, 1)).astype(np.float32)
    w_bytes = np.rint([
        w[i, ...] / w_scale[i] for i in range(w.shape[0])
    ]).astype(np.int8) + w_zero_point

    w_filler = core.CreateOperator(
        "Int8GivenTensorFill", [], ["wi"],
        shape=w.shape,
        values=w_bytes.astype(np.uint8).tobytes(),
        Y_zero_point=w_zero_point,
        Y_scales=w_scale,
        device_option=dc[1],
    )

    b_scale = w_scale * X_scale
    b_zero_point = 0
    b_bytes = np.rint([
        b[i] / b_scale[i] for i in range(b.shape[0])
    ]).astype(np.int32)
    b_filler = core.CreateOperator(
        "Int8GivenIntTensorFill", [], ["bi"],
        shape=b.shape,
        values=b_bytes,
        Y_zero_point=b_zero_point,
        Y_scales=b_scale,
        device_option=dc[1],
    )

    sw2nhwc = core.CreateOperator(
        "NCHW2NHWC", ["Xi"], ["Xi_nhwc"], device_option=dc[1])

    quantize_X = core.CreateOperator(
        "Int8Quantize", ["Xi_nhwc"], ["Xi_quantized"],
        engine="DNNLOWP",
        device_option=dc[1],
        Y_zero_point=X_zero_point,
        Y_scale=X_scale[0],
    )

    fc = core.CreateOperator(
        'Int8FC', ['Xi_quantized', 'wi', 'bi'], ["Y_out"],
        engine="DNNLOWP",
        device_option=dc[1],
        Y_zero_point=Y_zero_point,
        Y_scale=Y_scale[0],
    )

    net = caffe2_pb2.NetDef()
    net.op.extend([w_filler, b_filler, sw2nhwc, quantize_X, fc])

    workspace.FeedBlob("Xi", X, dc[1])
    workspace.RunNetOnce(net)
    Y_out = workspace.FetchBlob("Y_out")

    MSE = np.square(np.subtract(Y, Y_out)).mean()
    if MSE > 0.005:
        print(Y.flatten())
        print(Y_out.flatten())
        print(np.max(np.abs(Y_out - Y)))
        print("MSE", MSE)
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
def test_int8_pooling(self, stride, pad, kernel, size,
                      input_channels, batch_size, method, gc, dc):
    assume(pad < kernel)
    pool_fp32 = core.CreateOperator(
        method, ["X"], ["Y"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        device_option=dc[0]
    )
    X = np.random.rand(
        batch_size, input_channels, size, size).astype(np.float32)

    if X.min() >= 0:
        scale = np.absolute(X).max() / 0xFF
        zero_point = 0
    else:
        scale = np.absolute(X).max() / 0x7F
        zero_point = 128

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)

    workspace.FeedBlob("X", X, dc[0])
    workspace.RunOperatorOnce(pool_fp32)
    Y = workspace.FetchBlob("Y")

    workspace.ResetWorkspace()

    sw2nhwc = core.CreateOperator(
        "NCHW2NHWC", ["Xi"], ["Xi_nhwc"], device_option=dc[1])

    quantize = core.CreateOperator(
        "Int8Quantize", ["Xi_nhwc"], ["Xi_quantized"],
        engine="DNNLOWP",
        device_option=dc[1],
        Y_zero_point=zero_point,
        Y_scale=scale,
    )

    pool = core.CreateOperator(
        "Int8{}".format(method), ["Xi_quantized"], ["Y_quantized"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        engine="DNNLOWP",
        device_option=dc[1],
    )

    dequantize = core.CreateOperator(
        "Int8Dequantize", ["Y_quantized"], ["Y_nhwc"],
        engine="DNNLOWP",
        device_option=dc[1],
    )

    sw2nchw = core.CreateOperator(
        "NHWC2NCHW", ["Y_nhwc"], ["Y_out"], device_option=dc[1])

    net = caffe2_pb2.NetDef()
    net.op.extend([sw2nhwc, quantize, pool, dequantize, sw2nchw])

    workspace.FeedBlob("Xi", X, dc[1])
    workspace.RunNetOnce(net)
    Y_out = workspace.FetchBlob("Y_out")

    MSE = np.square(np.subtract(Y, Y_out)).mean()
    if MSE > 0.005:
        print(Y.flatten())
        print(Y_out.flatten())
        print(np.max(np.abs(Y_out - Y)))
        print("MSE", MSE)
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
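# The scale/zero-point selection repeated in these int8 tests follows a
# simple uint8 convention: non-negative tensors use the full [0, 255] range
# with zero_point 0, while signed tensors map their absolute maximum onto
# 127 and center at zero_point 128. A standalone sketch of that rule (the
# helper name is ours, not from the test suite):
import numpy as np

def choose_scale_zero_point(x):
    absmax = np.absolute(x).max()
    if x.min() >= 0:
        return absmax / 0xFF, 0    # unsigned data: full uint8 range
    return absmax / 0x7F, 128      # signed data: symmetric around 128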
def test_hsm_search(self):
    samples = 10
    dim_in = 5
    X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5
    w = np.random.rand(hierarchy_proto.size, dim_in) \
        .astype(np.float32) - 0.5
    b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5
    labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \
        .astype(np.int32)

    workspace.GlobalInit(['caffe2'])
    workspace.FeedBlob("data", X)
    workspace.FeedBlob("weights", w)
    workspace.FeedBlob("bias", b)
    workspace.FeedBlob("labels", labels)

    op = core.CreateOperator(
        'HSoftmaxSearch',
        ['data', 'weights', 'bias'],
        ['names', 'scores'],
        'HSoftmaxSearch',
        arg=args_search)
    workspace.RunOperatorOnce(op)
    names = workspace.FetchBlob('names')
    scores = workspace.FetchBlob('scores')

    def simulation_hsm_search():
        names = []
        scores = []
        for line in struct:
            s, e = line[0], line[0] + line[1]
            score = np.dot(X, w[s:e].transpose()) + b[s:e]
            score = np.exp(score - np.max(score, axis=1, keepdims=True))
            score /= score.sum(axis=1, keepdims=True)
            score = -np.log(score)

            score = score.transpose()
            idx = -1
            for j, n in enumerate(names):
                if n == line[3]:
                    idx = j
                    score += scores[j]
            if idx == -1:
                score[score > beam] = np.inf
            else:
                score[score - scores[idx] > beam] = np.inf

            for i, name in enumerate(line[2]):
                scores.append(score[i])
                names.append(name)
        scores = np.vstack(scores)
        return names, scores.transpose()

    p_names, p_scores = simulation_hsm_search()
    idx = np.argsort(p_scores, axis=1)
    p_scores = np.sort(p_scores, axis=1)
    p_names = np.array(p_names)[idx]
    for i in range(names.shape[0]):
        for j in range(names.shape[1]):
            if names[i][j]:
                self.assertEquals(
                    names[i][j], p_names[i][j].item().encode('utf-8'))
                self.assertAlmostEqual(
                    scores[i][j], p_scores[i][j], delta=0.001)
def Run(args, extra_args):
    """Main entry point for running inference."""
    if not m.IsSupported(args.model):
        logging.error("Not supported model: {}".format(args.model))
        m.ShowModels()
        return
    images_path = None
    if args.images_path:
        images_path = os.path.abspath(args.images_path)
    elif "CAFFE2_INF_IMG_PATH" in os.environ:
        images_path = os.path.abspath(os.environ["CAFFE2_INF_IMG_PATH"])
    if not args.dummydata and (not images_path
                               or not os.path.isdir(images_path)):
        logging.error("Cannot find image path {}.".format(images_path))
        return
    labels = None
    validation = None
    if args.label_file:
        labels = cc2.LoadLabels(args.label_file)
    elif args.validation_file:
        validation = cc2.LoadValidation(args.validation_file)
    elif "CAFFE2_INF_LABEL_FILE" in os.environ:
        labels = cc2.LoadLabels(os.environ["CAFFE2_INF_LABEL_FILE"])
    elif "CAFFE2_INF_VAL_FILE" in os.environ:
        validation = cc2.LoadValidation(os.environ["CAFFE2_INF_VAL_FILE"])
    else:
        logging.warning("No validation or label file!")
    if args.annotations:
        apath = args.annotations
    elif args.model == 'faster-rcnn' or args.model == 'ssd':
        logging.error(
            "currently only faster-rcnn and ssd are supported for the VOC"
            " dataset, so only performance will be collected")
    iterations = args.iterations if args.iterations else sys.maxsize
    warmup_iter = args.warmup_iterations if args.warmup_iterations > 0 else 0
    optimization = []
    if args.optimization:
        optimization = [opt.strip() for opt in args.optimization.split(',')]
    batch_size = 1
    if args.batch_size:
        batch_size = int(args.batch_size)
    if batch_size <= 0:
        logging.error("Invalid batch size {}. Exit!".format(batch_size))
        return
    logging.warning("Run Caffe2 in inference mode with args:\n{}".format(
        vars(args)))
    model_info = m.GetModelInfo(args.model)
    logging.warning("The inference inputs of {0} model:\n{1}".format(
        args.model, {str(k): str(v) for k, v in model_info.items()}))
    crop_size = int(model_info["crop_size"])
    if args.crop_size:
        crop_size = args.crop_size
    need_normalize = False
    if model_info["need_normalize"]:
        need_normalize = True
    mean = 128
    if str(model_info["image_mean"]) != 'None':
        mean_tmp = ((model_info["image_mean"]).split('/')[-1]).split(' ')
        if need_normalize:
            # np.float is deprecated; use the explicit float64 dtype
            mean = np.zeros([3, crop_size, crop_size], dtype=np.float64)
            mean[0, :, :] = float(mean_tmp[0])  # 104
            mean[1, :, :] = float(mean_tmp[1])  # 117
            mean[2, :, :] = float(mean_tmp[2])  # 124
        else:
            mean = np.zeros([3, crop_size, crop_size], dtype=np.int32)
            mean[0, :, :] = int(mean_tmp[0])  # 104
            mean[1, :, :] = int(mean_tmp[1])  # 117
            mean[2, :, :] = int(mean_tmp[2])  # 124
    scale = [1]
    if str(model_info["scale"]) != '':
        scale = (model_info["scale"]).split(' ')
    rescale_size = 256
    if str(model_info["rescale_size"]) != '':
        rescale_size = int(model_info["rescale_size"])
    color_format = "BGR"
    if str(model_info["color_format"]) != '':
        color_format = model_info["color_format"]
    model_start_time = timeit.default_timer()
    if args.onnx_model:
        init_def, predict_def = cc2.OnnxToCaffe2(model_info["onnx_model"])
    else:
        if args.int8_model or args.int8_cosim:
            init_file = model_info["init_net_int8"]
            predict_file = model_info["predict_net_int8"]
        else:
            init_file = model_info["init_net"]
            predict_file = model_info["predict_net"]
        with open(init_file) as i:
            if model_info["model_type"] == "prototext" or \
                    model_info["init_net"].split('.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                init_def = ptxt.Parse(i.read(), caffe2_pb2.NetDef())
            else:
                init_def = caffe2_pb2.NetDef()
                init_def.ParseFromString(i.read())
        with open(predict_file) as p:
            if model_info["model_type"] == "prototext" or \
                    predict_file.split('.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                predict_def = ptxt.Parse(p.read(), caffe2_pb2.NetDef())
            else:
                predict_def = caffe2_pb2.NetDef()
                predict_def.ParseFromString(p.read())
    if args.int8_cosim:
        with open(model_info["predict_net"]) as p:
            if model_info["model_type"] == "prototext" or \
                    model_info["predict_net"].split('.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                cosim_predict_def = ptxt.Parse(p.read(), caffe2_pb2.NetDef())
            else:
                cosim_predict_def = caffe2_pb2.NetDef()
                cosim_predict_def.ParseFromString(p.read())
    # cc2.SaveAsOnnxModel(init_def, predict_def, (1, 3, crop_size, crop_size),
    #                     model_info["model_name"] + "_onnx.pb")
    if model_info["model_type"] == "caffe legacy":
        cc2.MergeScaleBiasInBN(predict_def)
        cc2.RemoveUselessExternalInput(predict_def)
        if args.int8_cosim:
            cc2.MergeScaleBiasInBN(cosim_predict_def)
            cc2.RemoveUselessExternalInput(cosim_predict_def)
    dev_map = {
        "cpu": caffe2_pb2.CPU,
        "gpu": caffe2_pb2.CUDA,
        "cuda": caffe2_pb2.CUDA,
        "mkldnn": caffe2_pb2.MKLDNN,
        "opengl": caffe2_pb2.OPENGL,
        "opencl": caffe2_pb2.OPENCL,
        "ideep": caffe2_pb2.IDEEP,
    }
    device_opts = caffe2_pb2.DeviceOption()
    if args.device.lower() in dev_map:
        device_opts.device_type = dev_map[args.device.lower()]
    else:
        logging.error("Wrong device {}. Exit!".format(args.device))
        return
    device_opts_cpu = caffe2_pb2.DeviceOption()
    device_opts_cpu.device_type = caffe2_pb2.CPU
    if model_info["allow_device_override"]:
        if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            cc2.UpdateDeviceOption(device_opts_cpu, init_def)
        else:
            cc2.UpdateDeviceOption(device_opts, init_def)
    if model_info["allow_device_override"]:
        cc2.UpdateDeviceOption(device_opts, predict_def)
    # Search the params shape to replace 0 with 1 on ideep, and warn
    if args.device.lower() == 'ideep':
        cc2.FillZeroParamsWithOne(init_def)
    init_data = np.random.rand(batch_size, 3, crop_size,
                               crop_size).astype(np.float32)
    init_label = np.ones((batch_size), dtype=np.int32)
    if args.cosim:
        def_ws_name = ws.CurrentWorkspace()
        inf_ws_name = "__inf_ws__"
        ws.SwitchWorkspace(inf_ws_name, True)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)
        ws.RunNetOnce(init_def)
        cosim_ws_name = "__cosim_ws__"
        ws.SwitchWorkspace(cosim_ws_name, True)
        device_cosim = caffe2_pb2.DeviceOption()
        device_cosim.device_type = dev_map["cpu"]
        cosim_init_def = copy.deepcopy(init_def)
        cc2.UpdateDeviceOption(device_cosim, cosim_init_def)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_cosim)
        ws.RunNetOnce(cosim_init_def)
        cosim_predict_def = copy.deepcopy(predict_def)
        cc2.UpdateDeviceOption(device_cosim, cosim_predict_def)
    elif args.int8_cosim:
        inf_ws_name = "__int8_ws__"
        ws.SwitchWorkspace(inf_ws_name, True)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)
        ws.RunNetOnce(init_def)
        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(predict_def)
        tf.optimizeForIDEEP(net)
        predict_def = net.Proto()
        cosim_ws_name = "__fp32_ws__"
        ws.SwitchWorkspace(cosim_ws_name, True)
        ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), init_data,
                    device_opts)
        ws.RunNetOnce(init_def)
        cc2.UpdateDeviceOption(device_opts, cosim_predict_def)
        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(cosim_predict_def)
        tf.optimizeForIDEEP(net)
        cosim_predict_def = net.Proto()
    else:
        # ApplyOptimizations(init_def, predict_def, model_info, optimization)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)
        if os.environ.get('DEBUGMODE') == "1":
            cc2.SetOpName(predict_def)
        ws.RunNetOnce(init_def)
        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(predict_def)
        if args.device.lower() == 'ideep' and not args.noptimize:
            logging.warning('Optimizing module {} ....................'
                            .format(model_info["model_name"]))
            tf.optimizeForIDEEP(net)
        predict_def = net.Proto()
        # ws.CreateNet(predict_def)
        if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            new_predict_def, _ = core.InjectCrossDeviceCopies(
                core.Net(predict_def))
            net = core.Net(new_predict_def._net)
            # ws.CreateNet(new_predict_def._net)
            predict_def = new_predict_def._net
        if os.environ.get('DEBUGMODE') == "1":
            with open("{0}_opt_predict_net.pb".format(
                    model_info["model_name"]), "w") as fid:
                fid.write(predict_def.SerializeToString())
            with open("{}_opt_predict_net.pbtxt".format(
                    model_info["model_name"]), "w") as fid:
                fid.write(str(predict_def))
        if args.profile or predict_def.op[-1].type == 'Accuracy':
            # predict_model = model_helper.ModelHelper("predict")
            # predict_model.net = core.Net(predict_def)
            # predict_model.net.name = predict_def.name
            if predict_def.op[-1].type == 'Accuracy':
                label = net.AddExternalInput('label')
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(label, init_label, device_opts)
                else:
                    ws.FeedBlob(label, init_label, device_opts_cpu)
                for i, op in enumerate(predict_def.op):
                    if op.type == 'Accuracy':
                        if args.device.lower() == 'gpu':
                            print(device_opts.device_type)
                            ws.FeedBlob(str(predict_def.op[i].output[0]),
                                        init_label, device_opts)
                        else:
                            ws.FeedBlob(str(predict_def.op[i].output[0]),
                                        init_label, device_opts_cpu)
            # if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            #     ws.CreateNet(net, True)
            # else:
            ws.CreateNet(net)
            if args.profile:
                # ob = predict_model.net.AddObserver("TimeObserver")
                ob = net.AddObserver("TimeObserver")
        else:
            # if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            #     ws.CreateNet(net, True)
            # else:
            ws.CreateNet(net)
    model_elapsed_time = timeit.default_timer() - model_start_time

    outputs = []
    accuracy_top1 = []
    accuracy_top5 = []
    img_time = 0
    comp_time = 0
    processed_images = 0
    images = []
    labels = []
    fnames = []
    if args.dummydata:
        init_label = np.ones((batch_size), dtype=np.int32)
        imgs = np.random.rand(batch_size, 3, crop_size,
                              crop_size).astype(np.float32)
        for i in range(iterations):
            labels.append(init_label)
            images.append(imgs)
    else:
        process_data_start_time = timeit.default_timer()
        images, fnames = cc2.ImageProc.BatchImages(
            images_path, batch_size, iterations)
        process_data_elapsed_time = timeit.default_timer() \
            - process_data_start_time
        logging.warning(
            "processdata time = {}".format(process_data_elapsed_time))

    logging.warning("Start warmup {} iterations...".format(warmup_iter))
    forchw = 1
    if 'style-transfer' in args.model:
        forchw = 0
    wi = warmup_iter - 1
    while warmup_iter and not args.cosim:
        warmup_iter -= 1
        if args.dummydata:
            imgs = images[wi - warmup_iter]
            oshape = (crop_size, crop_size, 3)
        else:
            r = randint(0, len(images) - 1)
            imgs, oshape = cc2.ImageProc.PreprocessImages(
                images[r], crop_size, rescale_size, mean, scale, forchw,
                need_normalize, color_format)
            # imgs, oshape = cc2.ImageProc.PreprocessImagesByThreading(
            #     images[r], crop_size, rescale_size, mean, scale, forchw)
        if args.model == 'faster-rcnn':
            # init_def_update = copy.deepcopy(init_def)
            # cc2.UpdateImgInfo(oshape, init_def_update, predict_def, crop_size)
            # ws.RunNetOnce(init_def_update)
            im_info_name, blob = cc2.CreateIMBlob(
                oshape, predict_def, crop_size)
            if args.device.lower() == 'gpu':
                ws.FeedBlob(im_info_name, blob, device_opts_cpu)
            else:
                ws.FeedBlob(im_info_name, blob, device_opts)
        if 'style-transfer' in args.model or \
                (args.model == 'faster-rcnn'
                 and args.device.lower() == 'gpu'):
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs)
        else:
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
        if predict_def.op[-1].type == 'Accuracy' and \
                validation and len(validation) > 0:
            batch_fname = fnames[r]
            init_label = np.ones((len(fnames[r])), dtype=np.int32)
            for j in range(len(fnames[r])):
                init_label[j] = validation[batch_fname[j]]
            if args.device.lower() == 'gpu':
                ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                            device_opts)
                ws.FeedBlob(str(predict_def.op[-2].input[1]), init_label,
                            device_opts)
            else:
                ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                            device_opts_cpu)
                ws.FeedBlob(str(predict_def.op[-2].input[1]), init_label,
                            device_opts_cpu)
        # if args.profile or predict_def.op[-1].type == 'Accuracy':
        #     ws.RunNet(net)
        # else:
        ws.RunNet(net)

    logging.warning("Start running performance")
    for k, raw in enumerate(images):
        processed_images += len(raw)
        img_start_time = timeit.default_timer()
        if args.dummydata:
            imgs = raw
            oshape = (crop_size, crop_size)
        else:
            imgs, oshape = cc2.ImageProc.PreprocessImages(
                raw, crop_size, rescale_size, mean, scale, forchw,
                need_normalize, color_format)
            # imgs, oshape = cc2.ImageProc.PreprocessImagesByThreading(
            #     raw, crop_size, rescale_size, mean, scale, forchw)
        # im_info_name, blob = cc2.CreateIMBlob(oshape, predict_def, crop_size)
        # ws.FeedBlob(im_info_name, blob, device_opts)
        # x = ws.FetchBlob(im_info_name)
        init_label = None
        if predict_def.op[-1].type == 'Accuracy' and args.dummydata:
            init_label = labels[k]
        elif predict_def.op[-1].type == 'Accuracy' and \
                validation and len(validation) > 0:
            batch_fname = fnames[k]
            init_label = np.ones((len(fnames[k])), dtype=np.int32)
            for j in range(len(fnames[k])):
                init_label[j] = validation[batch_fname[j]]
        if args.model == 'faster-rcnn':
            # init_def_update = copy.deepcopy(init_def)
            # cc2.UpdateImgInfo(oshape, init_def_update, predict_def, crop_size)
            im_info_name, blob = cc2.CreateIMBlob(
                oshape, predict_def, crop_size)
            if args.cosim:
                ws.SwitchWorkspace(inf_ws_name, True)
                # ws.RunNetOnce(init_def_update)
                ws.FeedBlob(im_info_name, blob, device_opts)
                ws.SwitchWorkspace(cosim_ws_name, True)
                # cosim_init_def_update = copy.deepcopy(cosim_init_def)
                # cc2.UpdateImgInfo(oshape, cosim_init_def_update,
                #                   cosim_predict_def, crop_size)
                # ws.RunNetOnce(cosim_init_def_update)
                ws.FeedBlob(im_info_name, blob, device_cosim)
            else:
                # ws.RunNetOnce(init_def_update)
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(im_info_name, blob, device_opts_cpu)
                else:
                    ws.FeedBlob(im_info_name, blob, device_opts)
            # logging.info("output blob is: {}".format(x))
        # imgs = ImageProc.PreprocessImages(raw, crop_size, mean)
        img_elapsed_time = timeit.default_timer() - img_start_time
        img_time += img_elapsed_time
        if args.cosim or args.int8_cosim:
            ws.SwitchWorkspace(cosim_ws_name)
            if args.cosim:
                ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), imgs,
                            device_cosim)
            else:
                ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), imgs,
                            device_opts)
            ws.SwitchWorkspace(inf_ws_name)
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
            for i in range(len(predict_def.op)):
                ws.SwitchWorkspace(inf_ws_name)
                inf_inputs = []
                for inp in predict_def.op[i].input:
                    inf_inputs.append(ws.FetchBlob(str(inp)))
                ws.RunOperatorOnce(predict_def.op[i])
                inf_results = []
                for res in predict_def.op[i].output:
                    inf_results.append(ws.FetchBlob(str(res)))
                ws.SwitchWorkspace(cosim_ws_name)
                cosim_inputs = []
                for inp in cosim_predict_def.op[i].input:
                    cosim_inputs.append(ws.FetchBlob(str(inp)))
                ws.RunOperatorOnce(cosim_predict_def.op[i])
                cosim_results = []
                for res in cosim_predict_def.op[i].output:
                    cosim_results.append(ws.FetchBlob(str(res)))
                if len(inf_inputs) != len(cosim_inputs):
                    logging.error("Wrong number of inputs")
                if len(inf_results) != len(cosim_results):
                    logging.error("Wrong number of outputs")
                    return
                if args.cosim:
                    tol = {'atol': 1e-02, 'rtol': 1e-03}
                else:
                    tol = {'atol': 5, 'rtol': 1e-01}
                logging.warning("begin to check op[{}] {} input".format(
                    i, predict_def.op[i].type))
                # NB: use a separate index (ki) so the outer image-loop
                # counter k is not clobbered.
                for ki in range(len(inf_inputs)):
                    if predict_def.op[i].input[ki][0] == '_':
                        continue
                    # cc2.assert_allclose(inf_inputs[ki], cosim_inputs[ki], **tol)
                    # if not np.allclose(inf_inputs[ki], cosim_inputs[ki], **tol):
                    #     logging.error("Failure in cosim {} op {} input {}"
                    #                   .format(i, predict_def.op[i].type,
                    #                           predict_def.op[i].input[ki]))
                    #     logging.error(inf_inputs[ki].flatten())
                    #     logging.error(cosim_inputs[ki].flatten())
                    #     logging.error("Max error: {}".format(
                    #         np.max(np.abs(
                    #             inf_inputs[ki] - cosim_inputs[ki]))))
                    #     return
                logging.warning("pass checking op[{0}] {1} input".format(
                    i, predict_def.op[i].type))
                logging.warning("begin to check op[{0}] {1} output".format(
                    i, predict_def.op[i].type))
                for j, _ in enumerate(inf_results):
                    if predict_def.op[i].output[j][0] == '_':
                        continue
                    if args.cosim:
                        if not cc2.assert_allclose(inf_results[j],
                                                   cosim_results[j], **tol):
                            logging.error(
                                "failed checking op[{0}] {1} output".format(
                                    i, predict_def.op[i].type))
                            exit()
                    if args.int8_cosim:
                        cc2.assert_allclose(inf_results[j],
                                            cosim_results[j], **tol)
                        cc2.assert_compare(inf_results[j], cosim_results[j],
                                           1e-01, 'ALL')
                    # if not np.allclose(inf_results[j], cosim_results[j], **tol):
                    #     logging.error("Failure in cosim {} op {} output {}"
                    #                   .format(i, predict_def.op[i].type,
                    #                           predict_def.op[i].output[j]))
                    #     logging.error(inf_results[j].flatten())
                    #     logging.error(cosim_results[j].flatten())
                    #     logging.error("Max error: {}".format(
                    #         np.max(np.abs(
                    #             inf_results[j] - cosim_results[j]))))
                    #     return
                logging.warning("pass checking op[{0}] {1} output".format(
                    i, predict_def.op[i].type))
        else:
            if 'style-transfer' in args.model or \
                    (args.model == 'faster-rcnn'
                     and args.device.lower() == 'gpu'):
                ws.FeedBlob(str(predict_def.op[0].input[0]), imgs)
            else:
                ws.FeedBlob(str(predict_def.op[0].input[0]), imgs,
                            device_opts)
            if predict_def.op[-1].type == 'Accuracy':
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(str(predict_def.op[-1].input[1]),
                                init_label, device_opts)
                    if predict_def.op[-2].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-2].input[1]),
                                    init_label, device_opts)
                    elif predict_def.op[-3].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-3].input[1]),
                                    init_label, device_opts)
                else:
                    ws.FeedBlob(str(predict_def.op[-1].input[1]),
                                init_label, device_opts_cpu)
                    if predict_def.op[-2].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-2].input[1]),
                                    init_label, device_opts_cpu)
                    elif predict_def.op[-3].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-3].input[1]),
                                    init_label, device_opts_cpu)
            comp_start_time = timeit.default_timer()
            # if args.profile or predict_def.op[-1].type == 'Accuracy':
            #     ws.RunNet(net)
            # else:
            ws.RunNet(net)
            comp_elapsed_time = timeit.default_timer() - comp_start_time
            comp_time += comp_elapsed_time
            output = ws.FetchBlob(str(predict_def.op[-1].output[0]))
            if predict_def.op[-2].type == 'Accuracy':
                output2 = ws.FetchBlob(str(predict_def.op[-2].output[0]))
            elif predict_def.op[-3].type == 'Accuracy':
                output2 = ws.FetchBlob(str(predict_def.op[-3].output[0]))
            elif predict_def.op[-1].type == 'BoxWithNMSLimit':
                output2 = ws.FetchBlob(str(predict_def.op[-1].output[1]))
                output3 = ws.FetchBlob(str(predict_def.op[-1].output[2]))
            logging.warning(
                "[{0:.2%}] Output shape: {1}, computing in {2:.10f}"
                " seconds, processing {3} images in {4:.10f} seconds."
                .format(((k + 1) / len(images)), output.shape,
                        comp_elapsed_time, len(raw), img_elapsed_time))
            if predict_def.op[-1].type == 'BoxWithNMSLimit':
                outputs.append([output, output2, output3])
            elif predict_def.op[-1].type != 'Accuracy':
                outputs.append(output)
            else:
                accuracy_top1.append(output2)
                accuracy_top5.append(output)
            if args.profile:
                logging.warning("observer time = {}".format(
                    ob.average_time()))
                logging.warning("observer time = {}".format(
                    ob.average_time_children()))
        del imgs
        if k >= (iterations - 1):
            logging.warning(
                "Exit after running {} iterations".format(iterations))
            break

    if args.profile:
        net.RemoveObserver(ob)
    if args.cosim:
        ws.SwitchWorkspace(def_ws_name)
        logging.info("Cosim passed")
        return
    if comp_time <= 0:
        logging.error("The total time is invalid!")
        return
    info_str = ""
    if len(accuracy_top1) > 0:
        mean_accuracy_top1 = 0
        mean_accuracy_top5 = 0
        for i, _ in enumerate(accuracy_top1):
            mean_accuracy_top1 += accuracy_top1[i] * batch_size
            mean_accuracy_top5 += accuracy_top5[i] * batch_size
        mean_accuracy_top1 /= batch_size * len(accuracy_top1)
        mean_accuracy_top5 /= batch_size * len(accuracy_top5)
        info_str += "\nAccuracy: {:.5%}".format(mean_accuracy_top1)
        info_str += "\nTop5Accuracy: {:.5%}".format(mean_accuracy_top5)
        total_image = processed_images
        logging.critical(
            "\nImages per second: {0:.10f}\nTotal computing time:"
            " {1:.10f} seconds\nTotal image processing time: {2:.10f}"
            " seconds\nTotal model loading time: {3:.10f} seconds\n"
            "Total images: {4}{5}".format(
                total_image / comp_time, comp_time, img_time,
                model_elapsed_time, total_image, info_str))
        return
    if args.annotations:
        logging.info(" the total length of outputs is {}".format(
            len(outputs)))
        logging.critical("result is ={}".format(
            cc2.prepare_and_compute_map_data(outputs, fnames, apath)))
    info_str = ""
    accuracy = None
    top5accuracy = None
    summary = None
    if model_info["output_type"] == "segmentation" or args.dummydata:
        total_image = processed_images
    elif model_info["output_type"] == "possibility":
        results, total_image = cc2.ParsePossOutputs(outputs)
        summary = cc2.ParsePossResults(results, labels, validation, fnames)
        if not summary:
            logging.error("Failed to parse the results!")
            return
        elif total_image <= 0 or len(summary) != total_image:
            logging.error("No available results!")
            return
        if validation:
            accuracy = 0
            top5accuracy = 0
            for res in summary:
                if res[1] == "Pass":
                    accuracy += 1
                    top5accuracy += 1
                elif res[1] == "Top5Pass":
                    top5accuracy += 1
            accuracy = accuracy / total_image
            top5accuracy = top5accuracy / total_image
            info_str += "\nAccuracy: {:.5%}".format(accuracy)
            info_str += "\nTop5Accuracy: {:.5%}".format(top5accuracy)
    elif model_info["output_type"] == "post image":
        results, total_image = cc2.ParsePostOutputs(outputs)
        if args.post_images_path:
            cc2.SavePostImages(results, args.post_images_path, fnames)
    logging.critical(
        "\nImages per second: {0:.10f}\nTotal computing time:"
        " {1:.10f} seconds\nTotal image processing time: {2:.10f} seconds\n"
        "Total model loading time: {3:.10f} seconds\nTotal images: {4}{5}"
        .format(total_image / comp_time, comp_time, img_time,
                model_elapsed_time, total_image, info_str))
    cc2.SaveOutput(args, summary, accuracy, top5accuracy, comp_time,
                   total_image, img_time, model_elapsed_time)
def bmuf_process(filestore_dir, process_id, shared_results, nesterov=False):
    # We need to import caffe2 in every process to initialize CUDA
    # independently.
    from caffe2.python import core, cnn, data_parallel_model, workspace, \
        dyndep
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary(
        "@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not workspace.has_gpu_support:
        log.info('No GPU support; test is ignored.')
        return
    if workspace.NumCudaDevices() < 4:
        log.info('Not enough GPUs; test is ignored.')
        return

    model = cnn.CNNModelHelper(order="NHWC", name="test")

    gpu_ids = [0, 1] if process_id == 0 else [2, 3]

    def _model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(gpu_devices, process_id):
        np.random.seed(26 + process_id * 10)
        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(
                        core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

    _generate_data(gpu_ids, process_id)

    workspace.RunOperatorOnce(
        core.CreateOperator("FileStoreHandlerCreate", [],
                            ["store_handler"], path=filestore_dir))
    rendezvous = dict(kv_handler="store_handler",
                      shard_id=process_id,
                      num_shards=2,
                      engine="GLOO",
                      exit_nets=None)

    data_parallel_model.Parallelize_GPU_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=gpu_ids,
        rendezvous=rendezvous,
        nesterov=nesterov,
        add_blobs_to_sync=["sync_num"],
    )

    data_parallel_model.RunInitNet(model)

    def _gpu_pid(gpu_id, pid):
        if pid == 1:
            return gpu_id + 2
        return gpu_id

    np.testing.assert_equal(
        workspace.FetchBlob(
            "gpu_{}/fc_w_v".format(_gpu_pid(0, process_id))),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to get non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post-local-update params
    results = {}
    v_b_ = workspace.FetchBlob(
        "gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w_ = workspace.FetchBlob(
        "gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))

    results['v_b_'] = v_b_
    results['v_w_'] = v_w_

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_0_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_1_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    w_1_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))

    results['b_0_'] = b_0_
    results['w_0_'] = w_0_
    results['b_1_'] = b_1_
    results['w_1_'] = w_1_

    # Test sync
    if process_id == 0:
        workspace.FeedBlob(
            model._device_prefix + "_0/sync_num",
            np.array([2603]).astype(np.float32),
            device_option=core.DeviceOption(model._device_type, 0))

    # Compute block gradients.
    b_g_ = workspace.FetchBlob(
        "gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_g_ = workspace.FetchBlob(
        "gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))

    results['b_g_'] = b_g_
    results['w_g_'] = w_g_
    workspace.RunNetOnce(model._global_model_param_updates_net)

    # g_b = (b_0_ + b_1_) / 2 - b_g_
    # g_w = (w_0_ + w_1_) / 2 - w_g_
    v_b = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))
    w_g = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    b_g = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_0 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_0 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_1 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))
    b_1 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))

    results['v_b'] = v_b
    results['v_w'] = v_w
    results['w_g'] = w_g
    results['b_g'] = b_g
    results['w_0'] = w_0
    results['b_0'] = b_0
    results['w_1'] = w_1
    results['b_1'] = b_1

    # Test add_blobs_to_sync
    for j in model._devices:
        sync = workspace.FetchBlob(
            model._device_prefix + "_{}/sync_num".format(j))[0]
        results['sync_{}'.format(j)] = sync

    shared_results[process_id] = results
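# A minimal sketch of how bmuf_process might be driven as a two-shard
# harness. The multiprocessing wiring below is an illustrative assumption,
# not taken from the original test, which presumably supplies its own
# filestore directory and result collection.
def _example_bmuf_driver():
    import multiprocessing
    import tempfile

    filestore_dir = tempfile.mkdtemp()
    with multiprocessing.Manager() as manager:
        shared_results = manager.dict()
        procs = [
            multiprocessing.Process(
                target=bmuf_process,
                args=(filestore_dir, pid, shared_results))
            for pid in range(2)
        ]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
        # After BMUF's global update, both shards should report the same
        # averaged parameters.
        return dict(shared_results)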
def test_convolution_sum_relu_fusion(self, stride, pad, kernel, size,
                                     input_channels, output_channels,
                                     batch_size, use_bias, group, gc, dc):
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        device_option=dc[0]
    )
    sum_op = core.CreateOperator(
        "Sum",
        ["S0", "Y0"],
        ["S0"],
        device_option=dc[0]
    )
    relu = core.CreateOperator(
        "Relu",
        ["S0"],
        ["S0"],
        device_option=dc[0]
    )
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
        ["S1"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        fusion_type=3,
        device_option=dc[1]
    )
    X = np.random.rand(
        batch_size, input_channels * group, size, size) \
        .astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    workspace.FeedBlob('X0', X, dc[0])
    workspace.FeedBlob('w0', w, dc[0])
    workspace.FeedBlob('b0', b, dc[0])
    workspace.RunOperatorOnce(conv)
    Y0 = workspace.FetchBlob('Y0')

    S = np.random.rand(*Y0.shape).astype(np.float32) - 0.5
    workspace.FeedBlob('S0', S, dc[0])
    workspace.RunOperatorOnce(sum_op)
    workspace.RunOperatorOnce(relu)
    S0 = workspace.FetchBlob('S0')

    workspace.ResetWorkspace()
    workspace.FeedBlob('X1', X, dc[1])
    workspace.FeedBlob('w1', w, dc[1])
    workspace.FeedBlob('b1', b, dc[1])
    workspace.FeedBlob('S1', S, dc[1])
    workspace.RunOperatorOnce(conv_fusion)
    S1 = workspace.FetchBlob('S1')

    if not np.allclose(S0, S1, atol=0.01, rtol=0.01):
        print(S1.flatten())
        print(S0.flatten())
        print(np.max(np.abs(S1 - S0)))
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
def test_convolution_relu_fusion(self, stride, pad, kernel, size,
                                 input_channels, output_channels,
                                 batch_size, use_bias, group, gc, dc):
    conv = core.CreateOperator(
        "Conv",
        ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
        ["Y0"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        device_option=dc[0]
    )
    relu = core.CreateOperator(
        "Relu",
        ["Y0"],
        ["Y0"],
        device_option=dc[0]
    )

    # Manual fusion
    conv_fusion = core.CreateOperator(
        "ConvFusion",
        ["X1", "w1", "b1"] if use_bias else ["X1", "w1"],
        ["Y1"],
        stride=stride,
        pad=pad,
        kernel=kernel,
        group=group,
        fusion_type=1,
        device_option=dc[1]
    )

    # Auto fusion
    old_net = caffe2_pb2.NetDef()
    conv_old = caffe2_pb2.OperatorDef()
    conv_old.CopyFrom(conv)
    conv_old.device_option.CopyFrom(dc[1])
    relu_old = caffe2_pb2.OperatorDef()
    relu_old.CopyFrom(relu)
    relu_old.device_option.CopyFrom(dc[1])
    old_net.op.extend([conv_old, relu_old])
    net = core.Net("net")
    net.Proto().CopyFrom(old_net)
    optimizeForIDEEP(net)
    self.assertTrue(len(net.Proto().op) == 1)
    self.assertTrue(net.Proto().op[0].type == "ConvFusion")

    X = np.random.rand(
        batch_size, input_channels * group, size, size) \
        .astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels * group, input_channels, kernel, kernel) \
        .astype(np.float32) - 0.5
    b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

    old_ws_name = workspace.CurrentWorkspace()
    workspace.SwitchWorkspace("_device_check_", True)
    workspace.FeedBlob('X0', X, dc[0])
    workspace.FeedBlob('w0', w, dc[0])
    workspace.FeedBlob('b0', b, dc[0])
    workspace.RunOperatorOnce(conv)
    workspace.RunOperatorOnce(relu)
    Y0 = workspace.FetchBlob('Y0')

    workspace.ResetWorkspace()
    workspace.FeedBlob('X1', X, dc[1])
    workspace.FeedBlob('w1', w, dc[1])
    workspace.FeedBlob('b1', b, dc[1])
    workspace.RunOperatorOnce(conv_fusion)
    Y1 = workspace.FetchBlob('Y1')

    if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01):
        print(Y1.flatten())
        print(Y0.flatten())
        print(np.max(np.abs(Y1 - Y0)))
        self.assertTrue(False)

    workspace.ResetWorkspace()
    workspace.FeedBlob('X0', X, dc[1])
    workspace.FeedBlob('w0', w, dc[1])
    workspace.FeedBlob('b0', b, dc[1])
    workspace.RunOperatorOnce(net.Proto().op[0])
    Y2 = workspace.FetchBlob('Y0')

    if not np.allclose(Y0, Y2, atol=0.01, rtol=0.01):
        print(Y2.flatten())
        print(Y0.flatten())
        print(np.max(np.abs(Y2 - Y0)))
        self.assertTrue(False)

    workspace.SwitchWorkspace(old_ws_name)
# ------------------------------------------------------------------------------------
# Create an operator.
op = core.CreateOperator(
    "Relu",  # The type of operator that we want to run
    ["X"],   # A list of input blobs by their names
    ["Y"],   # A list of output blobs by their names
)
# and we are done!
print("Type of the created op is: {}".format(type(op)))
print("Content:\n")
print(str(op))

workspace.FeedBlob("X", np.random.randn(2, 3).astype(np.float32))
workspace.RunOperatorOnce(op)

print("Current blobs in the workspace: {}\n".format(workspace.Blobs()))
print("X:\n{}\n".format(workspace.FetchBlob("X")))
print("Y:\n{}\n".format(workspace.FetchBlob("Y")))
print("Expected:\n{}\n".format(np.maximum(workspace.FetchBlob("X"), 0)))

op = core.CreateOperator(
    "GaussianFill",
    [],  # GaussianFill does not need any input blobs.
    ["Z"],
    shape=[100, 100],  # shape argument as a list of ints
    mean=1.0,          # mean as a single float
    std=1.0,           # std as a single float
)
print("Content of op:\n")
print(str(op))
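# A natural continuation, mirroring the Relu example above: run the op and
# fetch Z. The sanity check on the sample mean/std is our own illustration,
# not part of the original snippet.
workspace.RunOperatorOnce(op)
Z = workspace.FetchBlob("Z")
print("Z mean ~ {:.3f}, std ~ {:.3f}".format(Z.mean(), Z.std()))
# Both should be close to 1.0 for a 100x100 sample from N(1, 1).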
def test_swish_int8(self):
    np.random.seed(0)
    workspace.ResetWorkspace()
    n = 256

    X_fp32 = np.linspace(-20.5, 8., num=n).astype(np.float32).reshape(1, n)
    Y_fp32 = self._swish(X_fp32)
    X_scale, X_zero_point = self._get_scale_zp(X_fp32)
    Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)
    W_fp32 = np.identity(n, dtype=np.float32)
    b_fp32 = np.zeros((n,), dtype=np.float32)

    workspace.FeedBlob("X", X_fp32)
    workspace.FeedBlob("W", W_fp32)
    workspace.FeedBlob("b", b_fp32)

    workspace.RunOperatorOnce(
        core.CreateOperator(
            "Int8FCPackWeight",
            ["W"],
            ["W_int8"],
            engine="DNNLOWP",
            save_unpacked_weights=True,
            in_scale=X_scale,
        )
    )

    ref_net1 = core.Net("net")
    ref_net1.Int8QuantizeNNPI(
        ["X"], ["X_int8"], Y_scale=X_scale, Y_zero_point=X_zero_point
    )
    ref_net1.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"],
        ["U_int8"],
        Y_scale=X_scale,
        Y_zero_point=X_zero_point,
    )
    ref_net1.SwishFakeInt8NNPI(
        ["U_int8"],
        ["Y"],
        X_scale=X_scale,
        X_zero_point=X_zero_point,
        Y_scale=Y_scale,
        Y_zero_point=Y_zero_point
    )
    ref_net1.Proto().external_output.append("Y")

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(
        ["X"], ["X_int8"], Y_scale=X_scale, Y_zero_point=X_zero_point
    )
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"],
        ["U_int8"],
        Y_scale=X_scale,
        Y_zero_point=X_zero_point,
    )
    ref_net.Int8DequantizeNNPI(
        ["U_int8"], ["U_fp16"], UsingOneOverScale=False
    )
    ref_net.SwishFakeFp16NNPI(["U_fp16"], ["Y_fp16"])
    ref_net.Int8QuantizeNNPI(
        ["Y_fp16"], ["Y"], Y_scale=Y_scale, Y_zero_point=Y_zero_point
    )
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net1)
    Y_fbgemm = workspace.FetchInt8Blob("Y")

    # run onnxifi net
    ref_net.Proto().op[0].type = "Int8Quantize"
    ref_net.Proto().op[1].type = "Int8FC"
    ref_net.Proto().op[2].type = "Int8Dequantize"
    ref_net.Proto().op[3].type = "Swish"
    ref_net.Proto().op[4].type = "Int8Quantize"
    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b"],
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)
    # TODO: add an assertion to check that the optimized net
    # fused Dequantize->Swish->Quantize into QuantizedSwish
    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchInt8Blob("Y")
    U_int8 = workspace.FetchInt8Blob("U_int8")

    diff_Y = np.abs(Y_glow.data - Y_fbgemm.data)
    num_mismatches = np.count_nonzero(diff_Y)
    max_diff = np.max(diff_Y)
    if max_diff > 0 or Y_glow.scale != Y_fbgemm.scale or \
            Y_glow.zero_point != Y_fbgemm.zero_point:
        print_test_debug_info(
            "QuantizedSwish",
            {
                "X": X_fp32,
                "X_scale": X_scale,
                "X_zero_point": X_zero_point,
                "Y_scale": Y_scale,
                "Y_zero_point": Y_zero_point,
                "U_int8": U_int8,
                "Y_fbgemm": Y_fbgemm,
                "Y_glow": Y_glow,
                "diff": diff_Y,
                "max_diff": max_diff,
                "num_mismatches": num_mismatches,
            },
        )
        assert 0
def _run_zero_even_op(self, X):
    op = core.CreateOperator('ZeroEven', ['X'], ['Y'])
    workspace.FeedBlob('X', X)
    workspace.RunOperatorOnce(op)
    Y = workspace.FetchBlob('Y')
    return Y
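# A hedged usage sketch for the helper above. Judging by its name, ZeroEven
# is assumed to zero out the even-indexed elements of a 1-D tensor; the
# input values here are illustrative.
def _example_zero_even_usage(self):
    X = np.arange(10).astype(np.float32)
    Y = self._run_zero_even_op(X)
    # Odd-indexed elements are expected to pass through unchanged.
    np.testing.assert_array_equal(Y[1::2], X[1::2])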
def test_int8_quantize(self, n, rand_seed, non_zero_offset):
    print("n={}, rand_seed={}".format(n, rand_seed))
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    if non_zero_offset:
        X_fp32 = np.random.uniform(-1, 1, size=(n, n)) \
            .astype(np.float16).astype(np.float32)
    else:
        X_fp32 = np.random.rand(n, n).astype(np.float16).astype(np.float32)
    W_fp32 = np.identity(n, dtype=np.float32)
    b_fp32 = np.zeros((n,), dtype=np.float32)

    X_scale, X_zero_point = self._get_scale_zp(X_fp32)

    workspace.FeedBlob("X", X_fp32)
    workspace.FeedBlob("W", W_fp32)
    workspace.FeedBlob("b", b_fp32)

    workspace.RunOperatorOnce(
        core.CreateOperator(
            "Int8FCPackWeight",
            ["W"],
            ["W_int8"],
            engine="DNNLOWP",
            save_unpacked_weights=True,
            in_scale=X_scale,
        ))

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(
        ["X"], ["X_int8"], Y_scale=X_scale, Y_zero_point=X_zero_point)
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"],
        ["Y_int8"],
        Y_scale=X_scale,
        Y_zero_point=X_zero_point,
    )
    ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchBlob("Y")

    # run onnxifi net
    ref_net.Proto().op[0].type = "Int8Quantize"
    ref_net.Proto().op[1].type = "Int8FC"
    ref_net.Proto().op[2].type = "Int8Dequantize"
    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b"],
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)
    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if not np.allclose(Y_glow, Y_fbgemm):
        diff_Y = np.abs(Y_glow - Y_fbgemm)
        print_test_debug_info(
            "int8_fc",
            {
                "seed": rand_seed,
                "n": n,
                "X": X_fp32,
                "W": W_fp32,
                "b": b_fp32,
                "Y_fbgemm": Y_fbgemm,
                "Y_glow": Y_glow,
                "diff": diff_Y,
                "maxdiff": diff_Y.max(axis=1),
            },
        )
        assert 0
def test_int8_fc(self, n, m, k, rand_seed, quantize_bias, f):
    print(
        f"n={n}, m={m}, k={k}, rand_seed={rand_seed}, "
        f"quantize_bias={quantize_bias}"
    )
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    ff = float(f)
    X_fp32 = np.random.uniform(-ff, ff, size=(m, k)).astype(np.float32)
    W_fp32 = np.random.uniform(-ff, ff, size=(n, k)).astype(np.float32)
    b_fp32 = np.random.uniform(-ff, ff, size=(n)).astype(np.float32)

    X_scale, X_zero_point = self._get_scale_zp(X_fp32)
    Y_fp32 = np.dot(X_fp32, W_fp32.T) + b_fp32
    Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)

    workspace.FeedBlob("X", X_fp32)
    workspace.FeedBlob("W", W_fp32)
    workspace.FeedBlob("b", b_fp32)

    workspace.RunOperatorOnce(
        core.CreateOperator(
            "Int8FCPackWeight",
            ["W", "b"] if quantize_bias else ["W"],
            ["W_int8", "b_int32"] if quantize_bias else ["W_int8"],
            engine="DNNLOWP",
            save_unpacked_weights=True,
            in_scale=X_scale,
        ))

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(
        ["X"], ["X_int8"], Y_scale=X_scale, Y_zero_point=X_zero_point)
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b_int32" if quantize_bias else "b"],
        ["Y_int8"],
        Y_scale=Y_scale,
        Y_zero_point=Y_zero_point,
    )
    ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchBlob("Y")

    # run onnxifi net
    ref_net.Proto().op[0].type = "Int8Quantize"
    ref_net.Proto().op[1].type = "Int8FC"
    ref_net.Proto().op[2].type = "Int8Dequantize"
    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b_int32"]
        if quantize_bias else ["W_int8", "b"],
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)
    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if not np.allclose(Y_glow, Y_fbgemm):
        diff_Y = np.abs(Y_glow - Y_fbgemm)
        print_test_debug_info(
            "int8_fc",
            {
                "seed": rand_seed,
                "n": n,
                "m": m,
                "k": k,
                "X": X_fp32,
                "W": W_fp32,
                "b": b_fp32,
                "Y_fbgemm": Y_fbgemm,
                "Y_glow": Y_glow,
                "diff": diff_Y,
                "maxdiff": diff_Y.max(axis=1),
            },
        )
        assert 0
def create_queue(queue_name, num_blobs, capacity):
    workspace.RunOperatorOnce(
        core.CreateOperator(
            "CreateBlobsQueue", [], [queue_name],
            # Pass the argument through; the original hard-coded
            # num_blobs=1, which silently ignored the parameter.
            num_blobs=num_blobs,
            capacity=capacity))
    return core.ScopedBlobReference(queue_name)
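# A minimal sketch of exercising the queue with Caffe2's queue operators
# (EnqueueBlobs/DequeueBlobs); the blob names and values here are
# illustrative assumptions.
def _example_queue_usage():
    queue = create_queue("q", num_blobs=1, capacity=4)
    workspace.FeedBlob("x", np.array([1.0], dtype=np.float32))
    workspace.RunOperatorOnce(
        core.CreateOperator("EnqueueBlobs", [queue, "x"], ["x"]))
    workspace.RunOperatorOnce(
        core.CreateOperator("DequeueBlobs", [queue], ["y"]))
    return workspace.FetchBlob("y")  # expected: array([1.], dtype=float32)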
def _test_index_ops(self, entries, dtype, index_create_op):
    workspace.RunOperatorOnce(
        core.CreateOperator(index_create_op, [], ['index'],
                            max_elements=10))
    my_entries = np.array(
        [entries[0], entries[1], entries[2]], dtype=dtype)

    workspace.FeedBlob('entries', my_entries)
    workspace.RunOperatorOnce(
        core.CreateOperator('IndexLoad', ['index', 'entries'], ['index']))
    query1 = np.array(
        [entries[0], entries[3], entries[0], entries[4]], dtype=dtype)

    workspace.FeedBlob('query1', query1)
    workspace.RunOperatorOnce(
        core.CreateOperator('IndexGet', ['index', 'query1'], ['result1']))
    result1 = workspace.FetchBlob('result1')
    np.testing.assert_array_equal([1, 4, 1, 5], result1)

    workspace.RunOperatorOnce(
        core.CreateOperator('IndexFreeze', ['index'], ['index']))

    query2 = np.array(
        [entries[5], entries[4], entries[0], entries[6], entries[7]],
        dtype=dtype)
    workspace.FeedBlob('query2', query2)
    workspace.RunOperatorOnce(
        core.CreateOperator('IndexGet', ['index', 'query2'], ['result2']))
    result2 = workspace.FetchBlob('result2')
    np.testing.assert_array_equal([0, 5, 1, 0, 0], result2)

    workspace.RunOperatorOnce(
        core.CreateOperator('IndexSize', ['index'], ['index_size']))
    size = workspace.FetchBlob('index_size')
    self.assertEquals(size, 6)

    workspace.RunOperatorOnce(
        core.CreateOperator('IndexStore', ['index'], ['stored_entries']))
    stored_actual = workspace.FetchBlob('stored_entries')
    new_entries = np.array([entries[3], entries[4]], dtype=dtype)
    np.testing.assert_array_equal(
        np.concatenate((my_entries, new_entries)), stored_actual)

    workspace.RunOperatorOnce(
        core.CreateOperator(index_create_op, [], ['index2']))
    workspace.RunOperatorOnce(
        core.CreateOperator('IndexLoad', ['index2', 'stored_entries'],
                            ['index2'], skip_first_entry=1))
    workspace.RunOperatorOnce(
        core.CreateOperator('IndexSize', ['index2'], ['index2_size']))
    index2_size = workspace.FetchBlob('index2_size')
    self.assertEquals(index2_size, 5)

    # test serde
    with tempfile.NamedTemporaryFile() as tmp:
        workspace.RunOperatorOnce(
            core.CreateOperator('Save', ['index'], [],
                                absolute_path=1,
                                db_type='minidb',
                                db=tmp.name))
        # frees up the blob
        workspace.FeedBlob('index', np.array([]))
        # reloads the index
        workspace.RunOperatorOnce(
            core.CreateOperator('Load', [], ['index'],
                                absolute_path=1,
                                db_type='minidb',
                                db=tmp.name))
        query3 = np.array(
            [entries[0], entries[3], entries[0], entries[4], entries[4]],
            dtype=dtype)
        workspace.FeedBlob('query3', query3)
        workspace.RunOperatorOnce(
            core.CreateOperator('IndexGet', ['index', 'query3'],
                                ['result3']))
        result3 = workspace.FetchBlob('result3')
        np.testing.assert_array_equal([1, 4, 1, 5, 5], result3)
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="resnet152", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id interfaces = args.distributed_interfaces.split(",") if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict(kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: store_handler = "store_handler" if args.redis_host is not None: workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None def create_resnet152_model_ops(model, loss_scale): initializer = (pFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = resnet.create_resnet152( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, no_bias=True, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) if args.float16_compute: opt = optimizer.build_fp16_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, weight_decay=args.weight_decay, policy="step", stepsize=stepsz, gamma=0.1) else: optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1) return opt if args.train_data == "null": def add_image_input(model): AddNullInput( model, None, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) else: reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, 
dtype=args.dtype, is_test=False, ) def add_post_sync_ops(model): for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT]) data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnet152_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=False, cpu_device=args.use_cpu, shared_model=args.use_cpu, ) if args.model_parallel: activations = data_parallel_model_utils.GetActivationBlobs(train_model) data_parallel_model_utils.ShiftActivationDevices( train_model, activations=activations[len(activations) // 2:], shifts={g: args.num_gpus + g for g in range(args.num_gpus)}, ) data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="resnet152_test", arg_scope=test_arg_scope, init_params=False) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_resnet152_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) data_parallel_model.FinalizeAfterCheckpoint(train_model) last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "resnet152_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) while epoch < args.num_epochs: epoch = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # final save SaveModel(workspace, train_model)
def testEnforce(self): op = core.CreateOperator("Relu", ["X"], ["Y"]) with self.assertRaises(RuntimeError): workspace.RunOperatorOnce(op)
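# Note on the test above: "X" is never fed into the workspace, so running the
# Relu op trips a C++ enforce failure when it looks up its input, and pybind
# surfaces that to Python as the RuntimeError being asserted.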
def test_sum_reduce(self, gc, dc): # Set broadcast and no axis, i.e. broadcasting last dimensions. X = np.random.rand(2, 3, 4, 5).astype(np.float32) Y = np.random.rand(4, 5).astype(np.float32) op = core.CreateOperator("SumReduceLike", ["X", "Y"], "out", broadcast=1) workspace.FeedBlob("X", X) workspace.FeedBlob("Y", Y) workspace.RunOperatorOnce(op) out = workspace.FetchBlob("out") res = np.sum(X, axis=0) res = np.sum(res, axis=0) np.testing.assert_array_almost_equal(out, res) self.assertDeviceChecks(dc, op, [X, Y], [0]) # Set broadcast with axis=0, i.e. broadcasting the first dimensions. X = np.random.rand(2, 3, 4, 5).astype(np.float32) Y = np.random.rand(2, 3).astype(np.float32) op = core.CreateOperator("SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0) workspace.FeedBlob("X", X) workspace.FeedBlob("Y", Y) workspace.RunOperatorOnce(op) out = workspace.FetchBlob("out") res = np.sum(X, axis=3) res = np.sum(res, axis=2) np.testing.assert_array_almost_equal(out, res, decimal=3) self.assertDeviceChecks(dc, op, [X, Y], [0]) # broadcasting intermediate dimensions X = np.random.rand(2, 3, 4, 5).astype(np.float32) Y = np.random.rand(3, 4).astype(np.float32) op = core.CreateOperator("SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1) workspace.FeedBlob("X", X) workspace.FeedBlob("Y", Y) workspace.RunOperatorOnce(op) out = workspace.FetchBlob("out") res = np.sum(X, axis=0) res = np.sum(res, axis=2) np.testing.assert_array_almost_equal(out, res) self.assertDeviceChecks(dc, op, [X, Y], [0]) # broadcasting against a size-1 tensor, i.e. reducing over every dimension X = np.random.rand(2, 3, 4, 500).astype(np.float64) Y = np.random.rand(1).astype(np.float64) op = core.CreateOperator("SumReduceLike", ["X", "Y"], "out", broadcast=1) workspace.FeedBlob("X", X) workspace.FeedBlob("Y", Y) workspace.RunOperatorOnce(op) out = workspace.FetchBlob("out") res = np.array(np.sum(X)) np.testing.assert_array_almost_equal(out, res, decimal=0) self.assertDeviceChecks(dc, op, [X, Y], [0])
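# A generic numpy restatement of the SumReduceLike semantics checked above (a
# sketch, not the operator's implementation): align Y's shape against X
# starting at `axis` (trailing alignment when axis is unset) and sum X over
# every non-aligned dimension. The final size-1 Y case above additionally
# broadcasts within the aligned dimension, which this sketch does not cover.
import numpy as np

def sum_reduce_like_ref(X, Y, axis=None):
    if axis is None:
        axis = X.ndim - Y.ndim
    reduce_axes = tuple(d for d in range(X.ndim)
                        if not axis <= d < axis + Y.ndim)
    return X.sum(axis=reduce_axes)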
def test_lambda_rank_loss(self, n, k, m): y = np.random.rand(n * m).astype(np.float32) r = np.random.randint(k, size=n * m).astype(np.float32) # m sessions of length n session_lengths = np.repeat(n, m).astype(np.int32) ref_loss = np.empty(0) ref_ndcg_loss = np.empty(0) ref_ndcg_loss_no_exp = np.empty(0) ref_dcg_loss = np.empty(0) ref_dcg_loss_no_exp = np.empty(0) ref_dy = np.empty(0) ref_dy_no_exp = np.empty(0) ref_dcg_dy = np.empty(0) ref_dcg_dy_no_exp = np.empty(0) for i in range(m): r_loss, r_dy = self.ref_lambda_rank_loss(y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], False, True, False) r_ndcg_loss, _ = self.ref_lambda_rank_loss(y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], True, True, True) r_ndcg_loss_no_exp, r_dy_no_exp = self.ref_lambda_rank_loss( y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], True, True, False) r_dcg_loss, r_dcg_dy = self.ref_lambda_rank_loss( y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], True, False, True) r_dcg_loss_no_exp, r_dcg_dy_no_exp = self.ref_lambda_rank_loss( y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], True, False, False) ref_loss = np.append(ref_loss, r_loss) ref_dy = np.append(ref_dy, r_dy) ref_ndcg_loss = np.append(ref_ndcg_loss, r_ndcg_loss) ref_ndcg_loss_no_exp = np.append(ref_ndcg_loss_no_exp, r_ndcg_loss_no_exp) ref_dy_no_exp = np.append(ref_dy_no_exp, r_dy_no_exp) ref_dcg_loss = np.append(ref_dcg_loss, r_dcg_loss) ref_dcg_dy = np.append(ref_dcg_dy, r_dcg_dy) ref_dcg_loss_no_exp = np.append(ref_dcg_loss_no_exp, r_dcg_loss_no_exp) ref_dcg_dy_no_exp = np.append(ref_dcg_dy_no_exp, r_dcg_dy_no_exp) dloss = np.random.random(m).astype(np.float32) workspace.blobs["y"] = y workspace.blobs["r"] = r workspace.blobs["session_lengths"] = session_lengths workspace.blobs["dloss"] = dloss op = core.CreateOperator( "LambdaRankNdcg", ["y", "r", "session_lengths"], ["loss", "dy"], use_ndcg_as_loss=False, use_idcg_normalization=True, use_exp_gain=False, ) workspace.RunOperatorOnce(op) loss = workspace.blobs["loss"] dy = workspace.blobs["dy"] np.testing.assert_allclose(loss, ref_loss, rtol=1e-5, atol=1e-6) np.testing.assert_allclose(dy, ref_dy, rtol=1e-5, atol=1e-6) op = core.CreateOperator( "LambdaRankNdcg", ["y", "r", "session_lengths"], ["loss", "dy"], use_ndcg_as_loss=True, use_idcg_normalization=True, use_exp_gain=True, ) workspace.RunOperatorOnce(op) loss = workspace.blobs["loss"] dy = workspace.blobs["dy"] np.testing.assert_allclose(loss, ref_ndcg_loss, rtol=1e-5, atol=1e-6) np.testing.assert_allclose(dy, ref_dy, rtol=1e-5, atol=1e-6) op = core.CreateOperator( "LambdaRankNdcgGradient", ["y", "session_lengths", "dy", "dloss"], ["dy_back"], ) workspace.RunOperatorOnce(op) dy_back = workspace.blobs["dy_back"] for i in range(m): np.testing.assert_allclose( dy_back[i * n:(i + 1) * n], dloss[i] * ref_dy[i * n:(i + 1) * n], rtol=1e-5, atol=1e-6, ) op = core.CreateOperator( "LambdaRankNdcg", ["y", "r", "session_lengths"], ["loss", "dy"], use_ndcg_as_loss=True, use_idcg_normalization=True, use_exp_gain=False, ) workspace.RunOperatorOnce(op) loss = workspace.blobs["loss"] dy = workspace.blobs["dy"] np.testing.assert_allclose(loss, ref_ndcg_loss_no_exp, rtol=1e-5, atol=1e-6) np.testing.assert_allclose(dy, ref_dy_no_exp, rtol=1e-5, atol=1e-6) op = core.CreateOperator( "LambdaRankNdcgGradient", ["y", "session_lengths", "dy", "dloss"], ["dy_back"], ) workspace.RunOperatorOnce(op) dy_back = workspace.blobs["dy_back"] for i in range(m): np.testing.assert_allclose( dy_back[i * n:(i + 1) * n], dloss[i] * ref_dy_no_exp[i * n:(i + 1) * n], rtol=1e-5, atol=1e-6, ) op = 
core.CreateOperator( "LambdaRankNdcg", ["y", "r", "session_lengths"], ["loss", "dy"], use_ndcg_as_loss=True, use_idcg_normalization=False, use_exp_gain=True, ) workspace.RunOperatorOnce(op) loss = workspace.blobs["loss"] dy = workspace.blobs["dy"] np.testing.assert_allclose(loss, ref_dcg_loss, rtol=1e-5, atol=1e-6) np.testing.assert_allclose(dy, ref_dcg_dy, rtol=1e-5, atol=1e-6) op = core.CreateOperator( "LambdaRankNdcgGradient", ["y", "session_lengths", "dy", "dloss"], ["dy_back"], ) workspace.RunOperatorOnce(op) dy_back = workspace.blobs["dy_back"] for i in range(m): np.testing.assert_allclose( dy_back[i * n:(i + 1) * n], dloss[i] * ref_dcg_dy[i * n:(i + 1) * n], rtol=1e-5, atol=1e-6, ) op = core.CreateOperator( "LambdaRankNdcg", ["y", "r", "session_lengths"], ["loss", "dy"], use_ndcg_as_loss=True, use_idcg_normalization=False, use_exp_gain=False, ) workspace.RunOperatorOnce(op) loss = workspace.blobs["loss"] dy = workspace.blobs["dy"] np.testing.assert_allclose(loss, ref_dcg_loss_no_exp, rtol=1e-5, atol=1e-6) np.testing.assert_allclose(dy, ref_dcg_dy_no_exp, rtol=1e-5, atol=1e-6) op = core.CreateOperator( "LambdaRankNdcgGradient", ["y", "session_lengths", "dy", "dloss"], ["dy_back"], ) workspace.RunOperatorOnce(op) dy_back = workspace.blobs["dy_back"] for i in range(m): np.testing.assert_allclose( dy_back[i * n:(i + 1) * n], dloss[i] * ref_dcg_dy_no_exp[i * n:(i + 1) * n], rtol=1e-5, atol=1e-6, )
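# The quantity behind the LambdaRankNdcg flags exercised above, restated with
# the textbook formula (a sketch; the operator's exact behavior is what the
# ref_lambda_rank_loss fixture encodes). use_exp_gain toggles the exponential
# gain, and use_idcg_normalization divides by the DCG of the ideal ordering.
import numpy as np

def dcg(relevances, exp_gain=True):
    r = np.asarray(relevances, dtype=np.float64)
    gains = np.power(2.0, r) - 1.0 if exp_gain else r
    discounts = 1.0 / np.log2(np.arange(r.size) + 2.0)  # positions 1..n
    return float((gains * discounts).sum())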
def test_int8_elementwise_sum(self, size, input_channels, batch_size, inputs, inplace, gc, dc): sum_fp32 = core.CreateOperator( "Sum", ["X_{}".format(i) for i in range(inputs)], ["X_0" if inplace else "Y"], ) Xs = [np.random.rand(batch_size, input_channels, size, size).astype( np.float32) for _ in range(inputs)] old_ws_name = workspace.CurrentWorkspace() workspace.SwitchWorkspace("_device_check_", True) Xi_scales = [] Xi_zero_points = [] for i, X in enumerate(Xs): workspace.FeedBlob("X_{}".format(i), X, dc[0]) if X.min() >= 0: Xi_scales.append(np.absolute(X).max() / 0xFF) Xi_zero_points.append(0) else: Xi_scales.append(np.absolute(X).max() / 0x7F) Xi_zero_points.append(128) workspace.RunOperatorOnce(sum_fp32) Y = workspace.FetchBlob("X_0" if inplace else "Y") if Y.min() >= 0: Y_scale = np.absolute(Y).max() / 0xFF Y_zero_point = 0 else: Y_scale = np.absolute(Y).max() / 0x7F Y_zero_point = 128 workspace.ResetWorkspace() net = caffe2_pb2.NetDef() for i, Xi in enumerate(Xs): workspace.FeedBlob("Xi_{}".format(i), Xi, dc[1]) sw2nhwc = core.CreateOperator( "NCHW2NHWC", ["Xi_{}".format(i)], ["Xi_{}_nhwc".format(i)], device_option=dc[1] ) quantize = core.CreateOperator( "Int8Quantize", ["Xi_{}_nhwc".format(i)], ["Xi_{}_quantized".format(i)], engine="DNNLOWP", device_option=dc[1], Y_zero_point=Xi_zero_points[i], Y_scale=Xi_scales[i], ) net.op.extend([sw2nhwc, quantize]) sum_op = core.CreateOperator( "Int8Sum", ["Xi_{}_quantized".format(i) for i in range(inputs)], ["Xi_0_quantized" if inplace else "Y_quantized"], engine="DNNLOWP", device_option=dc[1], Y_zero_point=Y_zero_point, Y_scale=Y_scale, ) dequantize = core.CreateOperator( "Int8Dequantize", ["Xi_0_quantized" if inplace else "Y_quantized"], ["Y_nhwc"], engine="DNNLOWP", device_option=dc[1], ) sw2nchw = core.CreateOperator( "NHWC2NCHW", ["Y_nhwc"], ["Y_out"], device_option=dc[1] ) net.op.extend([sum_op, dequantize, sw2nchw]) workspace.RunNetOnce(net) Y_out = workspace.FetchBlob("Y_out") MSE = np.square(np.subtract(Y, Y_out)).mean() if MSE > 0.005: print(Y.flatten()) print(Y_out.flatten()) print(np.max(np.abs(Y_out - Y))) print("MSE", MSE) self.assertTrue(False) workspace.SwitchWorkspace(old_ws_name)
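# The inline scale/zero-point rule from the test above, pulled out as a
# standalone helper (same arithmetic, not a library API): non-negative
# tensors map onto the full uint8 range with zero_point 0, signed tensors
# onto half the range with zero_point 128.
import numpy as np

def choose_qparams(x):
    if x.min() >= 0:
        return np.absolute(x).max() / 0xFF, 0
    return np.absolute(x).max() / 0x7F, 128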
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create CNNModelHelper object train_model = cnn.CNNModelHelper( order="NCHW", name="resnet50", use_cudnn=True, cudnn_exhaustive_search=True, ws_nbytes_limit=(args.cudnn_workspace_limit_mb * 1024 * 1024), ) num_shards = args.num_shards shard_id = args.shard_id if num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", exit_nets=None) else: rendezvous = None # Model building functions def create_resnet50_model_ops(model, loss_scale): [softmax, loss] = resnet.create_resnet50( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, label="label", no_bias=True, ) loss = model.Scale(loss, scale=loss_scale) model.Accuracy([softmax, "label"], "accuracy") return [loss] # SGD def add_parameter_update_ops(model): model.AddWeightDecay(args.weight_decay) ITER = model.Iter("ITER") stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) LR = model.net.LearningRate( [ITER], "LR", base_lr=float(job.get_parameter('base_learning_rate')), policy="step", stepsize=stepsz, gamma=0.1, ) AddMomentumParameterUpdate(model, LR) # Input. Note that the reader must be shared with all GPUs. 
reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, ) # Create parallelized model data_parallel_model.Parallelize_GPU( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnet50_model_ops, param_update_builder_fun=add_parameter_update_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=True, ) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_model = cnn.CNNModelHelper(order="NCHW", name="resnet50_test", use_cudnn=True, cudnn_exhaustive_search=True) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, ) data_parallel_model.Parallelize_GPU( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_resnet50_model_ops, param_update_builder_fun=None, devices=gpus, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Run the training one epoch a time epoch = 0 while epoch < args.num_epochs: epoch, test_accuracy = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # send metric accuracy_channel.send(epoch, test_accuracy) job.progress(epoch, total=args.num_epochs)
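# The LR schedule shared by these Train variants, restated (a sketch, not
# part of the scripts): the step size is the number of iterations in 30
# epochs, so the "step" policy decays the learning rate by gamma=0.1 every
# 30 epochs.
def lr_step_size(epoch_size, total_batch_size, num_shards, epochs_per_step=30):
    return int(epochs_per_step * epoch_size / total_batch_size / num_shards)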
def test_int8_relu(self, size, input_channels, batch_size, inplace, gc, dc): relu_fp32 = core.CreateOperator( "Relu", ["X"], ["Y"] if not inplace else ["X"], device_option=dc[0] ) X = np.random.rand( batch_size, input_channels, size, size).astype(np.float32) - 0.5 # go away from the origin point to avoid kink problems X += 0.02 * np.sign(X) X[X == 0.0] += 0.02 if X.min() >=0: scale = np.absolute(X).max() / 0xFF zero_point = 0 else: scale = np.absolute(X).max() / 0x7F zero_point = 128 old_ws_name = workspace.CurrentWorkspace() workspace.SwitchWorkspace("_device_check_", True) workspace.FeedBlob("X", X, dc[0]) workspace.RunOperatorOnce(relu_fp32) Y = workspace.FetchBlob("X" if inplace else "Y") workspace.ResetWorkspace() sw2nhwc = core.CreateOperator( "NCHW2NHWC", ["Xi"], ["Xi_nhwc"], device_option=dc[1] ) quantize = core.CreateOperator( "Int8Quantize", ["Xi_nhwc"], ["Xi_quantized"], engine="DNNLOWP", device_option=dc[1], Y_zero_point=zero_point, Y_scale=scale, ) relu = core.CreateOperator( "Int8Relu", ["Xi_quantized"], ["Y_quantized"] if not inplace else ["Xi_quantized"], engine="DNNLOWP", device_option=dc[1], ) dequantize = core.CreateOperator( "Int8Dequantize", ["Y_quantized"] if not inplace else ["Xi_quantized"], ["Y_nhwc"], engine="DNNLOWP", device_option=dc[1], ) sw2nchw = core.CreateOperator( "NHWC2NCHW", ["Y_nhwc"], ["Y_out"], device_option=dc[1] ) net = caffe2_pb2.NetDef() net.op.extend([sw2nhwc, quantize, relu, dequantize, sw2nchw]) workspace.FeedBlob("Xi", X, dc[1]) workspace.RunNetOnce(net) Y_out = workspace.FetchBlob("Y_out") MSE = np.square(np.subtract(Y, Y_out)).mean() if MSE > 0.005: print(Y.flatten()) print(Y_out.flatten()) print(np.max(np.abs(Y_out - Y))) print("MSE", MSE) self.assertTrue(False) workspace.SwitchWorkspace(old_ws_name)
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="resnet50", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id if num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", exit_nets=None) else: rendezvous = None # Model building functions def create_resnet50_model_ops(model, loss_scale): [softmax, loss] = resnet.create_resnet50( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, label="label", no_bias=True, ) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) optimizer.add_weight_decay(model, args.weight_decay) optimizer.build_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1) # Input. Note that the reader must be shared with all GPUs. 
reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, ) # Create parallelized model data_parallel_model.Parallelize_GPU( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnet50_model_ops, optimizer_builder_fun=add_optimizer, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=True, ) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="resnet50_test", arg_scope=test_arg_scope) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, ) data_parallel_model.Parallelize_GPU( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_resnet50_model_ops, param_update_builder_fun=None, devices=gpus, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Run the training one epoch a time while epoch < args.num_epochs: epoch = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # Save the model for each epoch SaveModel(args, train_model, epoch) model_path = "%s/%s_" % (args.file_store_path, args.save_model_name) # remove the saved model from the previous epoch if it exists if os.path.isfile(model_path + str(epoch - 1) + ".mdl"): os.remove(model_path + str(epoch - 1) + ".mdl")
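# The checkpoint-name convention parsed by these scripts ("<prefix>_<epoch>.mdl"),
# pulled out into a hypothetical helper for clarity:
def epoch_from_checkpoint(path):
    last = path.split('_')[-1]
    return int(last[:-4]) if last.endswith('.mdl') else None

# e.g. epoch_from_checkpoint("resnet50_gpu8_12.mdl") -> 12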
def test_fc_num0(self, seed, m, k, n, use_packed): """ Test numerics, fix a dimension and determine the ranges of error. Use Fp16FCAcc32NNPI as a reference. """ W = "W_packed" if use_packed else "W0" dtype = np.float32 pred_net = caffe2_pb2.NetDef() pred_net.name = "pred" pred_net.external_input.extend(["X", W, "b0"]) pred_net.external_output.append("Y") pred_net.op.add().CopyFrom( core.CreateOperator( "FbFCPacked" if use_packed else "FC", ["X", W, "b0"], ["Y"], ) ) pred_net_ref = caffe2_pb2.NetDef() pred_net_ref.name = "pred" pred_net_ref.external_input.extend(["X", W, "b0"]) pred_net_ref.external_output.append("Y") pred_net_ref.op.add().CopyFrom( core.CreateOperator( "Fp16FCAcc32NNPI", ["X", W, "b0"], ["Y"], ) ) workspace.SwitchWorkspace("glow_test_ws", True) workspace.ResetWorkspace() W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(np.float32) b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(np.float32) workspace.FeedBlob("W0", W0) workspace.FeedBlob("b0", b0) workspace.RunOperatorOnce( core.CreateOperator( "FbGemmPack", ['W0'], ['W_packed'], no_packing=True, ) ) pred_net_onnxified = onnxifi_caffe2_net(pred_net, {"X": (m, k)}, debug=True, adjust_batch=False, use_onnx=False) num_onnxified_ops = sum( 1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op) np.testing.assert_equal(num_onnxified_ops, 1) X0 = np.random.rand(m, k).astype(dtype) - 0.5 workspace.FeedBlob("X", X0) workspace.CreateNet(pred_net_onnxified) workspace.CreateNet(pred_net_ref) workspace.RunNet(pred_net_onnxified.name) Y_glow = workspace.FetchBlob('Y') # Run caffe2 net workspace.RunNet(pred_net_ref.name) Y_c2 = workspace.FetchBlob('Y') diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8)) rowdiff = np.max(diff, axis=1) n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL]) if n_offenders > 0: print_test_debug_info("fc", { "seed": seed, "use_packed": use_packed, "m": m, "k": k, "n": n, "X": X0.shape, "W0": W0.shape, "b0": b0.shape, "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff, "rowdiff": rowdiff}) assert 0
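# The acceptance gate above, restated: the per-row maximum relative error of
# the Glow result against the Fp16FCAcc32NNPI reference must stay within
# GLOW_MATMUL_RTOL (a module-level tolerance defined elsewhere); any
# offending row triggers the debug dump and fails the test.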
def test_exception(self): op = CreatePythonOperator(MainOpFunctionThatThrowsRuntimeError, [], []) with self.assertRaises(RuntimeError): workspace.RunOperatorOnce(op)
def _lengths_ref(X, Y): ref_op = core.CreateOperator(ref_op_name, ["X", "Y"], "out") workspace.FeedBlob("X", X) workspace.FeedBlob("Y", Y) workspace.RunOperatorOnce(ref_op) return workspace.FetchBlob("out")
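# _lengths_ref closes over ref_op_name from its enclosing scope, acting as a
# reference-implementation hook: it feeds X and Y, runs the reference
# operator once, and returns its "out" blob for comparison against the
# operator under test.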
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) assert \ epoch_iters > 0, \ "Epoch size must be larger than batch size times shard count" args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="ban-pc-resnet50", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id # Expect interfaces to be comma separated. # Use of multiple network interfaces is not yet complete, # so simply use the first one in the list. interfaces = args.distributed_interfaces.split(",") # Rendezvous using MPI when run with mpirun if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict(kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None # Model configs for constructing model with open(args.model_config) as f: model_config = yaml.safe_load(f) # Model building functions def create_target_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = add_se_model(model, model_config, "data", is_test=False) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') loss = add_pc_loss(model, model_config, pred, 'label') brew.accuracy(model, ['softmax', 'label'], 'accuracy') return [loss] def add_optimizer(model): ''' stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1 ) ''' 
optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd( model, base_learning_rate=args.base_learning_rate, momentum=model_config['solver']['momentum'], nesterov=model_config['solver']['nesterov'], policy=model_config['solver']['lr_policy'], power=model_config['solver']['power'], max_iter=model_config['solver']['max_iter'], ) return opt # Define add_image_input function. # Depends on the "train_data" argument. # Note that the reader will be shared between all GPUs. reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=False, ) def add_post_sync_ops(model): """Add ops applied after initial parameter sync.""" for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT]) # Create parallelized model data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_target_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=False, cpu_device=args.use_cpu, shared_model=args.use_cpu, combine_spatial_bn=args.use_cpu, ) if args.model_parallel: # Shift half of the activations to another GPU assert workspace.NumCudaDevices() >= 2 * args.num_gpus activations = data_parallel_model_utils.GetActivationBlobs(train_model) data_parallel_model_utils.ShiftActivationDevices( train_model, activations=activations[len(activations) // 2:], shifts={g: args.num_gpus + g for g in range(args.num_gpus)}, ) data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) # Add test model, if specified test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="ban-pc-resnet50_test", arg_scope=test_arg_scope, init_params=False) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_target_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 # load the pre-trained model and reset epoch if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) # Sync the model params data_parallel_model.FinalizeAfterCheckpoint(train_model) # reset epoch. 
load_model_path should end with *_X.mdl, # where X is the epoch number last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "log/{}/resnet50_gpu{}_b{}_L{}_lr{:.2f}_v2".format( args.dataset_name, args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) # Load pretrained param_init_net load_init_net_multigpu(args) # Run the training one epoch a time best_accuracy = 0 while epoch < args.num_epochs: epoch, best_accuracy = RunEpoch( args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog, best_accuracy, ) # Save the model for each epoch SaveModel(args, train_model, epoch) model_path = "%s/%s_" % (args.file_store_path, args.save_model_name) # remove the saved model from the previous epoch if it exists if os.path.isfile(model_path + str(epoch - 1) + ".mdl"): os.remove(model_path + str(epoch - 1) + ".mdl")
def test_fc_with_axis(self, n, m, c, h, w, axis, gc, dc): X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5 k = reduce((lambda x, y: x * y), [n, c, h, w][axis - 4:]) nn = reduce((lambda x, y: x * y), [n, c, h, w][:axis]) W = np.random.rand(m, k).astype(np.float32) - 0.5 b = np.random.rand(m).astype(np.float32) - 0.5 dY = np.random.rand(nn, m).astype(np.float32) - 0.5 op0 = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"], axis=axis, device_option=dc[0]) op0_bw = core.CreateOperator('FCGradient', ['X', 'W', 'dY'], ["dW", "db"], axis=axis, device_option=dc[0]) workspace.ResetWorkspace() workspace.FeedBlob('X', X, dc[0]) workspace.FeedBlob('W', W, dc[0]) workspace.FeedBlob('b', b, dc[0]) workspace.RunOperatorOnce(op0) Y0 = workspace.FetchBlob('Y') workspace.FeedBlob('dY', dY, dc[0]) workspace.RunOperatorOnce(op0_bw) dW0 = workspace.FetchBlob('dW') db0 = workspace.FetchBlob('db') op1 = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"], axis=axis, device_option=dc[1]) op1_bw = core.CreateOperator('FCGradient', ['X', 'W', 'dY'], ["dW", "db"], axis=axis, device_option=dc[1]) workspace.SwitchWorkspace("_device_check_", True) workspace.FeedBlob('X', X, dc[1]) workspace.FeedBlob('W', W, dc[1]) workspace.FeedBlob('b', b, dc[1]) workspace.RunOperatorOnce(op1) Y1 = workspace.FetchBlob('Y') workspace.FeedBlob('dY', dY, dc[1]) workspace.RunOperatorOnce(op1_bw) dW1 = workspace.FetchBlob('dW') db1 = workspace.FetchBlob('db') Y0 = Y0.flatten() Y1 = Y1.flatten() if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): print(Y1) print(Y0) print(np.max(np.abs(Y1 - Y0))) self.assertTrue(False) dW0 = dW0.flatten() dW1 = dW1.flatten() if not np.allclose(dW0, dW1, atol=0.01, rtol=0.01): print(dW1) print(dW0) print(np.max(np.abs(dW1 - dW0))) self.assertTrue(False) db0 = db0.flatten() db1 = db1.flatten() if not np.allclose(db0, db1, atol=0.01, rtol=0.01): print(db1) print(db0) print(np.max(np.abs(db1 - db0))) self.assertTrue(False)
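# The shape bookkeeping in the test above, restated: FC's axis argument
# collapses X into a 2-D matrix before the matmul. A minimal numpy reference
# under that reading (a sketch, not caffe2's kernel):
import numpy as np

def fc_reference(X, W, b, axis):
    nn = int(np.prod(X.shape[:axis]))  # rows: leading dims collapsed
    k = int(np.prod(X.shape[axis:]))   # cols: trailing dims collapsed
    return X.reshape(nn, k).dot(W.T) + b  # output shape (nn, m)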
def test_merge_multi_map_feature_tensors(self): op = core.CreateOperator("MergeMultiMapFeatureTensors", [ "in1_lengths", "in1_keys", "in1_values_lengths", "in1_values_keys", "in1_values_values", "in2_lengths", "in2_keys", "in2_values_lengths", "in2_values_keys", "in2_values_values", ], [ "out_lengths", "out_keys", "out_values_lengths", "out_values_keys", "out_values_values" ]) # Input 1. workspace.FeedBlob("in1_lengths", np.array([1, 2], dtype=np.int32)) workspace.FeedBlob("in1_keys", np.array([11, 12, 13], dtype=np.int64)) workspace.FeedBlob("in1_values_lengths", np.array([2, 2, 2], dtype=np.int32)) workspace.FeedBlob( "in1_values_keys", np.array([111, 112, 121, 122, 131, 132], dtype=np.int64)) workspace.FeedBlob( "in1_values_values", np.array([11.1, 11.2, 12.1, 12.2, 13.1, 13.2], dtype=np.float64)) # Input 2. workspace.FeedBlob("in2_lengths", np.array([2, 1], dtype=np.int32)) workspace.FeedBlob("in2_keys", np.array([14, 15, 16], dtype=np.int64)) workspace.FeedBlob("in2_values_lengths", np.array([2, 2, 2], dtype=np.int32)) workspace.FeedBlob( "in2_values_keys", np.array([141, 142, 151, 152, 161, 162], dtype=np.int64)) workspace.FeedBlob( "in2_values_values", np.array([14.1, 14.2, 15.1, 15.2, 16.1, 16.2], dtype=np.float64)) workspace.RunOperatorOnce(op) np.testing.assert_array_equal(workspace.FetchBlob("out_lengths"), np.array([3, 3], dtype=np.int32)) np.testing.assert_array_equal( workspace.FetchBlob("out_keys"), np.array([11, 14, 15, 12, 13, 16], dtype=np.int64)) np.testing.assert_array_equal( workspace.FetchBlob("out_values_lengths"), np.array([2, 2, 2, 2, 2, 2], dtype=np.int32)) np.testing.assert_array_equal( workspace.FetchBlob("out_values_keys"), np.array( [111, 112, 141, 142, 151, 152, 121, 122, 131, 132, 161, 162], dtype=np.int64)) np.testing.assert_array_equal( workspace.FetchBlob("out_values_values"), np.array([ 11.1, 11.2, 14.1, 14.2, 15.1, 15.2, 12.1, 12.2, 13.1, 13.2, 16.1, 16.2 ], dtype=np.float64))
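# MergeMultiMapFeatureTensors semantics, restated from the assertions above:
# the output walks the examples in order and, per example, concatenates
# input 1's features followed by input 2's, so example 0 contributes keys
# [11] + [14, 15] and example 1 contributes [12, 13] + [16], with each
# feature's nested key/value map carried along unchanged.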