def inc_current_step(self):
    """Advance the stored step counter by one.

    Reads the current value through ``self.get_current_step()`` and feeds the
    incremented value, wrapped in a one-element array, into the blob named by
    ``self.global_step``.
    """
    next_step = self.get_current_step() + 1
    workspace.FeedBlob(self.global_step, np.array([next_step]))
def feed(b, v):
    """Feed value ``v`` into the blob named ``str(b)``.

    When the enclosing ``ws`` is ``None`` the value goes to the global
    workspace; otherwise the blob is created on ``ws`` and fed there.
    """
    blob_name = str(b)
    if ws is not None:
        ws.create_blob(blob_name)
        ws.blobs[blob_name].feed(v)
        return
    workspace.FeedBlob(blob_name, v)
def preprocess_samples(
    self, samples: Samples, minibatch_size: int
) -> List[TrainingDataPage]:
    """Shuffle ``samples``, normalize them and cut them into minibatch pages.

    A preprocessing net is built that normalizes states, next states, actions,
    next actions and the flattened possible next actions, then is run once.
    The fetched matrices are sliced into consecutive ``TrainingDataPage``
    objects of exactly ``minibatch_size`` rows; a trailing partial batch is
    dropped.

    Args:
        samples: the samples to convert; shuffled in place first.
        minibatch_size: number of rows per returned page.

    Returns:
        The list of full-size training data pages.
    """
    samples.shuffle()

    net = core.Net("gridworld_preprocessing")
    C2.set_net(net)
    preprocessor = PreprocessorNet(True)

    def normalized_blob(dict_list, array_name, parameters, norm_name):
        # Stack the per-sample dicts, then add the normalization ops to the net.
        stacked = StackedAssociativeArray.from_dict_list(dict_list, array_name)
        matrix, _ = preprocessor.normalize_sparse_matrix(
            stacked.lengths,
            stacked.keys,
            stacked.values,
            parameters,
            norm_name,
            False,
            False,
        )
        return matrix

    state_matrix = normalized_blob(
        samples.states, "states", self.normalization, "state_norm"
    )
    next_state_matrix = normalized_blob(
        samples.next_states, "next_states", self.normalization,
        "next_state_norm"
    )
    action_matrix = normalized_blob(
        samples.actions, "action", self.normalization_action, "action_norm"
    )
    next_action_matrix = normalized_blob(
        samples.next_actions, "next_action", self.normalization_action,
        "next_action_norm"
    )

    propensities = np.array(
        samples.propensities, dtype=np.float32).reshape(-1, 1)
    rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)

    # Flatten the per-sample possible next actions, remembering how many
    # belong to each sample so they can be re-grouped per minibatch below.
    pnas_lengths_list = []
    pnas_flat: List[List[str]] = []
    for sample_pnas in samples.possible_next_actions:
        pnas_lengths_list.append(len(sample_pnas))
        pnas_flat.extend(sample_pnas)
    stacked_pnas = StackedAssociativeArray.from_dict_list(
        pnas_flat, "possible_next_actions")

    pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
    pna_lens_blob = "pna_lens_blob"
    workspace.FeedBlob(pna_lens_blob, pnas_lengths)

    possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
        stacked_pnas.lengths,
        stacked_pnas.keys,
        stacked_pnas.values,
        self.normalization_action,
        "possible_next_action_norm",
        False,
        False,
    )
    state_pnas_blob = preprocessor.concat_states_and_possible_actions(
        next_state_matrix, possible_next_actions_matrix, pna_lens_blob)

    workspace.RunNetOnce(net)

    states_ndarray = workspace.FetchBlob(state_matrix)
    actions_ndarray = workspace.FetchBlob(action_matrix)
    next_states_ndarray = workspace.FetchBlob(next_state_matrix)
    next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
    possible_next_actions_ndarray = workspace.FetchBlob(
        possible_next_actions_matrix)
    next_state_pnas_concat = workspace.FetchBlob(state_pnas_blob)
    time_diffs = np.ones(len(states_ndarray))

    # Discounted sum of each sample's reward timeline, when timelines exist.
    episode_values = None
    if samples.reward_timelines is not None:
        episode_values = np.zeros(rewards.shape, dtype=np.float32)
        for row, timeline in enumerate(samples.reward_timelines):
            for time_diff, reward in timeline.items():
                episode_values[row, 0] += reward * (DISCOUNT**time_diff)

    total_rows = states_ndarray.shape[0]
    pages = []
    pnas_cursor = 0
    for begin in range(0, total_rows, minibatch_size):
        stop = begin + minibatch_size
        if stop > total_rows:
            # Only full minibatches are emitted.
            break
        pnas_stop = pnas_cursor + np.sum(pnas_lengths[begin:stop])
        batch_pnas = possible_next_actions_ndarray[pnas_cursor:pnas_stop]
        pnas_cursor = pnas_stop

        if episode_values is None:
            batch_episode_values = None
        else:
            batch_episode_values = episode_values[begin:stop]

        page = TrainingDataPage(
            states=states_ndarray[begin:stop],
            actions=actions_ndarray[begin:stop],
            propensities=propensities[begin:stop],
            rewards=rewards[begin:stop],
            next_states=next_states_ndarray[begin:stop],
            next_actions=next_actions_ndarray[begin:stop],
            possible_next_actions=StackedArray(
                pnas_lengths[begin:stop], batch_pnas),
            not_terminals=(pnas_lengths[begin:stop] > 0).reshape(-1, 1),
            episode_values=batch_episode_values,
            time_diffs=time_diffs[begin:stop],
            possible_next_actions_lengths=pnas_lengths[begin:stop],
            next_state_pnas_concat=next_state_pnas_concat,
        )
        pages.append(page)
    return pages
def testShapeInferenceSoftmaxWithLoss(self):
    """Compare inferred and actual tensor info for SoftmaxWithLoss.

    Covers four configurations: 2D labels, 1D labels, an extra weight tensor,
    and a spatial model on image-shaped input.
    """

    def feed_logits_and_labels(label_size):
        # Logits are [batch_size, num_classes]; labels are in [0, num_classes).
        workspace.FeedBlob(
            "logits", np.random.rand(4, 3).astype(np.float32))
        workspace.FeedBlob(
            "labels",
            np.random.randint(low=0, high=3, size=label_size).astype(np.int32),
        )

    model = cnn.CNNModelHelper()
    model.SoftmaxWithLoss(["logits", "labels"], ["softmax", "loss"])

    # Labels given as a [batch_size, 1] column.
    feed_logits_and_labels((4, 1))
    self.InferTensorRunAndCompare(model)

    # Labels given as a flat vector of length batch_size.
    feed_logits_and_labels(4)
    self.InferTensorRunAndCompare(model)

    # Same model with a second op that also takes a weight tensor.
    model.SoftmaxWithLoss(
        ["logits", "labels", "weight_tensor"], ["softmax", "loss"])
    feed_logits_and_labels(4)
    workspace.FeedBlob(
        "weight_tensor", np.random.rand(4).astype(np.float32))
    self.InferTensorRunAndCompare(model)

    # Spatial variant on a fresh model.
    model = cnn.CNNModelHelper()
    image = np.random.rand(32, 19, 33, 28).astype(np.float32)
    workspace.FeedBlob("img", image)
    image_labels = (np.random.rand(32, 33, 28) * 19).astype(np.int32)
    workspace.FeedBlob("img_labels", image_labels)
    model.SoftmaxWithLoss(
        ["img", "img_labels"], ["softmax_img", "loss"], spatial=1)
    self.InferTensorRunAndCompare(model)
def test_bn(self, seed, size, input_channels, batch_size):
    """Compare an onnxified SpatialBN net against the fake-fp16 reference op.

    Builds a single-op ``SpatialBN`` net (NCHW, ``is_test=True``), lowers it
    with ``onnxifi_caffe2_net`` and checks that exactly one ``Onnxifi`` op
    results. The lowered net and a reference net using the fake fp16 op are
    then run on the same random inputs and their ``Y`` outputs compared after
    casting to float16.

    Args:
        seed: value passed to ``np.random.seed``.
        size: spatial height and width of the input.
        input_channels: number of channels (length of scale/bias/mean/var).
        batch_size: leading dimension of the input.

    Raises:
        AssertionError: if the outputs differ; debug info is printed first.
    """
    workspace.ResetWorkspace()
    np.random.seed(seed)

    order = "NCHW"
    epsilon = 1e-3
    bn_inputs = ["X", "scale", "bias", "mean", "var"]

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(bn_inputs)
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SpatialBN", bn_inputs, ["Y"],
            order=order, is_test=True, epsilon=epsilon))

    if GLOW_LOWERED_BATCHNORM:
        refopname = "SpatialBNFakeLoweredFp16NNPI"
    else:
        refopname = "SpatialBNFakeFp16NNPI"

    # NOTE(review): both nets are named "pred", so the CreateNet/RunNet calls
    # below address them by the same name -- confirm this is intended.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred"
    pred_net_ref.external_input.extend(bn_inputs)
    # The reference op writes "Y"; the original declared "X" as the output.
    pred_net_ref.external_output.append("Y")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            refopname, bn_inputs, ["Y"],
            order=order, is_test=True, epsilon=epsilon))

    # Offsets keep scale and var at or above 0.5.
    scale = np.random.rand(input_channels).astype(np.float32) + 0.5
    bias = np.random.rand(input_channels).astype(np.float32) - 0.5
    mean = np.random.randn(input_channels).astype(np.float32)
    var = np.random.rand(input_channels).astype(np.float32) + 0.5
    X = np.random.rand(
        batch_size, input_channels, size, size).astype(np.float32) - 0.5

    workspace.FeedBlob("scale", scale)
    workspace.FeedBlob("bias", bias)
    workspace.FeedBlob("mean", mean)
    workspace.FeedBlob("var", var)

    # Use for reference to debug
    # Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {
            "X": [batch_size, input_channels, size, size],
            "scale": [input_channels],
            "bias": [input_channels],
            "mean": [input_channels],
            "var": [input_channels]
        },
        debug=True,
        adjust_batch=False,
        use_onnx=False)
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        diff = np.abs(Y_glow - Y_c2).astype(np.float16)
        print_test_debug_info(
            "bn",
            {
                "seed": seed,
                "scale": scale,
                "bias": bias,
                "mean": mean,
                "var": var,
                "Y_np": Y_c2,
                "Y_glow": Y_glow,
                "diff": diff,
                "rowwise_diff": np.max(np.abs(diff), -1)
            })
        # An explicit raise still fails the test under ``python -O``, where
        # ``assert`` statements are stripped.
        raise AssertionError(
            "SpatialBN output of the onnxified net does not match the "
            "fake-fp16 reference output")
def test_sum_reduce(self, gc, dc):
    """Check ``SumReduceLike`` against NumPy sums for several broadcast shapes.

    Each case feeds ``X`` and ``Y``, runs the operator once and compares
    ``out`` with the sum of ``X`` over the axes that ``Y`` lacks. All float32
    cases also run ``assertDeviceChecks``.

    Args:
        gc: unused here; part of the test signature.
        dc: device options passed to ``assertDeviceChecks``.
    """

    def run_sum_reduce_like(X, Y, **op_args):
        # Build the op, feed both inputs, run it once and fetch the result.
        op = core.CreateOperator(
            "SumReduceLike", ["X", "Y"], "out", broadcast=1, **op_args)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        return op, workspace.FetchBlob("out")

    # Set broadcast and no axis, i.e. broadcasting last dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(4, 5).astype(np.float32)
    op, out = run_sum_reduce_like(X, Y)
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=0)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # Set broadcast with axis=0, i.e. broadcasting leading dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(2, 3).astype(np.float32)
    op, out = run_sum_reduce_like(X, Y, axis=0)
    res = np.sum(X, axis=3)
    res = np.sum(res, axis=2)
    np.testing.assert_array_almost_equal(out, res, decimal=3)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # broadcasting intermediate dimensions
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(3, 4).astype(np.float32)
    op, out = run_sum_reduce_like(X, Y, axis=1)
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=2)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # reducing everything down to a single element (float64 inputs);
    # no device checks are run for this case
    X = np.random.rand(2, 3, 4, 500).astype(np.float64)
    Y = np.random.rand(1).astype(np.float64)
    op, out = run_sum_reduce_like(X, Y)
    res = np.array(np.sum(X))
    np.testing.assert_array_almost_equal(out, res, decimal=0)

    # broadcasting with single elem dimensions at both ends
    X = np.random.rand(2, 3, 4, 5).astype(np.float32)
    Y = np.random.rand(1, 3, 4, 1).astype(np.float32)
    op, out = run_sum_reduce_like(X, Y)
    res = np.sum(X, axis=0)
    res = np.sum(res, axis=2).reshape(Y.shape)
    np.testing.assert_array_almost_equal(out, res)
    self.assertDeviceChecks(dc, op, [X, Y], [0])

    # fp64 is not supported with the CUDA/HIP op
    # The original filter used ``!= CUDA or != HIP``, which is true for every
    # device and therefore filtered nothing.
    # NOTE(review): as written this re-checks the float32 op from the case
    # above, not the fp64 one -- confirm which op was meant.
    dc_cpu_only = [
        d for d in dc
        if d.device_type not in (caffe2_pb2.CUDA, caffe2_pb2.HIP)
    ]
    self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    """Check that memonger's gradient blob sharing keeps results unchanged.

    Builds a five-layer FC net under the ``name_x`` scope, optimizes it with
    ``memonger.share_grad_blobs`` with and without activation sharing, and
    verifies that (a) the blob count does not grow, (b) the final activation
    ``name_x/fc5`` survives, and (c) loss and the ``fc1_w`` gradient match the
    unoptimized net.
    """
    m = cnn.CNNModelHelper()
    with core.NameScope("name_x"):
        layer = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
        for depth in range(2, 6):
            layer = m.FC(
                layer, "fc{}".format(depth),
                dim_in=output_dim, dim_out=output_dim)
        fc5 = layer
        activated = fc5.Relu([], fc5)
        pred = activated.Softmax([], "pred")
        xent = pred.LabelCrossEntropy(["label"], ["xent"])
        xent.AveragedLoss([], "loss")
    input_to_grad = m.AddGradientOperators(["name_x/loss"])

    def share(with_activations):
        return memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(m.param_to_grad.values()),
            "name_x/",
            share_activations=with_activations,
        )

    def fetch_loss_and_grad():
        fetched_loss = workspace.FetchBlob("name_x/loss")
        fetched_grad = workspace.FetchBlob(
            str(input_to_grad["name_x/fc1_w"]))
        return fetched_loss, fetched_grad

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = share(False)
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    optim_proto_wacts = share(True)
    blobs_wact_optim = count_blobs(optim_proto_wacts)
    self.assertLessEqual(blobs_wact_optim, blobs_after)

    # Check that the last activations are not shared
    self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
    self.assertTrue(
        has_blob(optim_proto_wacts, "name_x/fc5"),
        "Dont remap final activation",
    )

    # Test networks produce exactly same gradients
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size, )).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss, grad = fetch_loss_and_grad()

    workspace.RunNetOnce(optim_proto)
    optimized_loss, optimized_grad = fetch_loss_and_grad()
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)

    # Run with the forward optimization
    workspace.RunNetOnce(optim_proto_wacts)
    optimized_loss, optimized_grad = fetch_loss_and_grad()
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
def run_model(self, V, gpu_devices, cpu_indices):
    """Build a data-parallel sparse-gather model, run it ten times, fetch weights.

    Args:
        V: number of rows in the ``vecs`` table, which is filled as ``[V, 16]``.
        gpu_devices: device ids passed to ``Parallelize_GPU``; the batch of 64
            is split evenly across them.
        cpu_indices: when truthy, ``Gather`` runs under a CPU device scope on
            the shared ``vecs`` blob and the result is copied to the GPU;
            otherwise ``vecs`` is copied to each GPU and gathered there.

    Returns:
        A two-element list: the fetched vecs blob (``self.vecs`` when
        ``cpu_indices`` is truthy, else ``gpu_0/gpuvecs``) and ``gpu_0/fc_w``.
    """
    def input_builder_fun(model):
        # Inputs are fed directly into the workspace below.
        return None

    def model_build_fun(model, loss_scale):
        if cpu_indices:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                gathered_cpu = model.net.Gather(
                    [self.vecs, 'indices'], 'gathered_cpu')

            gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
        else:
            gpu_vecs = model.param_init_net.CopyCPUToGPU(
                self.vecs, "gpuvecs",
            )
            # Register the per-GPU copy as a parameter so it receives updates.
            model.params.append(gpu_vecs)
            gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
        flattened = model.Flatten(gathered, "flattened")
        fc = model.FC(flattened, "fc", 16 * 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                # Dense gradient: param = param * ONE + param_grad * LR.
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # Sparse gradient (GradientSlice): momentum SGD on the
                # touched rows, with a zero-initialized momentum blob.
                param_momentum = model.param_init_net.ConstantFill(
                    [param],
                    param + '_momentum',
                    value=0.0,
                )
                model.net.SparseMomentumSGDUpdate(
                    [
                        param_grad.values,
                        param_momentum,
                        LR,
                        param,
                        param_grad.indices,
                    ],
                    [
                        param_grad.values, param_momentum, param
                    ],
                    momentum=0.1,
                    nesterov=0,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )

    # Shared CPU-side blobs: iteration counter, learning rate and the table.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            self.vecs = model.param_init_net.UniformFill(
                [], "vecs", shape=[V, 16])
            if cpu_indices:
                model.params.append(self.vecs)
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    if cpu_indices:
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                for param in model.GetParams():
                    param_grad = model.param_to_grad[param]
                    model.ScatterWeightedSum([param, self.ONE_CPU,
                                              param_grad.indices,
                                              param_grad.values,
                                              self.LR],
                                             self.vecs)
    else:
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        # Taking a prefix of a permutation yields distinct indices.
        full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
            batch_size, 16
        )
        full_labels = full_indices[:, 0] % 2
        batch_per_device = batch_size // len(gpu_devices)

        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en, :].astype(np.int32)
            labels = full_labels[st:en].astype(np.float32)

            # Indices are fed on the device where Gather is built.
            device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
            if not cpu_indices:
                device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

            with core.DeviceScope(device_for_indices):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = np.random.rand(V, 16).astype(np.float32)
            workspace.FeedBlob(
                self.vecs,
                orig_vecs
            )
            if not cpu_indices:
                for g in gpu_devices:
                    workspace.FeedBlob(
                        "gpu_{}/gpuvecs".format(g),
                        orig_vecs,
                        device_option=core.DeviceOption(caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        if len(gpu_devices) == 2:
            if not cpu_indices:
                idx = workspace.FetchBlob("gpu_0/indices")
                idx = list(idx.flatten())
                n = len(idx)
                nu = len(set(idx))
                assert n == nu, "We cannot have duplicate indices"

    # Sanity check to see the vecs were updated
    self.assertFalse(
        np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
    return [workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
            workspace.FetchBlob("gpu_0/fc_w")]
def run_model(self, V, gpu_devices):
    """Run a data-parallel model whose FC weight, bias and input are all gathered.

    Three CPU blobs are copied to each GPU and gathered with the same
    ``indices`` blob; the gathered results feed an ``FC`` op as weights, bias
    and input. After every run the gathered-gradient blobs of the first two
    vectors are checked to have as many rows as there are indices.

    Args:
        V: number of rows used for the first two vectors.
        gpu_devices: device ids passed to ``Parallelize_GPU``; the batch of 32
            is split evenly across them.
    """
    def input_builder_fun(model):
        # Inputs are fed directly into the workspace below.
        return None

    def model_build_fun(model, loss_scale):
        gpu_vecs_gathered = []
        gpu_vecs = []
        for num, vec in enumerate(self.vecs):
            gpu_vec = model.param_init_net.CopyCPUToGPU(
                vec, 'gpuvec_{}'.format(num),
            )
            # The third vector (the FC input) is not registered as a parameter.
            if num != 2:
                model.params.append(gpu_vec)
            gpu_vecs.append(gpu_vec)
        for num, gpu_vec in enumerate(gpu_vecs):
            gpu_vec_gathered = model.net.Gather(
                [gpu_vec, 'indices'],
                ['gpu_vec_gathered_{}'.format(num)]
            )
            gpu_vecs_gathered.append(gpu_vec_gathered)

        assert len(gpu_vecs_gathered) == 3

        # FC inputs are ordered (X, W, b): input, weights, bias.
        fc = model.net.FC(
            [
                gpu_vecs_gathered[2],
                gpu_vecs_gathered[0],
                gpu_vecs_gathered[1],
            ],
            ['fc'],
        )
        _, loss = model.net.SoftmaxWithLoss(
            [fc, 'label'],
            ['ce_loss', 'avg_loss'],
            only_loss=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.net.Print(loss, [], limit=10)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # NOTE(review): the sparse branch weights the gradient by ONE
                # rather than LR -- confirm this is intentional.
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad.values,
                        ONE,
                    ],
                    param,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )
    batch_size = 32
    batch_per_device = batch_size // len(gpu_devices)

    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            # NOTE(review): the note below gives the bias shape as (V), but
            # both of the first two fills use shape [V, 16]; the blobs are
            # overwritten with the fed values on the first iteration.
            ''' self.vecs consists of 3 big blobs on which we call Gather: 1) FC weights, shape=(V, 16) 2) FC bias, shape=(V) 3) FC input, shape=(batch_per_device, 16) '''
            self.vecs = [
                model.param_init_net.UniformFill(
                    [], "vec_{}".format(num), shape=[V, 16])
                for num in range(2)
            ]
            self.vecs.append(
                model.param_init_net.UniformFill(
                    [],
                    "vec_2",
                    shape=[batch_per_device, 16]
                )
            )
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        # Only the first two vectors are copied back; the last is skipped.
        for num, vec in enumerate(self.vecs[:-1]):
            model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

    # Each run has same input, independent of number of gpus
    for i in range(0, 10):
        # Re-seeding every iteration makes each iteration use the same indices.
        np.random.seed(2603)
        full_indices = np.random.permutation(V)[:batch_size].reshape(
            batch_size
        )
        full_labels = full_indices[:] % batch_per_device

        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en].astype(np.int32)
            labels = full_labels[st:en].astype(np.int32)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            # NOTE(review): the third array is (V, 16), not the
            # (batch_per_device, 16) shape that vec_2 was filled with.
            orig_vecs = [
                np.random.rand(V, 16).astype(np.float32),
                np.random.rand(V).astype(np.float32),
                np.random.rand(V, 16).astype(np.float32),
            ]
            for vec, orig_vec in zip(self.vecs, orig_vecs):
                workspace.FeedBlob(
                    vec,
                    orig_vec
                )
            for g in gpu_devices:
                for num, orig_vec in enumerate(orig_vecs):
                    workspace.FeedBlob(
                        "gpu_{}/gpuvec_{}".format(g, num),
                        orig_vec,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

        idx = workspace.FetchBlob('gpu_0/indices')
        # Gradients w.r.t. the gathered weights and bias on every device.
        grad_slices = [
            workspace.FetchBlob(
                'gpu_{}/gpu_vec_gathered_{}_grad'.format(g, num))
            for g in gpu_devices for num in range(2)
        ]
        for grad_slice in grad_slices:
            # print (len(idx), len(grad_slice))
            assert len(idx) == len(grad_slice), (
                'Number of indices {} is not same as number of gradient '
                'slices {}. This might lead to illegal memory access'.format(
                    len(idx), len(grad_slice)
                )
            )
def run_model(self, devices, gpu):
    """Helper for test_equiv: train a data-parallel LSTM and return a weight.

    Builds an LSTM with a simple averaged-difference loss, parallelizes it
    over ``devices`` (on CPU when ``gpu`` is falsy), runs ten iterations on
    seeded random data split evenly across devices, and returns the fetched
    ``partest/i2h_w`` blob of device 0.
    """

    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        per_device = self.batch_per_device
        workspace.FeedBlob(
            core.ScopedBlobReference("seq_lengths"),
            np.array([self.T] * per_device, dtype=np.int32)
        )
        # Zero initial hidden and cell states.
        for state_blob in ("hidden_init", "cell_init"):
            model.param_init_net.ConstantFill(
                [],
                state_blob,
                value=0.0,
                shape=[1, per_device, self.hidden_dim]
            )

        output, _, _, _ = rnn_cell.LSTM(
            model=model,
            input_blob="data",
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=self.input_dim,
            dim_out=self.hidden_dim,
            scope="partest",
        )

        # A silly loss function
        dist = model.Sub([output, "target"], "dist")
        averaged = model.AveragedLoss(dist, "loss")
        scaled = model.Scale(averaged, "loss_scaled", scale=loss_scale)
        return [scaled]

    def param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

        assert len(model.GetParams()) == \
            len(model.params) // len(model._devices)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        name="recurrent_test{}".format(devices),
    )

    self.T = 8
    self.batch_size = 64
    self.input_dim = 8
    self.hidden_dim = 31
    self.batch_per_device = self.batch_size // len(devices)

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=devices,
        optimize_gradient_memory=True,
        cpu_device=not gpu,
    )

    # Change all initialization to be ConstantFills so that
    # the everything is deterministic
    for init_op in model.param_init_net.Proto().op:
        if init_op.type.endswith('Fill'):
            init_op.type = 'ConstantFill'

    # Each run has same input, independent of number of gpus
    np.random.seed(20150210)
    for iteration in range(10):
        full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
        full_target = np.random.rand(
            self.T, self.batch_size, self.hidden_dim
        )

        for position, device_id in enumerate(devices):
            lo = position * self.batch_per_device
            hi = lo + self.batch_per_device
            data = full_data[:, lo:hi, :].astype(np.float32)
            targets = full_target[:, lo:hi, :].astype(np.float32)
            device = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(device):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    data
                )
                workspace.FeedBlob(
                    "{}_{}/target".format(model._device_prefix, device_id),
                    targets
                )

        if iteration == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob(
        "{}_0/partest/i2h_w".format(model._device_prefix))
def run_model(self, devices, gpu):
    """Helper for test_equiv: train a data-parallel FC model, return ``fc_w``.

    Builds a one-layer FC/sigmoid model with an SGD optimizer, parallelizes it
    over ``devices`` (on CPU when ``gpu`` is falsy), and runs ten iterations on
    seeded random data. Each iteration also feeds ``sync_num`` on device 0 and
    checks that every device sees that value after the run.
    """

    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        flat = model.FlattenToVec(fc, "fc_fl")
        activated = model.Sigmoid(flat, "sigm")
        distance = model.SquaredL2Distance([activated, "label"], "sq")
        averaged = model.AveragedLoss(distance, "loss")
        scaled = model.Scale(averaged, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [scaled]

    def add_optimizer(model):
        return optimizer.build_sgd(
            model,
            0.1,
            policy="fixed",
            max_gradient_norm=5.0,
            allow_lr_injection=True,
        )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="test{}".format(devices),
    )
    on_cpu = not gpu
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        optimizer_builder_fun=add_optimizer,
        devices=devices,
        cpu_device=on_cpu,
        shared_model=on_cpu,
        combine_spatial_bn=on_cpu,
    )
    data_parallel_model.AddBlobSync(model, ["sync_num"])

    # Light test for LR names
    lr_names = data_parallel_model.GetLearningRateBlobNames(model)
    self.assertGreater(len(lr_names), 0)

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    batch_per_device = batch_size // len(devices)
    for iteration in range(10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])

        for position, device_id in enumerate(devices):
            lo = position * batch_per_device
            hi = lo + batch_per_device
            data = full_data[lo:hi, :].astype(np.float32)
            labels = full_labels[lo:hi].astype(np.float32)
            device = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(device):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    data
                )
                workspace.FeedBlob(
                    "{}_{}/label".format(model._device_prefix, device_id),
                    labels
                )

        if iteration == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        expected_sync = iteration * 2
        workspace.FeedBlob(
            model._device_prefix + "_0/sync_num",
            np.array([expected_sync]).astype(np.float32),
            device_option=core.DeviceOption(model._device_type, 0))
        workspace.RunNet(model.net.Proto().name)

        # Test AddBlobSync
        for device_id in model._devices:
            sync_blob = workspace.FetchBlob(
                model._device_prefix + "_{}/sync_num".format(device_id))
            sync = sync_blob[0]
            self.assertTrue(abs(sync - expected_sync) < 0.01)

    return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
def test_prepare_normalization_and_normalize(self):
    """Identify normalization parameters, normalize, and check each feature type.

    Parameters are identified per feature from the data returned by
    ``preprocessing_util.read_data()``. The dense matrix is then normalized
    through a ``PreprocessorNet`` and each feature's output columns are
    validated according to its identified type.
    """
    features, feature_value_map = preprocessing_util.read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(values, 10)
        for name, values in feature_value_map.items()
    }
    for k, v in normalization_parameters.items():
        if k == CONTINUOUS:
            self.assertEqual(v.feature_type, CONTINUOUS)
            self.assertIs(v.boxcox_lambda, None)
            self.assertIs(v.boxcox_shift, None)
        elif k == BOXCOX:
            self.assertEqual(v.feature_type, BOXCOX)
            self.assertIsNot(v.boxcox_lambda, None)
            self.assertIsNot(v.boxcox_shift, None)
        else:
            # The feature name must be its type, or its type plus "_2".
            # The original asserted `... or v.feature_type + "_2" + k`, whose
            # right operand is a non-empty string and so always truthy.
            self.assertIn(
                k,
                (v.feature_type, v.feature_type + "_2"),
                'feature {} was identified as type {}'.format(
                    k, v.feature_type),
            )

    norm_net = core.Net("net")
    C2.set_net(norm_net)
    preprocessor = PreprocessorNet(norm_net, False)
    input_matrix = np.zeros([10000, len(features)], dtype=np.float32)
    for i, feature in enumerate(features):
        input_matrix[:, i] = feature_value_map[feature]
    input_matrix_blob = 'input_matrix_blob'
    # Feed an empty placeholder while the net is built; the real data is fed
    # just before the net runs.
    workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32))
    output_blob, _ = preprocessor.normalize_dense_matrix(
        input_matrix_blob, features, normalization_parameters, '')
    workspace.FeedBlob(input_matrix_blob, input_matrix)
    workspace.RunNetOnce(norm_net)
    normalized_feature_matrix = workspace.FetchBlob(output_blob)

    # Split the output matrix back into per-feature column groups; ENUM
    # features take one column per possible value, all others one column.
    normalized_features = {}
    on_column = 0
    for feature in features:
        norm = normalization_parameters[feature]
        if norm.feature_type == ENUM:
            column_size = len(norm.possible_values)
        else:
            column_size = 1
        normalized_features[feature] = \
            normalized_feature_matrix[:, on_column:(on_column + column_size)]
        on_column += column_size

    self.assertTrue(
        all(
            np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
            for parameter in normalization_parameters.values()
        ))
    for k, v in six.iteritems(normalized_features):
        self.assertTrue(np.all(np.isfinite(v)))
        feature_type = normalization_parameters[k].feature_type
        if feature_type == identify_types.PROBABILITY:
            sigmoidv = special.expit(v)
            self.assertTrue(
                np.all(
                    np.logical_and(
                        np.greater(sigmoidv, 0), np.less(sigmoidv, 1))))
        elif feature_type == identify_types.ENUM:
            possible_values = normalization_parameters[k].possible_values
            self.assertEqual(v.shape[0], len(feature_value_map[k]))
            self.assertEqual(v.shape[1], len(possible_values))

            possible_value_map = {
                possible_value: i
                for i, possible_value in enumerate(possible_values)
            }

            # Each row must be hot at the index of the original value.
            for i, row in enumerate(v):
                original_feature = feature_value_map[k][i]
                self.assertEqual(possible_value_map[original_feature],
                                 np.where(row == 1)[0][0])
        elif feature_type == identify_types.QUANTILE:
            # NOTE(review): v[0] is only the first row, so with one column
            # per feature this checks a single value -- confirm whether
            # v[:, 0] was intended.
            for i, feature in enumerate(v[0]):
                original_feature = feature_value_map[k][i]
                expected = self._value_to_quantile(
                    original_feature, normalization_parameters[k].quantiles)
                self.assertAlmostEqual(feature, expected, 2)
        elif feature_type == identify_types.BINARY:
            pass
        elif feature_type == identify_types.CONTINUOUS or \
                feature_type == identify_types.BOXCOX:
            one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
            zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
            zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
            self.assertTrue(
                np.all(zero_mean),
                'mean of feature {} is {}, not 0'.format(k, np.mean(v)))
            self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
        else:
            raise NotImplementedError()
def CheckSimple(self, op, inputs, input_to_check, outputs_with_grads,
                grad_ops=None):
    """Check an operator's gradient against a central-difference estimate.

    The loss and analytic gradient come from ``self.GetLossAndGrad``. The
    numeric estimate perturbs each element of the checked input by
    ``+/- self._stepsize`` and takes the central difference of the loss.

    Args:
        op: the operator to be checked. Its ``device_option`` is overwritten
            with the checker's device option.
        inputs: the input data as numpy arrays. The checked input is
            perturbed in place while running and restored afterwards.
        input_to_check: index specifying which input blob to check.
        outputs_with_grads: indices specifying which output blobs take part
            in the gradient check.
        grad_ops: the gradient operators. If not given, they are obtained
            from the gradient registry.

    Returns:
        A tuple ``(passed, grad, grad_estimate)``. ``passed`` is True when
        every element satisfies
        ``|grad - grad_estimate| <= max(|grad|, |grad_estimate|, 1) * threshold``.
    """
    # Entering the checker workspace
    old_ws_name = workspace.CurrentWorkspace()
    switched = self._workspace_name != old_ws_name
    if switched:
        workspace.SwitchWorkspace(self._workspace_name, True)

    try:
        op.device_option.CopyFrom(self._device_option)
        if grad_ops is None:
            grad_ops = core.GradientRegistry.GetGradientDefs(op)

        checked_input = inputs[input_to_check]
        dims_to_check = checked_input.size
        # First, feed in the input.
        for i, arr in enumerate(inputs):
            workspace.FeedBlob(op.input[i], arr, self._device_option)

        # Get the loss and gradient for the original.
        input_name = op.input[input_to_check]
        loss, grad = self.GetLossAndGrad(
            op, grad_ops, checked_input, input_name, outputs_with_grads)
        grad_estimate = np.zeros_like(checked_input)
        for current_dim in range(dims_to_check):
            # Positive gradient
            checked_input.flat[current_dim] += self._stepsize
            pos_loss, _ = self.GetLossAndGrad(
                op, grad_ops, checked_input, input_name, outputs_with_grads)
            # Negative gradient
            checked_input.flat[current_dim] -= self._stepsize * 2
            neg_loss, _ = self.GetLossAndGrad(
                op, grad_ops, checked_input, input_name, outputs_with_grads)
            # Recover the value
            checked_input.flat[current_dim] += self._stepsize
            grad_estimate.flat[current_dim] = (
                pos_loss - neg_loss) / self._stepsize / 2

        # Now, check correctness. The tolerance is relative to the larger of
        # the two gradients, with a floor of 1.
        scale = np.maximum(
            np.maximum(np.abs(grad), np.abs(grad_estimate)), 1)
        fail_mat = (np.abs(grad - grad_estimate) > scale * self._threshold)
        ret = not np.any(fail_mat)
    finally:
        # After finishing, cleaning up things. This runs on errors too, so a
        # failing op cannot leave the caller in the checker workspace.
        if switched:
            # We reset the workspace to make sure everything intermediate is
            # cleaned up. Note that there is no need to delete a workspace -
            # when empty it takes a very limited amount of memory.
            workspace.ResetWorkspace()
            workspace.SwitchWorkspace(old_ws_name)
    return ret, grad, grad_estimate
def assertReferenceChecks(
    self,
    device_option,
    op,
    inputs,
    reference,
    input_device_options=None,
    threshold=1e-4,
    output_to_grad=None,
    grad_reference=None,
    atol=None,
    outputs_to_check=None,
):
    """
    This runs the reference Python function implementation
    (effectively calling `reference(*inputs)`), and compares that
    to the output of the operator, with an absolute/relative tolerance
    given by the `threshold` parameter (`atol` defaults to `threshold`
    when it is not supplied).

    Useful for checking the implementation matches the Python
    (typically NumPy) implementation of the same functionality.

    `reference` must return a tuple or list of arrays. When
    `outputs_to_check` is empty, every operator output is checked and the
    number of reference outputs must equal the number of operator outputs.
    When `grad_reference` is given, `output_to_grad` must be given too.

    Returns the list of fetched operator outputs that were checked.

    Usage example:

        @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs)
        def test_softsign(self, X, inplace, gc, dc):
            op = core.CreateOperator(
                "Softsign", ["X"], ["X" if inplace else "Y"])

            def softsign(X):
                return (X / (1 + np.abs(X)),)

            self.assertReferenceChecks(gc, op, [X], softsign)
    """
    # Work on a private copy so the caller's operator is left untouched.
    op = copy.deepcopy(op)
    op.device_option.CopyFrom(device_option)

    with temp_workspace():
        if (len(op.input) > len(inputs)):
            raise ValueError(
                'must supply an input for each input on the op: %s vs %s' %
                (op.input, inputs))
        _input_device_options = input_device_options or \
            core.InferOpBlobDevicesAsDict(op)[0]
        for (n, b) in zip(op.input, inputs):
            workspace.FeedBlob(
                n,
                b,
                device_option=_input_device_options.get(n, device_option)
            )
        net = core.Net("opnet")
        net.Proto().op.extend([op])
        test_shape_inference = False
        try:
            (shapes, types) = workspace.InferShapesAndTypes([net])
            test_shape_inference = True
        except RuntimeError as e:
            # Temporarily catch runtime errors when inferring shape
            # and type info
            logging.warning(str(e))
            if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1':
                # Bare raise keeps the original traceback intact.
                raise
        workspace.RunNetOnce(net)
        reference_outputs = reference(*inputs)
        if not isinstance(reference_outputs, (tuple, list)):
            raise RuntimeError(
                "You are providing a wrong reference implementation. A "
                "proper one should return a tuple/list of numpy arrays.")
        if not outputs_to_check:
            self.assertEqual(len(reference_outputs), len(op.output))
            outputs_to_check = list(range(len(op.output)))
        # The absolute tolerance falls back to the relative one.
        if atol is None:
            atol = threshold
        outs = []
        for (output_index, ref) in zip(outputs_to_check, reference_outputs):
            output_blob_name = op.output[output_index]
            output = workspace.FetchBlob(output_blob_name)
            if output.dtype.kind in ('S', 'O'):
                # String / object outputs are compared exactly.
                np.testing.assert_array_equal(output, ref)
            else:
                np.testing.assert_allclose(
                    output, ref, atol=atol, rtol=threshold,
                    err_msg=(
                        'Output {0} is not matching the reference'.format(
                            output_blob_name,
                        )),
                )
            if test_shape_inference:
                self._assertInferTensorChecks(
                    output_blob_name, shapes, types, output)
            outs.append(output)
        if grad_reference is not None:
            assert output_to_grad is not None, \
                "If grad_reference is set, " \
                "output_to_grad has to be set as well"

            with core.DeviceScope(device_option):
                self._assertGradReferenceChecks(
                    op, inputs, reference_outputs,
                    output_to_grad, grad_reference,
                    threshold=threshold)
        return outs
def _prepare_rnn(t, n, dim_in, create_rnn, outputs_with_grads, forget_bias,
                 memory_optim=False, forward_only=False, drop_states=False,
                 T=None, two_d_initial_states=None, dim_out=None,
                 num_states=2, **kwargs):
    """Build an RNN inside a fresh 'external' ModelHelper and feed its states.

    For every entry of ``dim_out`` (one per layer) ``num_states`` random
    float32 initial-state blobs are registered as external inputs and fed.
    ``create_rnn`` is then invoked under the "test_name_scope" name scope,
    the parameter init net is run once, and random sequence lengths in
    ``[1, t]`` are fed for a batch of size ``n``.

    When ``dim_out`` is None it defaults to ``[dim_in]``; when
    ``two_d_initial_states`` is None it is chosen at random.

    Returns ``(outputs, model.net, states + [input_blob])``.
    """
    if dim_out is None:
        dim_out = [dim_in]
    print("Dims: ", t, n, dim_in, dim_out)

    model = ModelHelper(name='external')

    if two_d_initial_states is None:
        two_d_initial_states = np.random.randint(2)

    def generate_input_state(n, d):
        # 2-D states are (n, d); otherwise a leading singleton axis is added.
        state_shape = (n, d) if two_d_initial_states else (1, n, d)
        return np.random.randn(*state_shape).astype(np.float32)

    states = []
    for layer_id, d in enumerate(dim_out):
        for i in range(num_states):
            state_blob = model.net.AddExternalInput(
                "state_{}/layer_{}".format(i, layer_id))
            states.append(state_blob)
            initial_value = generate_input_state(n, d).astype(np.float32)
            workspace.FeedBlob(state_blob, initial_value)

    # Due to convoluted RNN scoping logic we make sure that things
    # work from a namescope
    with scope.NameScope("test_name_scope"):
        input_blob, seq_lengths = model.net.AddScopedExternalInputs(
            'input_blob', 'seq_lengths')

        outputs = create_rnn(
            model, input_blob, seq_lengths, states,
            dim_in=dim_in,
            dim_out=dim_out,
            scope="external/recurrent",
            outputs_with_grads=outputs_with_grads,
            memory_optimization=memory_optim,
            forget_bias=forget_bias,
            forward_only=forward_only,
            drop_states=drop_states,
            static_rnn_unroll_size=T,
            **kwargs
        )

    workspace.RunNetOnce(model.param_init_net)

    random_lengths = np.random.randint(1, t + 1, size=(n, )).astype(np.int32)
    workspace.FeedBlob(seq_lengths, random_lengths)
    return outputs, model.net, states + [input_blob]
def test_fused_ln_quantize(self, seed, batch_size, size, epsilon,
                           elementwise_affine):
    """Compare LayerNorm + Int8Quantize (onnxified) with the fused reference.

    The reference net uses the single LayerNormInt8QuantizeFakeNNPI operator;
    the net under test chains LayerNorm and Int8Quantize and is lowered with
    ``onnxifi_caffe2_net``. The quantized data, scale and zero point of both
    "Y_q" results must agree, otherwise debug info is dumped and the test
    fails.
    """
    np.random.seed(seed)

    # Reset the workspace
    workspace.ResetWorkspace()

    axis = 1
    dims = np.array(([batch_size, size]))
    X = np.random.uniform(size=dims).astype(np.float32) - 0.5
    gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
    beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

    # Quantization parameters are derived from the layer-normalized input.
    Y = self._layernorm_transform(X)
    scale, zp = self._get_scale_zp(Y)

    def op_inputs():
        # gamma / beta are only consumed when the affine transform is on.
        return ["X", "gamma", "beta"] if elementwise_affine else ["X"]

    # Net under test: LayerNorm followed by Int8Quantize.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "gamma", "beta"])
    pred_net.external_output.extend(["Y_q"])
    layernorm_op = core.CreateOperator(
        "LayerNorm",
        op_inputs(),
        ["Y", "mean", "rstd"],
        axis=axis,
        epsilon=epsilon,
        elementwise_affine=elementwise_affine,
    )
    pred_net.op.add().CopyFrom(layernorm_op)
    quantize_op = core.CreateOperator(
        "Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp)
    pred_net.op.add().CopyFrom(quantize_op)

    print(pred_net)

    # Reference net: a single fused operator.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred_ref"
    pred_net_ref.external_input.extend(["X", "gamma", "beta"])
    pred_net_ref.external_output.extend(["Y_q"])
    fused_op = core.CreateOperator(
        "LayerNormInt8QuantizeFakeNNPI",
        op_inputs(),
        ["Y_q", "mean", "rstd"],
        axis=axis,
        epsilon=epsilon,
        elementwise_affine=elementwise_affine,
        Y_scale=scale,
        Y_zero_point=zp,
    )
    pred_net_ref.op.add().CopyFrom(fused_op)

    shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hits,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    # The whole net is expected to collapse into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    for blob_name, value in (("X", X), ("gamma", gamma), ("beta", beta)):
        workspace.FeedBlob(blob_name, value)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchInt8Blob("Y_q")

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchInt8Blob("Y_q")

    matches = (
        np.allclose(Y_glow.data, Y_c2.data)
        and Y_glow.scale == Y_c2.scale
        and Y_glow.zero_point == Y_c2.zero_point
    )
    if not matches:
        diff_Y = np.abs(
            Y_glow.data.astype(np.float32) - Y_c2.data.astype(np.float32))
        debug_info = {
            "seed": seed,
            "size": size,
            "batch_size": batch_size,
            "epsilon": epsilon,
            "gamma": gamma,
            "beta": beta,
            "elementwise_affine": elementwise_affine,
            "X": X,
            "Y_glow": Y_glow,
            "Y_c2": Y_c2,
            "diff_Y": diff_Y,
        }
        print_test_debug_info("layernorm", debug_info)
        assert 0
def init_fun(worker_coordinator, global_coordinator):
    """Feed the string 'initialized' into the 'data' blob.

    Both coordinator arguments are accepted but not used.
    """
    blob_name, marker = 'data', 'initialized'
    workspace.FeedBlob(blob_name, marker)
def test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine):
    """Compare onnxified LayerNorm with the LayerNormFakeFP16NNPI reference.

    The reference net is run on the 2-D input; the onnxified net is run after
    re-feeding "X" with an extra leading axis of size 1. The two "Y" outputs
    must be close, otherwise debug info is dumped and the test fails.
    """
    np.random.seed(seed)

    # Reset the workspace
    workspace.ResetWorkspace()

    axis = 1
    dims = np.array(([batch_size, size]))
    X = np.random.uniform(size=dims).astype(np.float32) - 0.5
    gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
    beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

    def make_net(net_name, op_type):
        # Both nets share the same interface; only the operator type differs.
        net_def = caffe2_pb2.NetDef()
        net_def.name = net_name
        net_def.external_input.extend(["X", "gamma", "beta"])
        net_def.external_output.extend(["Y", "mean", "rstd"])
        net_def.op.add().CopyFrom(
            core.CreateOperator(
                op_type,
                ["X", "gamma", "beta"] if elementwise_affine else ["X"],
                ["Y", "mean", "rstd"],
                axis=axis,
                epsilon=epsilon,
                elementwise_affine=elementwise_affine,
            )
        )
        return net_def

    pred_net = make_net("pred", "LayerNorm")
    pred_net_ref = make_net("pred_ref", "LayerNormFakeFP16NNPI")

    shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hits,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    # The LayerNorm op is expected to be lowered into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    for blob_name, value in (("X", X), ("gamma", gamma), ("beta", beta)):
        workspace.FeedBlob(blob_name, value)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")

    # The onnxified net is fed the same data with a leading axis of size 1.
    dims1 = np.array(([1, *dims]))
    X_glow = X.reshape(dims1)
    workspace.FeedBlob("X", X_glow)

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if np.allclose(Y_glow, Y_c2):
        return

    diff_Y = np.abs(Y_glow - Y_c2)
    debug_info = {
        "seed": seed,
        "size": size,
        "batch_size": batch_size,
        "epsilon": epsilon,
        "gamma": gamma,
        "beta": beta,
        "elementwise_affine": elementwise_affine,
        "X": X,
        "Y_glow": Y_glow,
        "Y_c2": Y_c2,
        "diff_Y": diff_Y,
    }
    print_test_debug_info("layernorm", debug_info)
    assert 0
# Attach an SGD optimizer with momentum and a step learning-rate policy.
optimizer.build_sgd(
    train_model,
    base_learning_rate=0.01,
    policy="step",
    stepsize=15000,
    gamma=0.1,
    momentum=0.9,
)
# Single-argument print(...) calls behave identically on Python 2 and 3.
print("Added training operators")

##################################################################################
#### Run the training procedure

# Initialization.
print("Initializing dataset")
train_dataset = jdh.Jester_Dataset(dictionary_file=train_dictionary, seq_size=10)
print("finished initializing dataset")

# Prime the workspace with some data so we can run init net once
# (only the first batch is consumed).
for image, label in train_dataset.read(batch_size=1):
    workspace.FeedBlob("data", image)
    workspace.FeedBlob("label", label)
    break

print("Running param init net once")
# run the param init network once
workspace.RunNetOnce(train_model.param_init_net)
# create the network
workspace.CreateNet(train_model.net, overwrite=True)

# Set the total number of iterations and track the accuracy and loss
accuracy = []
loss = []
cnt = 0

print("Beginning training")
from __future__ import division from __future__ import print_function from __future__ import unicode_literals from caffe2.python import core, workspace import glob import json import numpy as np example_files = glob.glob('example_*.c2s') for ex in example_files: print('Running example file', ex) with open(ex, 'r') as f: inits = json.loads(f.readline()) net_name = f.readline().strip() outputs = json.loads(f.readline()) CU = core.C.CompilationUnit() CU.define(f.read()) # Initialize workspace with required inputs for name, shape, dt in inits: workspace.FeedBlob(name, np.random.rand(*shape).astype(np.dtype(dt))) net = CU.create_net(net_name) net.run() print('Success! Interesting outputs:') for output in outputs: print(output, workspace.FetchBlob(output))
def testShapeInferenceTwoClass(self):
    """Run the shape-inference comparison on a MakeTwoClass("v" -> "v2") net."""
    model = cnn.CNNModelHelper(name="twoclass")
    model.MakeTwoClass("v", "v2")

    # Input: 32 random float32 values.
    v = np.random.rand(32).astype(np.float32)
    workspace.FeedBlob("v", v)

    self.InferTensorRunAndCompare(model)
def initialize_gpu_from_weights_file(model, weights_file, gpu_id=0):
    """Initialize a network with ops on a specific GPU.

    If you use CUDA_VISIBLE_DEVICES to target specific GPUs, Caffe2 will
    automatically map logical GPU ids (starting from 0) to the physical GPUs
    specified in CUDA_VISIBLE_DEVICES.

    Every parameter of ``model`` that has a matching entry in the weights
    file is fed (as float32) under the scope of ``gpu_id``, together with its
    '<name>_momentum' entry when present. Entries of the weights file that
    the model does not use are fed under the '__preserve__/' prefix inside a
    CPU scope.
    """
    logger.info('Loading weights from: {}'.format(weights_file))
    ws_blobs = workspace.Blobs()
    src_blobs = load_object(weights_file)

    if 'cfg' in src_blobs:
        saved_cfg = load_cfg(src_blobs['cfg'])
        configure_bbox_reg_weights(model, saved_cfg)
    if 'blobs' in src_blobs:
        # Backwards compat--dictionary used to be only blobs, now they are
        # stored under the 'blobs' key
        src_blobs = src_blobs['blobs']

    # Initialize weights on GPU gpu_id only. An ordered mapping keeps the
    # log output in model order.
    unscoped_param_names = OrderedDict(
        (c2_utils.UnscopeName(str(blob)), True) for blob in model.params
    )

    with c2_utils.NamedCudaScope(gpu_id):
        for unscoped_param_name in unscoped_param_names.keys():
            src_name = unscoped_param_name
            marker_pos = unscoped_param_name.find(']_')
            if marker_pos >= 0 and unscoped_param_name not in src_blobs:
                # Special case for sharing initialization from a pretrained
                # model:
                # If a blob named '_[xyz]_foo' is in model.params and not in
                # the initialization blob dictionary, then load source blob
                # 'foo' into destination blob '_[xyz]_foo'
                src_name = unscoped_param_name[marker_pos + 2:]

            if src_name not in src_blobs:
                logger.info('{:s} not found'.format(src_name))
                continue

            dst_name = core.ScopedName(unscoped_param_name)
            momentum_name = src_name + '_momentum'
            has_momentum = momentum_name in src_blobs
            has_momentum_str = ' [+ momentum]' if has_momentum else ''
            logger.info(
                '{:s}{:} loaded from weights file into {:s}: {}'.format(
                    src_name, has_momentum_str, dst_name,
                    src_blobs[src_name].shape
                )
            )

            if dst_name in ws_blobs:
                # If the blob is already in the workspace, make sure that it
                # matches the shape of the loaded blob
                ws_blob = workspace.FetchBlob(dst_name)
                assert ws_blob.shape == src_blobs[src_name].shape, \
                    ('Workspace blob {} with shape {} does not match '
                     'weights file shape {}').format(
                        src_name,
                        ws_blob.shape,
                        src_blobs[src_name].shape)

            weights = src_blobs[src_name].astype(np.float32, copy=False)
            workspace.FeedBlob(dst_name, weights)
            if has_momentum:
                momentum = src_blobs[momentum_name].astype(
                    np.float32, copy=False)
                workspace.FeedBlob(dst_name + '_momentum', momentum)

    # We preserve blobs that are in the weights file but not used by the current
    # model. We load these into CPU memory under the '__preserve__/' namescope.
    # These blobs will be stored when saving a model to a weights file. This
    # feature allows for alternating optimization of Faster R-CNN in which blobs
    # unused by one step can still be preserved forward and used to initialize
    # another step.
    for src_name in src_blobs.keys():
        is_unused = (
            src_name not in unscoped_param_names
            and not src_name.endswith('_momentum')
            and src_blobs[src_name] is not None
        )
        if not is_unused:
            continue
        with c2_utils.CpuScope():
            workspace.FeedBlob(
                '__preserve__/{:s}'.format(src_name), src_blobs[src_name])
            logger.info(
                '{:s} preserved in workspace (unused)'.format(src_name))
def testShapeInferenceReshape(self):
    """Run the shape-inference comparison on a Reshape with 0 and -1 dims."""
    model = cnn.CNNModelHelper()
    target_shape = [8, 0, -1, 2]
    model.Reshape("X", ["Reshaped", "Old_Shape"], shape=target_shape)

    # Input: random float32 tensor of shape (4, 26, 32).
    x = np.random.rand(4, 26, 32).astype(np.float32)
    workspace.FeedBlob("X", x)

    self.InferTensorRunAndCompare(model)
def _test_index_ops(self, entries, dtype, index_create_op):
    """Exercise the Index* operators end to end for one key dtype.

    Args:
      entries: sequence of at least 8 keys; ``entries[0]`` through
        ``entries[7]`` are used.
      dtype: numpy dtype used for the key arrays.
      index_create_op: name of the operator that creates the index blob.

    The test loads three entries, queries (which adds two more), freezes the
    index, checks its size and stored contents, rebuilds a second index from
    the stored entries, and finally round-trips the index through Save/Load.
    """
    workspace.RunOperatorOnce(core.CreateOperator(
        index_create_op,
        [],
        ['index'],
        max_elements=10))

    my_entries = np.array(
        [entries[0], entries[1], entries[2]], dtype=dtype)

    workspace.FeedBlob('entries', my_entries)
    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexLoad',
        ['index', 'entries'],
        ['index']))

    # Unknown keys (entries[3], entries[4]) get fresh ids while the index is
    # not frozen.
    query1 = np.array(
        [entries[0], entries[3], entries[0], entries[4]],
        dtype=dtype)

    workspace.FeedBlob('query1', query1)
    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexGet',
        ['index', 'query1'],
        ['result1']))
    result1 = workspace.FetchBlob('result1')
    np.testing.assert_array_equal([1, 4, 1, 5], result1)

    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexFreeze',
        ['index'],
        ['index']))

    # Once frozen, unknown keys map to 0 instead of being added.
    query2 = np.array(
        [entries[5], entries[4], entries[0], entries[6], entries[7]],
        dtype=dtype)
    workspace.FeedBlob('query2', query2)
    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexGet',
        ['index', 'query2'],
        ['result2']))
    result2 = workspace.FetchBlob('result2')
    np.testing.assert_array_equal([0, 5, 1, 0, 0], result2)

    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexSize',
        ['index'],
        ['index_size']))
    size = workspace.FetchBlob('index_size')
    # assertEqual: the assertEquals alias is deprecated and was removed in
    # Python 3.12.
    self.assertEqual(size, 6)

    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexStore',
        ['index'],
        ['stored_entries']))
    stored_actual = workspace.FetchBlob('stored_entries')
    new_entries = np.array([entries[3], entries[4]], dtype=dtype)
    np.testing.assert_array_equal(
        np.concatenate((my_entries, new_entries)), stored_actual)

    workspace.RunOperatorOnce(core.CreateOperator(
        index_create_op,
        [],
        ['index2']))

    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexLoad',
        ['index2', 'stored_entries'],
        ['index2'],
        skip_first_entry=1))

    workspace.RunOperatorOnce(core.CreateOperator(
        'IndexSize',
        ['index2'],
        ['index2_size']))
    index2_size = workspace.FetchBlob('index2_size')
    self.assertEqual(index2_size, 5)

    # test serde
    with tempfile.NamedTemporaryFile() as tmp:
        workspace.RunOperatorOnce(core.CreateOperator(
            'Save',
            ['index'],
            [],
            absolute_path=1,
            db_type='minidb',
            db=tmp.name))
        # frees up the blob
        workspace.FeedBlob('index', np.array([]))
        # reloads the index
        workspace.RunOperatorOnce(core.CreateOperator(
            'Load',
            [],
            ['index'],
            absolute_path=1,
            db_type='minidb',
            db=tmp.name))
        query3 = np.array(
            [entries[0], entries[3], entries[0], entries[4], entries[4]],
            dtype=dtype)

        workspace.FeedBlob('query3', query3)
        workspace.RunOperatorOnce(core.CreateOperator(
            'IndexGet',
            ['index', 'query3'],
            ['result3']))
        result3 = workspace.FetchBlob('result3')
        np.testing.assert_array_equal([1, 4, 1, 5, 5], result3)
def test_int8_quantize(self, n, rand_seed):
    """Compare an onnxified Int8 quantize/FC/dequantize net with the
    NNPI reference operators.

    The same net object is first run with the *NNPI operator types, then its
    three ops are retyped to Int8Quantize / Int8FC / Int8Dequantize, lowered
    with ``onnxifi_caffe2_net`` and run again. Both "Y" results must be
    close, otherwise debug info is dumped and the test fails.
    """
    print("n={}, rand_seed={}".format(n, rand_seed))
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    # Inputs: fp16-representable X, identity weights, zero bias.
    X_fp32 = np.random.rand(n, n).astype(np.float16).astype(np.float32)
    W_fp32 = np.identity(n, dtype=np.float32)
    b_fp32 = np.zeros((n, ), dtype=np.float32)

    X_scale, X_zero_point = self._get_scale_zp(X_fp32)

    for blob_name, value in (("X", X_fp32), ("W", W_fp32), ("b", b_fp32)):
        workspace.FeedBlob(blob_name, value)

    pack_weight_op = core.CreateOperator(
        "Int8FCPackWeight",
        ["W"],
        ["W_int8"],
        engine="DNNLOWP",
        save_unpacked_weights=True,
        in_scale=X_scale,
    )
    workspace.RunOperatorOnce(pack_weight_op)

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(
        ["X"],
        ["X_int8"],
        Y_scale=X_scale,
        Y_zero_point=X_zero_point,
    )
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"],
        ["Y_int8"],
        Y_scale=X_scale,
        Y_zero_point=X_zero_point,
    )
    ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchBlob("Y")

    # run onnxifi net: retype the three reference ops in place first
    lowered_types = ("Int8Quantize", "Int8FC", "Int8Dequantize")
    for op_index, op_type in enumerate(lowered_types):
        ref_net.Proto().op[op_index].type = op_type
    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b"],
    )
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)
    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if np.allclose(Y_glow, Y_fbgemm):
        return

    diff_Y = np.abs(Y_glow - Y_fbgemm)
    debug_info = {
        "seed": rand_seed,
        "n": n,
        "X": X_fp32,
        "W": W_fp32,
        "b": b_fp32,
        "Y_fbgemm": Y_fbgemm,
        "Y_glow": Y_glow,
        "diff": diff_Y,
        "maxdiff": diff_Y.max(axis=1),
    }
    print_test_debug_info("int8_fc", debug_info)
    assert 0
def compare_reference(
    self,
    raw_data,
    pct_raw_data,
    pct_mapping,
    pct_upper,
    pct_lower,
    lengths,
):
    """Check the BisectPercentile operator against a pure-Python reference.

    Note that this method takes ``pct_upper`` before ``pct_lower`` while the
    nested reference (and the operator arguments) take lower before upper;
    the values are forwarded in the order each callee expects.
    """

    def bisect_percentile_op_ref(
        raw_data,
        pct_raw_data,
        pct_mapping,
        pct_lower,
        pct_upper,
        lengths
    ):
        results = np.zeros_like(raw_data)
        # Prefix sums of ``lengths``: column j owns the slice
        # [indices[j], indices[j + 1]) of the flat percentile arrays.
        indices = [0]
        for segment_length in lengths:
            indices.append(indices[-1] + segment_length)

        num_rows = len(raw_data)
        num_cols = len(raw_data[0])
        for i in range(num_rows):
            for j in range(num_cols):
                start, end = indices[j], indices[j + 1]
                val = raw_data[i][j]
                pct_raw_data_i = pct_raw_data[start:end]
                pct_lower_i = pct_lower[start:end]
                pct_upper_i = pct_upper[start:end]
                pct_mapping_i = pct_mapping[start:end]

                # Corner cases
                if val < pct_raw_data_i[0]:
                    results[i][j] = 0
                elif val > pct_raw_data_i[-1]:
                    results[i][j] = 1.
                else:
                    # interpolation
                    k = bisect.bisect_left(pct_raw_data_i, val)
                    if pct_raw_data_i[k] == val:
                        results[i][j] = pct_mapping_i[k]
                    else:
                        k = k - 1
                        rise = pct_lower_i[k + 1] - pct_upper_i[k]
                        run = pct_raw_data_i[k + 1] - pct_raw_data_i[k]
                        slope = (rise / run)
                        results[i][j] = pct_upper_i[k] + \
                            slope * (val - pct_raw_data_i[k])

        return results

    workspace.ResetWorkspace()
    workspace.FeedBlob("raw_data", raw_data)

    percentile_op = core.CreateOperator(
        "BisectPercentile",
        ["raw_data"],
        ["pct_output"],
        percentile_raw=pct_raw_data,
        percentile_mapping=pct_mapping,
        percentile_lower=pct_lower,
        percentile_upper=pct_upper,
        lengths=lengths
    )
    workspace.RunOperatorOnce(percentile_op)

    expected_output = bisect_percentile_op_ref(
        raw_data,
        pct_raw_data,
        pct_mapping,
        pct_lower,
        pct_upper,
        lengths
    )
    output = workspace.blobs['pct_output']
    np.testing.assert_array_almost_equal(output, expected_output)
def im_detect_bbox(model, im, timers=None):
    """Generate RetinaNet detections on a single image.

    Args:
        model: object whose ``model.net.Proto().name`` names the net to run.
        im: the image; it is passed to ``blob_utils.get_image_blob`` and its
            ``shape`` is used to clip the predicted boxes.
        timers: optional mapping of timers; the 'im_detect_bbox' and
            'misc_bbox' entries are used. A ``defaultdict(Timer)`` is created
            when omitted.

    Returns:
        A list with ``cfg.MODEL.NUM_CLASSES`` entries. Entry 0 is left as an
        empty list; entry ``c >= 1`` holds the rows of the final detections
        whose class column equals ``c``, restricted to the first five columns
        ([x0, y0, x1, y1, score]).
    """
    if timers is None:
        timers = defaultdict(Timer)
    # Although anchors are input independent and could be precomputed,
    # recomputing them per image only brings a small overhead
    anchors = _create_cell_anchors()
    timers['im_detect_bbox'].tic()
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    # Number of anchors per location.
    A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS)
    inputs = {}
    inputs['data'], im_scale, inputs['im_info'] = \
        blob_utils.get_image_blob(im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)
    # Collect the per-level output blob names; they are replaced by the
    # fetched arrays once the net has run.
    cls_probs, box_preds = [], []
    for lvl in range(k_min, k_max + 1):
        suffix = 'fpn{}'.format(lvl)
        cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix)))
        box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix)))
    for k, v in list(inputs.items()):
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False))

    workspace.RunNet(model.net.Proto().name)
    cls_probs = workspace.FetchBlobs(cls_probs)
    box_preds = workspace.FetchBlobs(box_preds)

    # here the boxes_all are [x0, y0, x1, y1, score]
    boxes_all = defaultdict(list)

    # cnt indexes the fetched per-level arrays in step with lvl.
    cnt = 0
    for lvl in range(k_min, k_max + 1):
        # create cell anchors array
        stride = 2. ** lvl
        cell_anchors = anchors[lvl]

        # fetch per level probability
        cls_prob = cls_probs[cnt]
        box_pred = box_preds[cnt]
        # Split the channel axis into (anchor, class) for the scores and
        # (anchor, 4) for the box deltas.
        cls_prob = cls_prob.reshape((
            cls_prob.shape[0], A, int(cls_prob.shape[1] / A),
            cls_prob.shape[2], cls_prob.shape[3]))
        box_pred = box_pred.reshape((
            box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3]))
        cnt += 1

        if cfg.RETINANET.SOFTMAX:
            # Drop class index 0 -- presumably the background class; confirm.
            cls_prob = cls_prob[:, :, 1::, :, :]

        cls_prob_ravel = cls_prob.ravel()
        # In some cases [especially for very small img sizes], it's possible that
        # candidate_ind is empty if we impose threshold 0.05 at all levels. This
        # will lead to errors since no detections are found for this image. Hence,
        # for lvl 7 which has small spatial resolution, we take the threshold 0.0
        th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0
        candidate_inds = np.where(cls_prob_ravel > th)[0]
        if (len(candidate_inds) == 0):
            continue

        # Keep only the pre_nms_topn highest-scoring candidates (unordered).
        pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N, len(candidate_inds))
        inds = np.argpartition(
            cls_prob_ravel[candidate_inds], -pre_nms_topn)[-pre_nms_topn:]
        inds = candidate_inds[inds]

        # Map the flat indices back to (batch, anchor, class, y, x).
        inds_5d = np.array(np.unravel_index(inds, cls_prob.shape)).transpose()
        classes = inds_5d[:, 2]
        anchor_ids, y, x = inds_5d[:, 1], inds_5d[:, 3], inds_5d[:, 4]
        scores = cls_prob[:, anchor_ids, classes, y, x]

        # Anchor boxes: grid position scaled by the stride, plus the cell
        # anchor selected for each candidate.
        boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32)
        boxes *= stride
        boxes += cell_anchors[anchor_ids, :]

        if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
            box_deltas = box_pred[0, anchor_ids, :, y, x]
        else:
            # NOTE(review): box_pred was reshaped to 5-D above, yet it is
            # indexed with four indices here -- confirm this branch is
            # exercised and behaves as intended.
            box_cls_inds = classes * 4
            box_deltas = np.vstack(
                [box_pred[0, ind:ind + 4, yi, xi]
                 for ind, yi, xi in zip(box_cls_inds, y, x)]
            )
        pred_boxes = (
            box_utils.bbox_transform(boxes, box_deltas)
            if cfg.TEST.BBOX_REG else boxes)
        # Undo the test-time image scaling, then clip to the image.
        pred_boxes /= im_scale
        pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape)
        box_scores = np.zeros((pred_boxes.shape[0], 5))
        box_scores[:, 0:4] = pred_boxes
        box_scores[:, 4] = scores

        # Output class ids start at 1; predicted class index cls - 1 maps to
        # output class cls.
        for cls in range(1, cfg.MODEL.NUM_CLASSES):
            inds = np.where(classes == cls - 1)[0]
            if len(inds) > 0:
                boxes_all[cls].extend(box_scores[inds, :])
    timers['im_detect_bbox'].toc()

    # Combine predictions across all levels and retain the top scoring by class
    timers['misc_bbox'].tic()
    detections = []
    for cls, boxes in list(boxes_all.items()):
        cls_dets = np.vstack(boxes).astype(dtype=np.float32)
        # do class specific nms here
        keep = box_utils.nms(cls_dets, cfg.TEST.NMS)
        cls_dets = cls_dets[keep, :]
        out = np.zeros((len(keep), 6))
        out[:, 0:5] = cls_dets
        out[:, 5].fill(cls)
        detections.append(out)

    # detections (N, 6) format:
    #   detections[:, :4] - boxes
    #   detections[:, 4] - scores
    #   detections[:, 5] - classes
    # NOTE(review): if no level contributed any box, `detections` is an empty
    # list here and np.vstack would raise -- confirm this cannot happen.
    detections = np.vstack(detections)
    # sort all again
    inds = np.argsort(-detections[:, 4])
    detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :]

    # Convert the detections to image cls_ format (see core/test_engine.py)
    num_classes = cfg.MODEL.NUM_CLASSES
    cls_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)]
    for c in range(1, num_classes):
        inds = np.where(detections[:, 5] == c)[0]
        cls_boxes[c] = detections[inds, :5]
    timers['misc_bbox'].toc()

    return cls_boxes
def _run_compare_train_inference(self, model_params):
    """Check that beam-search decoding and the training net agree on scores.

    A seq2seq model is built and checkpointed into a temporary directory and
    loaded into an ensemble beam decoder. For three random source sequences
    the decoder is run twice (the targets must be identical), then the
    decoded targets are fed to the training forward net and its
    'total_loss_scalar' is compared with the beam model score to four
    decimals.

    The temporary directory is removed when the check finishes or fails.
    """
    import shutil  # local import: only needed for temp-dir cleanup

    tmp_dir = tempfile.mkdtemp()
    try:
        model_obj, checkpoint_path = self._build_seq2seq_model(
            model_params,
            tmp_dir=tmp_dir,
            source_vocab_size=20,
            target_vocab_size=20,
            num_gpus=0,
            batch_size=2,
        )
        assert model_obj is not None

        translate_params = dict(
            ensemble_models=[dict(
                source_vocab={i: str(i) for i in range(20)},
                target_vocab={i: str(i) for i in range(20)},
                model_params=model_params,
                model_file=checkpoint_path,
            )],
            decoding_params=dict(
                beam_size=3,
                word_reward=0,
                unk_reward=0,
            ),
        )

        beam_decoder_model = Seq2SeqModelCaffe2EnsembleDecoder(
            translate_params)
        beam_decoder_model.load_models()

        encoder_lengths = 5
        decoder_lengths = 7

        for _ in range(3):
            # Token ids in [3, 19]: after GO_ID (1) and EOS_ID (2).
            # randint's upper bound is exclusive, so high=20 matches the
            # inclusive high=19 of the deprecated random_integers.
            encoder_inputs = np.random.randint(
                low=3,
                high=20,
                size=encoder_lengths,
            )
            targets, _, beam_model_score = beam_decoder_model.decode(
                encoder_inputs,
                decoder_lengths,
            )
            # Decoding must be deterministic for a fixed input.
            targets_2, _, beam_model_score = beam_decoder_model.decode(
                encoder_inputs,
                decoder_lengths,
            )
            self.assertEqual(targets, targets_2)

            # The training net is fed the source sequence reversed, as a
            # column.
            workspace.FeedBlob(
                'encoder_inputs',
                np.array(
                    [list(reversed(encoder_inputs))]
                ).transpose().astype(dtype=np.int32))
            workspace.FeedBlob(
                'encoder_lengths',
                np.array([len(encoder_inputs)]).astype(dtype=np.int32),
            )
            # Decoder input is the target sequence shifted right behind
            # GO_ID.
            decoder_inputs = [seq2seq_util.GO_ID] + targets[:-1]
            workspace.FeedBlob(
                'decoder_inputs',
                np.array([decoder_inputs]).transpose().astype(
                    dtype=np.int32),
            )
            workspace.FeedBlob(
                'decoder_lengths',
                np.array([len(decoder_inputs)]).astype(dtype=np.int32),
            )
            workspace.FeedBlob(
                'targets',
                np.array([targets]).transpose().astype(dtype=np.int32),
            )
            workspace.FeedBlob(
                'target_weights',
                np.array([[1.0] * len(targets)]).astype(dtype=np.float32),
            )

            workspace.RunNet(model_obj.forward_net)
            train_model_score = workspace.FetchBlob('total_loss_scalar')

            np.testing.assert_almost_equal(
                beam_model_score,
                train_model_score,
                decimal=4,
            )
    finally:
        # The checkpoint directory is only needed for the duration of the
        # check; do not leave it behind.
        shutil.rmtree(tmp_dir, ignore_errors=True)
def test_layernorm(self, seed, size, input_channels, batch_size, epsilon):
    """Compare LayerNorm with the LayerNormFakeFP16 reference on 4-D input.

    "Y" from both nets is compared after casting to float16; on a mismatch
    the Y / mean / rstd differences are dumped and the test fails.

    NOTE(review): the onnxified net is only used to count Onnxifi ops; the
    net that is created and run for the "glow" side is ``pred_net`` itself.
    Confirm whether running ``pred_net_onnxified`` was intended.
    """
    np.random.seed(seed)
    # Reset the workspace
    workspace.ResetWorkspace()

    output_names = ["Y", "mean", "rstd"]

    def make_net(net_name, op_type):
        # Both nets share the same interface; only the operator type differs.
        # (The operator's axis argument is left at its default.)
        net_def = caffe2_pb2.NetDef()
        net_def.name = net_name
        net_def.external_input.extend(["X"])
        net_def.external_output.extend(["Y", "mean", "rstd"])
        net_def.op.add().CopyFrom(
            core.CreateOperator(
                op_type,
                ["X"],
                ["Y", "mean", "rstd"],
                epsilon=epsilon,
            )
        )
        return net_def

    pred_net = make_net("pred", "LayerNorm")
    pred_net_ref = make_net("pred_ref", "LayerNormFakeFP16")

    input_shape = [batch_size, input_channels, size, size]
    X = np.random.rand(
        batch_size, input_channels, size, size).astype(np.float32) - 0.5

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": input_shape},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)

    workspace.CreateNet(pred_net)
    workspace.CreateNet(pred_net_ref)

    workspace.RunNet(pred_net_ref.name)
    Y_c2, mean_c2, std_c2 = [
        workspace.FetchBlob(blob_name) for blob_name in output_names
    ]

    workspace.RunNet(pred_net.name)
    Y_glow, mean_glow, std_glow = [
        workspace.FetchBlob(blob_name) for blob_name in output_names
    ]

    if np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        return

    diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
    diff_std = np.abs(std_glow - std_c2).astype(np.float16)
    diff_mean = np.abs(mean_glow - mean_c2).astype(np.float16)
    debug_info = {
        "seed": seed,
        "size": size,
        "input_channels": input_channels,
        "batch_size": batch_size,
        "epsilon": epsilon,
        "X": X,
        "Y_glow": Y_glow,
        "mean_glow": mean_glow,
        "std_glow": std_glow,
        "Y_c2": Y_c2,
        "mean_c2": mean_c2,
        "std_c2": std_c2,
        "diff_Y": diff_Y,
        "diff_mean": diff_mean,
        "diff_std": diff_std,
    }
    print_test_debug_info("layernorm", debug_info)
    assert 0
def Train(args):
    """Build the data-parallel MobileNet nets and run the training loop.

    Reads from ``args``: ``gpus``, ``num_gpus``, ``batch_size``,
    ``num_shards``, ``shard_id``, ``epoch_size``,
    ``cudnn_workspace_limit_mb``, ``num_channels``, ``num_labels``,
    ``weight_decay``, ``base_learning_rate``, ``train_data``,
    ``test_data``, ``db_type``, ``image_size``, ``load_model_path``,
    ``num_epochs``, ``file_store_path`` and ``save_model_name``.

    Side effects: ``args.epoch_size`` is overwritten with the value
    rounded down to a multiple of the global batch size; nets are created
    in the global workspace; a checkpoint is saved after every epoch and
    the checkpoint from three epochs earlier is removed if present.

    Args:
        args: Parsed command-line namespace with the attributes above.

    Raises:
        AssertionError: If the batch size is not divisible by the number
            of GPUs.
        NotImplementedError: If ``args.num_shards > 1``; the rendezvous
            setup for distributed training is disabled in this function.
    """
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"
    batch_per_device = total_batch_size // num_gpus

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        # Spelled the same way as in the test arg scope below; the previous
        # key 'cudnn_exhaustice_search' was a typo.
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="mobilenet", arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # The rendezvous setup below was disabled (it lived inside a string
        # literal), which left ``rendezvous`` unbound and crashed later
        # with an UnboundLocalError. Fail here with an explicit message.
        #
        # # Create rendezvous for distributed computation
        # store_handler = "store_handler"
        # if args.redis_host is not None:
        #     # Use Redis for rendezvous if Redis host is specified
        #     workspace.RunOperatorOnce(
        #         core.CreateOperator(
        #             "RedisStoreHandlerCreate", [], [store_handler],
        #             host=args.redis_host,
        #             port=args.redis_port,
        #             prefix=args.run_id,
        #         )
        #     )
        # else:
        #     # Use filesystem for rendezvous otherwise
        #     workspace.RunOperatorOnce(
        #         core.CreateOperator(
        #             "FileStoreHandlerCreate", [], [store_handler],
        #             path=args.file_store_path,
        #         )
        #     )
        # rendezvous = dict(
        #     kv_handler=store_handler,
        #     shard_id=shard_id,
        #     num_shards=num_shards,
        #     engine="GLOO",
        #     exit_nets=None)
        raise NotImplementedError(
            "Distributed training (num_shards > 1) is not supported: the "
            "rendezvous setup in Train() is disabled")
    rendezvous = None

    # Model building functions
    def create_mobilenet_model_ops(model, loss_scale):
        """Add the MobileNet forward pass, scaled loss and accuracy op."""
        [softmax, loss] = mobilenet.create_mobilenet(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label")
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        """Add weight decay and Nesterov SGD with a step LR policy."""
        stepsz = int(200 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        optimizer.build_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1)

    # Input. Note that the reader must be shared with all GPUS.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        """Attach the shared training reader as the image input."""
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    def add_post_sync_ops(model):
        """Add HalfToFloat ops for every param that has a blob copy."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_mobilenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )

    # # # save network graph
    # graph = net_drawer.GetPydotGraphMinimal(
    #     train_model.net.Proto().op, "mobilenet", rankdir="LR",
    #     minimal_dependency=True)
    # with open("mobilenet.png", 'wb') as fid:
    #     fid.write(graph.create_png())

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="mobilenet_test", arg_scope=test_arg_scope)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            """Attach the test reader as the image input."""
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_mobilenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and mobilenet epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # Round-trip the iteration counter through the workspace.
        # NOTE(review): presumably this re-feeds the blob without a GPU
        # device option after loading - confirm before removing.
        x = workspace.FetchBlob('optimizer_iteration')
        workspace.FeedBlob('optimizer_iteration', x)

        # mobilenet epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("mobilenet epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")
    # else:
    #     workspace.RunNetOnce(train_model.param_init_net)
    #     workspace.CreateNet(train_model.net)

    # Use the resolved device count: args.num_gpus does not reflect the
    # number of devices when an explicit --gpus list was given.
    expname = "mobilenet_gpu%d_b%d_L%d_lr%.2f_v2" % (
        num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    # with open(os.path.join(root_folder, "train_net.pbtxt"), 'w') as fo:
    #     fo.write(str(train_model.net.Proto()))
    # with open(os.path.join(root_folder, "train_init_net.pbtxt"), 'w') as fo:
    #     fo.write(str(train_model.param_init_net.Proto()))
    # print(workspace.FetchBlob('iteration_mutex'))
    # print(workspace.FetchBlob('gpu_0/conv1_b'))
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args, epoch, train_model, test_model, total_batch_size,
            num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # Remove the checkpoint saved three epochs earlier, if it exists,
        # so that only the most recent checkpoints are kept.
        stale_checkpoint = model_path + str(epoch - 3) + ".mdl"
        if os.path.isfile(stale_checkpoint):
            os.remove(stale_checkpoint)

        #======= Check Flag ==========
        # NOTE(review): the original indentation was lost; this call is
        # assumed to run once per epoch - confirm against the upstream file.
        CheckSave()