def optimize_gradient_memory_resnet(model, loss):
    """Store the result of ``memonger.share_grad_blobs`` on ``model.net._net``.

    The call receives ``model.net``, the given ``loss``, the set of gradient
    blobs taken from ``model.param_to_grad``, an empty namescope, and
    ``share_activations=False``.
    """
    grad_blobs = set(model.param_to_grad.values())
    shared_net = memonger.share_grad_blobs(
        model.net,
        loss,
        grad_blobs,
        namescope="",
        share_activations=False,
    )
    model.net._net = shared_net
def OptimizeGradientMemory(model, input_shapes, excluded_blobs,
                           recycle_activations):
    """
    Optimize memory usage of the backward pass by recycling blobs for
    gradient inputs that have been 'used'.

    input_shapes:  dict of blob name to shape for the inputs of the model.
                   Pass empty dictionary if not known.
    excluded_blobs: list of blobs that cannot be recycled. These are blobs
                   that you will access externally.
    recycle_activations: whether to also recycle forward pass activations
    """
    # Replicate every input shape under each device's "gpu_<id>/" prefix.
    per_device_shapes = {
        "gpu_{}/{}".format(dev, blob): shape
        for blob, shape in input_shapes.items()
        for dev in model._devices
    }

    # Only the inferred shapes are consumed below; the types are discarded.
    shapes, _types = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        per_device_shapes,
    )

    # Rewrite the net once per device, each time restricted to that device's
    # namescope and its own loss blobs.
    for dev in model._devices:
        scope = "gpu_{}/".format(dev)
        protected = {scope + blob for blob in excluded_blobs}
        model.net._net = memonger.share_grad_blobs(
            model.net,
            model._losses_by_gpu[dev],
            set(model.param_to_grad.values()),
            scope,
            dont_share_blobs=protected,
            share_activations=recycle_activations,
            blob_shapes=shapes,
        )
def optimize_gradient_memory(model, loss):
    """Store the result of ``memonger.share_grad_blobs`` on ``model.net._net``.

    Passes ``model.net``, ``loss``, the set of gradient blobs from
    ``model.param_to_grad``, the namescope ``"imonaboat"`` and
    ``share_activations=False``.
    """
    options = dict(namescope="imonaboat", share_activations=False)
    grad_blobs = set(model.param_to_grad.values())
    model.net._net = memonger.share_grad_blobs(
        model.net, loss, grad_blobs, **options)
def OptimizeGradientMemory(model, loss):
    """Store the result of ``memonger.share_grad_blobs`` on ``model.net._net``.

    The gradient blobs come from ``model.param_to_grad``; the namescope is
    fixed to ``"test"`` and activation sharing is switched off.
    """
    param_grads = set(model.param_to_grad.values())
    rewritten = memonger.share_grad_blobs(
        model.net,
        loss,
        param_grads,
        namescope="test",
        share_activations=False,
    )
    model.net._net = rewritten
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
    """Check gradient-blob sharing on a net that has two loss heads.

    Builds a chain of FC layers with one loss after fc5 and another after
    fc6, runs ``memonger.share_grad_blobs`` over both losses, and asserts
    that the rewritten proto uses fewer distinct blobs while producing the
    same losses and the same ``fc1_w`` gradient as the original net.
    """
    m = cnn.CNNModelHelper()
    with core.NameScope("name_x"):
        h1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
        h2 = m.FC(h1, "fc2", dim_in=output_dim, dim_out=output_dim)
        h3 = m.FC(h2, "fc3", dim_in=output_dim, dim_out=output_dim)
        h4 = m.FC(h3, "fc4", dim_in=output_dim, dim_out=output_dim)
        h5 = m.FC(h4, "fc5", dim_in=output_dim, dim_out=output_dim)

        # First loss head, hanging off fc5 (Relu is applied in place).
        relu5 = h5.Relu([], h5)
        pred1 = relu5.Softmax([], "pred1")
        xent1 = pred1.LabelCrossEntropy(["label"], ["xent1"])
        xent1.AveragedLoss([], "loss1")

        # Second loss head, one FC layer further down.
        h6 = m.FC(h5, "fc6", dim_in=output_dim, dim_out=output_dim)
        relu6 = h6.Relu([], h6)
        pred2 = relu6.Softmax([], "pred2")
        xent2 = pred2.LabelCrossEntropy(["label"], ["xent2"])
        xent2.AveragedLoss([], "loss2")

    loss_names = ["name_x/loss1", "name_x/loss2"]
    input_to_grad = m.AddGradientOperators(loss_names)

    def count_blobs(proto):
        """Return the number of distinct blob names used by ``proto``'s ops."""
        seen = set()
        for op in proto.op:
            seen.update(op.input)
            seen.update(op.output)
        return len(seen)

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss1", "name_x/loss2"],
        set(m.param_to_grad.values()),
        # No trailing slash here: with "name_x/" the shared blobs would be
        # named like "name_x//shared_gradinp_0_shared".
        "name_x",
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
    print(str(optim_proto))

    # Run both nets on the same random batch and compare their outputs.
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    grad_name = str(input_to_grad["name_x/fc1_w"])

    workspace.RunNetOnce(m.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")
    grad = workspace.FetchBlob(grad_name)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")
    optimized_grad = workspace.FetchBlob(grad_name)

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
    np.testing.assert_almost_equal(grad, optimized_grad)
def _OptimizeGradientMemory(model, losses_by_gpu, devices):
    """Apply ``memonger.share_grad_blobs`` to ``model.net`` once per device.

    For each entry of ``devices`` the net is rewritten with that device's
    losses (``losses_by_gpu[device]``) and the namescope ``"gpu_<device>/"``;
    the result replaces ``model.net._net`` before the next device is handled.
    """
    for gpu_id in devices:
        scope = "gpu_{}/".format(gpu_id)
        shared_net = memonger.share_grad_blobs(
            model.net,
            losses_by_gpu[gpu_id],
            set(model.param_to_grad.values()),
            scope,
        )
        model.net._net = shared_net
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
    """Check blob sharing (gradients and activations) on a two-loss net.

    The net is a chain of FC layers with a loss after fc5 and another after
    fc6. ``fc5``, ``fc6`` and the ``fc1_w`` gradient are excluded from
    sharing. The test asserts that the blob count drops, that ``name_x/fc6``
    is still present, and that losses and the ``fc1_w`` gradient are the same
    before and after the rewrite.
    """
    m = cnn.CNNModelHelper()
    with core.NameScope("name_x"):
        h1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
        h2 = m.FC(h1, "fc2", dim_in=output_dim, dim_out=output_dim)
        h3 = m.FC(h2, "fc3", dim_in=output_dim, dim_out=output_dim)
        h4 = m.FC(h3, "fc4", dim_in=output_dim, dim_out=output_dim)
        h5 = m.FC(h4, "fc5", dim_in=output_dim, dim_out=output_dim)
        (
            h5.Relu([], h5)
            .Softmax([], "pred1")
            .LabelCrossEntropy(["label"], ["xent1"])
            .AveragedLoss([], "loss1")
        )
        h6 = m.FC(h5, "fc6", dim_in=output_dim, dim_out=output_dim)
        (
            h6.Relu([], h6)
            .Softmax([], "pred2")
            .LabelCrossEntropy(["label"], ["xent2"])
            .AveragedLoss([], "loss2")
        )
    input_to_grad = m.AddGradientOperators(
        ["name_x/loss1", "name_x/loss2"])
    grad_name = str(input_to_grad["name_x/fc1_w"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss1", "name_x/loss2"],
        set(m.param_to_grad.values()),
        # No trailing slash here: with "name_x/" the shared blobs would be
        # named like "name_x//shared_gradinp_0_shared".
        "name_x",
        share_activations=True,
        dont_share_blobs={'name_x/fc6', 'name_x/fc5', grad_name},
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
    self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

    # Run both nets on the same random batch and compare their outputs.
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    workspace.RunNetOnce(m.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")
    grad = workspace.FetchBlob(grad_name)

    # Overwrite the stored gradient before the second run; presumably so the
    # final comparison cannot pass on a stale value from the first run.
    workspace.FeedBlob(grad_name, np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")
    optimized_grad = workspace.FetchBlob(grad_name)

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
    np.testing.assert_almost_equal(grad, optimized_grad)
def _OptimizeGradientMemory(model, losses_by_gpu, devices):
    """Rewrite ``model.net`` with ``memonger.share_grad_blobs``, per device.

    Each device in ``devices`` gets its own pass using the losses stored
    under ``losses_by_gpu[device]`` and the namescope ``"gpu_<device>/"``.
    Every pass overwrites ``model.net._net`` with the returned value.
    """
    for dev in devices:
        dev_scope = "gpu_{}/".format(dev)
        dev_losses = losses_by_gpu[dev]
        grad_blobs = set(model.param_to_grad.values())
        model.net._net = memonger.share_grad_blobs(
            model.net, dev_losses, grad_blobs, dev_scope)
def test_rnn(self):
    """Check that blob sharing reduces the blob count of a 2-layer LSTM net.

    Builds an LSTM with a SoftmaxWithLoss head, adds gradient operators,
    runs ``memonger.share_grad_blobs`` with activation sharing enabled, and
    asserts that the blob count went down. The original ``model.net`` is
    then executed once with zero-filled initial states.
    """
    from caffe2.python import rnn_cell

    T = 5
    model = model_helper.ModelHelper()
    seq_lengths, labels = model.net.AddExternalInputs(
        'seq_lengths', 'labels')

    # One (hidden, cell) pair of initial-state inputs per LSTM layer.
    init_blobs = []
    for layer in range(2):
        hidden_init, cell_init = model.net.AddExternalInputs(
            "hidden_init_{}".format(layer),
            "cell_init_{}".format(layer),
        )
        init_blobs += [hidden_init, cell_init]

    model.param_init_net.ConstantFill([], ["input"], shape=[T, 4, 10])
    output, last_hidden, _, last_state = rnn_cell.LSTM(
        model=model,
        input_blob="input",
        seq_lengths=seq_lengths,
        initial_states=init_blobs,
        dim_in=10,
        dim_out=[10, 10],
        scope="lstm1",
        forward_only=False,
        drop_states=True,
        return_last_layer_only=True,
    )
    flat_output = model.Flatten(output)
    softmax, loss = model.net.SoftmaxWithLoss(
        [flat_output, "labels"], ['softmax', 'loss'])

    model.AddGradientOperators([loss])
    blobs_before = count_blobs(model.net.Proto())

    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["loss"],
        set(viewvalues(model.param_to_grad)),
        "",
        share_activations=True,
        dont_share_blobs=set(),
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Run once to see all blobs are set up correctly
    zero_state = np.zeros([1, 4, 10], dtype=np.float32)
    for init_blob in init_blobs:
        workspace.FeedBlob(init_blob, zero_state)
    workspace.FeedBlob("seq_lengths", np.array([T] * 4, dtype=np.int32))
    workspace.FeedBlob("labels", np.random.rand(T).astype(np.int32))

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)
def test_rnn(self):
    """Verify that ``share_grad_blobs`` shrinks the blob set of an LSTM net.

    A two-layer LSTM followed by SoftmaxWithLoss is built and differentiated.
    The net is rewritten with activation sharing on and nothing excluded,
    and the number of distinct blobs must decrease. Finally the original
    net is run once on zero initial states.
    """
    from caffe2.python import rnn_cell

    T = 5
    model = model_helper.ModelHelper()
    seq_lengths, labels = model.net.AddExternalInputs(
        'seq_lengths',
        'labels',
    )

    init_blobs = []
    for idx in range(2):
        hidden_name = "hidden_init_{}".format(idx)
        cell_name = "cell_init_{}".format(idx)
        hidden_init, cell_init = model.net.AddExternalInputs(
            hidden_name, cell_name)
        init_blobs.append(hidden_init)
        init_blobs.append(cell_init)

    model.param_init_net.ConstantFill([], ["input"], shape=[T, 4, 10])
    lstm_outputs = rnn_cell.LSTM(
        model=model,
        input_blob="input",
        seq_lengths=seq_lengths,
        initial_states=init_blobs,
        dim_in=10,
        dim_out=[10, 10],
        scope="lstm1",
        forward_only=False,
        drop_states=True,
        return_last_layer_only=True,
    )
    # Four values are returned; only the first (the output) is used below.
    output, last_hidden, _, last_state = lstm_outputs
    softmax, loss = model.net.SoftmaxWithLoss(
        [model.Flatten(output), "labels"],
        ['softmax', 'loss'],
    )

    model.AddGradientOperators([loss])
    blobs_before = count_blobs(model.net.Proto())
    grad_blobs = set(viewvalues(model.param_to_grad))
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["loss"],
        grad_blobs,
        "",
        share_activations=True,
        dont_share_blobs=set(),
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Run once to see all blobs are set up correctly
    for init_blob in init_blobs:
        workspace.FeedBlob(
            init_blob, np.zeros([1, 4, 10], dtype=np.float32))
    workspace.FeedBlob("seq_lengths", np.array([T] * 4, dtype=np.int32))
    workspace.FeedBlob("labels", np.random.rand(T).astype(np.int32))

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)
def optimize_memory(model):
    """Save GPU memory through blob sharing.

    For every device index below ``cfg.NUM_GPUS`` the net is rewritten by
    ``memonger.share_grad_blobs`` using that device's ``gpu_<id>/``-prefixed
    losses. Activation sharing follows ``cfg.MEMONGER_SHARE_ACTIVATIONS``.
    """
    for gpu_id in range(cfg.NUM_GPUS):
        scope = 'gpu_{}/'.format(gpu_id)
        scoped_losses = [scope + name for name in model.losses]
        grad_blobs = set(model.param_to_grad.values())
        model.net._net = memonger.share_grad_blobs(
            model.net,
            scoped_losses,
            grad_blobs,
            scope,
            share_activations=cfg.MEMONGER_SHARE_ACTIVATIONS,
        )
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
    """Check blob sharing on a two-loss net built with ``brew.fc``.

    ``fc5``, ``fc6`` and the ``fc1_w`` gradient are excluded from sharing.
    The test asserts a reduced blob count, that ``name_x/fc6`` survives the
    rewrite, and that both losses and the ``fc1_w`` gradient are unchanged.
    """
    m = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        h1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        h2 = brew.fc(m, h1, "fc2", dim_in=output_dim, dim_out=output_dim)
        h3 = brew.fc(m, h2, "fc3", dim_in=output_dim, dim_out=output_dim)
        h4 = brew.fc(m, h3, "fc4", dim_in=output_dim, dim_out=output_dim)
        h5 = brew.fc(m, h4, "fc5", dim_in=output_dim, dim_out=output_dim)

        # First loss head on fc5 (Relu is applied in place).
        relu5 = h5.Relu([], h5)
        pred1 = relu5.Softmax([], "pred1")
        xent1 = pred1.LabelCrossEntropy(["label"], ["xent1"])
        xent1.AveragedLoss([], "loss1")

        # Second loss head on fc6.
        h6 = brew.fc(m, h5, "fc6", dim_in=output_dim, dim_out=output_dim)
        relu6 = h6.Relu([], h6)
        pred2 = relu6.Softmax([], "pred2")
        xent2 = pred2.LabelCrossEntropy(["label"], ["xent2"])
        xent2.AveragedLoss([], "loss2")

    input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"])
    grad_name = str(input_to_grad["name_x/fc1_w"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss1", "name_x/loss2"],
        set(viewvalues(m.param_to_grad)),
        # No trailing slash here: with "name_x/" the shared blobs would be
        # named like "name_x//shared_gradinp_0_shared".
        "name_x",
        share_activations=True,
        dont_share_blobs={'name_x/fc6', 'name_x/fc5', grad_name},
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
    self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

    # Run both nets on the same random batch and compare their outputs.
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    workspace.RunNetOnce(m.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")
    grad = workspace.FetchBlob(grad_name)

    # Overwrite the stored gradient before the second run; presumably so the
    # final comparison cannot pass on a stale value from the first run.
    workspace.FeedBlob(grad_name, np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")
    optimized_grad = workspace.FetchBlob(grad_name)

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
    np.testing.assert_almost_equal(grad, optimized_grad)
def _OptimizeGradientMemorySimple(model, losses_by_gpu, devices):
    """Deprecated: per-device gradient blob sharing without activations.

    Logs a deprecation warning pointing at
    ``data_parallel_model.OptimizeGradientMemory()``, then rewrites
    ``model.net`` once per device via ``memonger.share_grad_blobs`` with
    ``share_activations=False``.
    """
    log.warning("------- DEPRECATED API, please use " +
                "data_parallel_model.OptimizeGradientMemory() ----- ")
    for gpu_id in devices:
        scope = "gpu_{}/".format(gpu_id)
        shared_net = memonger.share_grad_blobs(
            model.net,
            losses_by_gpu[gpu_id],
            set(model.param_to_grad.values()),
            scope,
            share_activations=False,
        )
        model.net._net = shared_net
def optimize_memory(model):
    """Save GPU memory through blob sharing.

    Runs ``memonger.share_grad_blobs`` once for each of the ``cfg.NUM_GPUS``
    devices, passing the model's losses prefixed with ``gpu_<id>/`` and that
    same prefix as the namescope. ``model.net._net`` is replaced after every
    pass.
    """
    share_acts = cfg.MEMONGER_SHARE_ACTIVATIONS
    for dev in range(cfg.NUM_GPUS):
        prefix = 'gpu_{}/'.format(dev)
        dev_losses = [prefix + loss_name for loss_name in model.losses]
        shared_net = memonger.share_grad_blobs(
            model.net,
            dev_losses,
            set(model.param_to_grad.values()),
            prefix,
            share_activations=share_acts,
        )
        model.net._net = shared_net
def test_memonger_mix_cpu_gpu(self):
    '''
    Check that memonger does not make blobs cross CPU/GPU boundary.

    Builds four FC layers on the GPU device (HIP when
    ``workspace.has_hip_support`` is truthy, CUDA otherwise), copies the
    result to the CPU and adds three more FC layers plus a loss there.
    After ``memonger.share_grad_blobs`` the blob count must have dropped
    and no blob may be touched by both a CPU op and a GPU op.
    '''
    # Pick the GPU device type once instead of re-deciding it at every use.
    if workspace.has_hip_support:
        gpu_type = caffe2_pb2.HIP
    else:
        gpu_type = caffe2_pb2.CUDA

    m = model_helper.ModelHelper()
    with core.DeviceScope(core.DeviceOption(gpu_type, 0)):
        fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=2, dim_out=2)
        fc4_cpu = m.net.CopyGPUToCPU(fc4, "fc4_cpu")
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
        fc5_cpu = brew.fc(m, fc4_cpu, "fc5_cpu", dim_in=2, dim_out=2)
        fc6_cpu = brew.fc(m, fc5_cpu, "fc6_cpu", dim_in=2, dim_out=2)
        fc7_cpu = brew.fc(m, fc6_cpu, "fc7_cpu", dim_in=2, dim_out=2)
        fc7_cpu.Relu([], fc7_cpu) \
            .Softmax([], "pred") \
            .LabelCrossEntropy(["label"], ["xent"]) \
            .AveragedLoss([], "loss")
    m.AddGradientOperators(["loss"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["loss"],
        set(viewvalues(m.param_to_grad)),
        "",
        share_activations=True,
        dont_share_blobs=set(),
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Create set of blobs on CPU side and GPU side and check they don't
    # overlap. The copy ops legitimately touch both sides, so skip them.
    device_blobs = {caffe2_pb2.CPU: set(), gpu_type: set()}
    for op in optim_proto.op:
        if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
            dev = op.device_option.device_type
            for b in list(op.input) + list(op.output):
                device_blobs[dev].add(b)

    device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
        device_blobs[gpu_type])
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(device_crossers, set())
def optimize_gradient_memory(model, loss):
    """A naive implementation of memory optimization.

    Replaces ``model.net._net`` with the result of
    ``memonger.share_grad_blobs``, called with an empty namescope and with
    activation sharing disabled.

    :param model_helper.ModelHelper model: Model to add update parameters
        operators for.
    :param list loss: A list of losses.
    """
    grad_blobs = set(model.param_to_grad.values())
    shared_net = memonger.share_grad_blobs(
        model.net,
        loss,
        grad_blobs,
        namescope='',
        share_activations=False,
    )
    model.net._net = shared_net
def test_memonger_mix_cpu_gpu(self):
    '''
    Check that memonger does not make blobs cross CPU/GPU boundary.

    Builds four FC layers under a CUDA device scope, copies the result to
    the CPU and adds three more FC layers plus a loss there. After
    ``memonger.share_grad_blobs`` the blob count must have dropped and no
    blob may be touched by both a CPU op and a CUDA op.
    '''
    m = model_helper.ModelHelper()
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=2, dim_out=2)
        fc4_cpu = m.net.CopyGPUToCPU(fc4, "fc4_cpu")
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
        fc5_cpu = brew.fc(m, fc4_cpu, "fc5_cpu", dim_in=2, dim_out=2)
        fc6_cpu = brew.fc(m, fc5_cpu, "fc6_cpu", dim_in=2, dim_out=2)
        fc7_cpu = brew.fc(m, fc6_cpu, "fc7_cpu", dim_in=2, dim_out=2)
        fc7_cpu.Relu([], fc7_cpu) \
            .Softmax([], "pred") \
            .LabelCrossEntropy(["label"], ["xent"]) \
            .AveragedLoss([], "loss")
    m.AddGradientOperators(["loss"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["loss"],
        set(viewvalues(m.param_to_grad)),
        "",
        share_activations=True,
        dont_share_blobs=set(),
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Create set of blobs on CPU side and GPU side and check they don't
    # overlap. The copy ops legitimately touch both sides, so skip them.
    device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()}
    for op in optim_proto.op:
        if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
            dev = op.device_option.device_type
            for b in list(op.input) + list(op.output):
                device_blobs[dev].add(b)

    device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
        device_blobs[caffe2_pb2.CUDA]
    )
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(device_crossers, set())
def optimize_memory(model):
    """Save GPU memory through blob sharing.

    For each device index below ``cfg.NUM_GPUS``, rewrites ``model.net`` with
    ``memonger.share_grad_blobs`` using that device's ``gpu_<id>/``-prefixed
    losses. When ``cfg.WSL.DEEP_MEM`` is set, the net is additionally passed
    through ``share_freeze_blobs`` with the same namescope.
    """
    for device in range(cfg.NUM_GPUS):
        namescope = 'gpu_{}/'.format(device)
        # Loss blob names as they appear on this device.
        losses = [namescope + l for l in model.losses]
        model.net._net = memonger.share_grad_blobs(
            model.net,
            losses,
            set(model.param_to_grad.values()),
            namescope,
            share_activations=cfg.MEMONGER_SHARE_ACTIVATIONS)

        # NOTE(review): assumed to run once per device, since it consumes the
        # per-device ``namescope`` -- confirm the intended nesting.
        if cfg.WSL.DEEP_MEM:
            # Imported lazily, only when the option is enabled.
            from detectron.utils.wsl_memonger import share_freeze_blobs
            model.net._net = share_freeze_blobs(
                model.net,
                namescope,
            )
def optimize_memory_cpg(model, device):
    """Rewrite ``model.net`` so blobs are released once used, for one device.

    Calls ``cpg_memonger.deep_release_blobs_when_used`` on ``model.net._net``
    and stores the result back on ``model.net._net``. The gradients of
    ``cfg.WSL.CPG_PRE_BLOB`` and ``cfg.WSL.CPG_DATA_BLOB`` under the
    ``gpu_<device>/`` namescope are passed as blobs that must not be freed.

    Earlier revisions kept several alternative strategies after the first
    ``return`` (``deep_share_blobs``, ``optimize_memory_cpg_new``,
    ``memonger.share_grad_blobs``). That code could never execute and has
    been removed; see version control if it needs to be revived.

    :param model: model whose ``net`` is rewritten in place.
    :param device: GPU index used to build the ``gpu_<device>/`` namescope.
    :return: None
    """
    namescope = 'gpu_{}/'.format(device)
    dont_free_blobs = set([
        namescope + cfg.WSL.CPG_PRE_BLOB + '_grad',
        namescope + cfg.WSL.CPG_DATA_BLOB + '_grad',
    ])

    # Imported lazily so the module is only required when this path runs.
    import detectron.utils.cpg_memonger as cpg_memonger
    model.net._net = cpg_memonger.deep_release_blobs_when_used(
        model.net._net, dont_free_blobs)
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    """Check gradient-blob sharing on a single-loss chain of FC layers.

    Asserts that ``memonger.share_grad_blobs`` reduces the number of blobs
    and that the rewritten net yields the same loss and ``fc1_w`` gradient
    as the original one.
    """
    m = cnn.CNNModelHelper()
    with core.NameScope("name_x"):
        h1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
        h2 = m.FC(h1, "fc2", dim_in=output_dim, dim_out=output_dim)
        h3 = m.FC(h2, "fc3", dim_in=output_dim, dim_out=output_dim)
        h4 = m.FC(h3, "fc4", dim_in=output_dim, dim_out=output_dim)
        h5 = m.FC(h4, "fc5", dim_in=output_dim, dim_out=output_dim)
        # Loss head on fc5 (Relu is applied in place).
        relu5 = h5.Relu([], h5)
        pred = relu5.Softmax([], "pred")
        xent = pred.LabelCrossEntropy(["label"], ["xent"])
        xent.AveragedLoss([], "loss")
    input_to_grad = m.AddGradientOperators(["name_x/loss"])
    grad_name = str(input_to_grad["name_x/fc1_w"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(m.param_to_grad.values()),
        "name_x/",
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Run both nets on the same random batch and compare their outputs.
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    workspace.RunNetOnce(m.net)
    loss = workspace.FetchBlob("name_x/loss")
    grad = workspace.FetchBlob(grad_name)

    workspace.RunNetOnce(optim_proto)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_name)

    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
def test_shared_grads(
    with_shapes,
    create_model,
    conv_blob,
    last_out_blob,
    data_blob='gpu_0/data',
    label_blob='gpu_0/label',
    num_labels=1000,
):
    """Build a model, share its blobs, and collect before/after results.

    ``create_model`` is called to construct the network under the ``gpu_0``
    namescope. The net is rewritten by ``memonger.share_grad_blobs`` with
    activation sharing on and the gradient of ``conv_blob`` excluded;
    inferred blob shapes are passed only when ``with_shapes`` is truthy.
    Both the original and the rewritten net are run on the same random
    input.

    Returns a list of three pairs for the caller to compare:
    ``(count_after, count_before)``, ``(loss1, optimized_loss1)`` and
    ``(conv1_w_grad, optim_conv1_w_grad)``, where the losses are fetched
    from ``last_out_blob`` and the gradients from ``conv_blob``'s gradient.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data_in = model.net.AddExternalInput(data_blob)
        label_in = model.net.AddExternalInput(label_blob)
        _softmax, loss = create_model(
            model,
            data_in,
            num_input_channels=3,
            num_labels=num_labels,
            label=label_in,
            is_test=False,
        )

    param_to_grad = model.AddGradientOperators([loss])
    conv_grad = param_to_grad[conv_blob]

    known_shapes = {data_blob: [4, 3, 227, 227], label_blob: [4]}
    shapes, _types = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        known_shapes,
    )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["gpu_0/loss"],
        set(model.param_to_grad.values()),
        "gpu_0/",
        share_activations=True,
        dont_share_blobs={str(conv_grad)},
        blob_shapes=shapes if with_shapes else None,
    )
    count_after = count_blobs(optim_proto)

    # Run model and compare results. We check that the loss is same
    # and also that the final gradient (conv1_w_grad is same)
    workspace.RunNetOnce(model.param_init_net)
    data_arr = np.random.rand(4, 3, 227, 227).astype(np.float32)
    label_arr = (np.random.rand(4) * num_labels).astype(np.int32)
    workspace.FeedBlob(data_blob, data_arr)
    workspace.FeedBlob(label_blob, label_arr)

    workspace.RunNetOnce(model.net)
    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob(last_out_blob)
    conv1_w_grad = workspace.FetchBlob(conv_grad)

    # Overwrite the stored gradient before the second run; presumably so the
    # caller's comparison cannot pass on a stale value from the first run.
    workspace.FeedBlob(conv_grad, np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)
    optim_conv1_w_grad = workspace.FetchBlob(conv_grad)

    return [
        (count_after, count_before),
        (loss1, optimized_loss1),
        (conv1_w_grad, optim_conv1_w_grad),
    ]
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    """Check blob sharing with and without activation sharing.

    Two rewritten protos are produced from the same net: one sharing only
    gradient blobs and one that also shares activations (with the ``fc1_w``
    gradient excluded). The test asserts that blob counts shrink, that
    ``name_x/fc5`` is kept in both, and that each rewritten net reproduces
    the original loss and ``fc1_w`` gradient.
    """
    m = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        h1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        h2 = brew.fc(m, h1, "fc2", dim_in=output_dim, dim_out=output_dim)
        h3 = brew.fc(m, h2, "fc3", dim_in=output_dim, dim_out=output_dim)
        h4 = brew.fc(m, h3, "fc4", dim_in=output_dim, dim_out=output_dim)
        h5 = brew.fc(m, h4, "fc5", dim_in=output_dim, dim_out=output_dim)
        # Loss head on fc5 (Relu is applied in place).
        relu5 = h5.Relu([], h5)
        pred = relu5.Softmax([], "pred")
        xent = pred.LabelCrossEntropy(["label"], ["xent"])
        xent.AveragedLoss([], "loss")
    input_to_grad = m.AddGradientOperators(["name_x/loss"])
    grad_name = str(input_to_grad["name_x/fc1_w"])

    blobs_before = count_blobs(m.net.Proto())

    # Variant 1: share gradient blobs only.
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(viewvalues(m.param_to_grad)),
        "name_x/",
        share_activations=False,
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Variant 2: share activations too, keeping the fc1_w gradient intact.
    optim_proto_wacts = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(viewvalues(m.param_to_grad)),
        "name_x/",
        share_activations=True,
        dont_share_blobs={grad_name},
    )
    blobs_wact_optim = count_blobs(optim_proto_wacts)
    self.assertLessEqual(blobs_wact_optim, blobs_after)

    # Check that the last activations are not shared
    self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
    self.assertTrue(
        has_blob(optim_proto_wacts, "name_x/fc5"),
        "Dont remap final activation",
    )

    # Run the original net on a random batch to obtain reference values.
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss = workspace.FetchBlob("name_x/loss")
    grad = workspace.FetchBlob(grad_name)

    # Gradient-only variant must match the reference.
    workspace.RunNetOnce(optim_proto)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_name)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)

    # Overwrite the stored gradient before the next run; presumably so the
    # comparison cannot pass on a stale value.
    workspace.FeedBlob(grad_name, np.array([0.0]))

    # Run with the forward optimization
    workspace.RunNetOnce(optim_proto_wacts)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_name)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
def test_resnet_shared_grads(self, with_shapes, gc, dc):
    """Check blob sharing on a ResNet-50 built by ``resnet.create_resnet50``.

    The net is rewritten with activation sharing on and the ``conv1_w``
    gradient excluded; inferred shapes are supplied only when
    ``with_shapes`` is truthy. The blob count must shrink, and the value of
    ``gpu_0/last_out_L1000`` and the ``conv1_w`` gradient must match between
    the original and the rewritten net. ``gc`` and ``dc`` are not used in
    the body.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data_in = model.net.AddExternalInput("gpu_0/data")
        label_in = model.net.AddExternalInput("gpu_0/label")
        _softmax, loss = resnet.create_resnet50(
            model,
            data_in,
            num_input_channels=3,
            num_labels=1000,
            label=label_in,
            is_test=False,
        )

    param_to_grad = model.AddGradientOperators([loss])
    conv1_grad = param_to_grad["gpu_0/conv1_w"]

    shapes, _types = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        {'gpu_0/data': [4, 3, 227, 227], 'gpu_0/label': [4]},
    )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["gpu_0/loss"],
        set(model.param_to_grad.values()),
        "gpu_0/",
        share_activations=True,
        dont_share_blobs={str(conv1_grad)},
        blob_shapes=shapes if with_shapes else None,
    )
    count_after = count_blobs(optim_proto)
    self.assertTrue(count_after < count_before)

    # Run model and compare results. We check that the loss is same
    # and also that the final gradient (conv1_w_grad is same)
    workspace.RunNetOnce(model.param_init_net)
    data_arr = np.random.rand(4, 3, 227, 227).astype(np.float32)
    label_arr = (np.random.rand(4) * 1000).astype(np.int32)
    workspace.FeedBlob("gpu_0/data", data_arr)
    workspace.FeedBlob("gpu_0/label", label_arr)

    workspace.RunNetOnce(model.net)
    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
    conv1_w_grad = workspace.FetchBlob(conv1_grad)

    # Overwrite the stored gradient before the second run; presumably so the
    # final comparison cannot pass on a stale value from the first run.
    workspace.FeedBlob(conv1_grad, np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
    optim_conv1_w_grad = workspace.FetchBlob(conv1_grad)

    print("before: {} after: {}".format(count_before, count_after))
    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    """Check blob sharing with and without activations, plus graph equality.

    Two rewritten protos are produced from the same net: one sharing only
    gradient blobs and one that also shares activations (with the ``fc1_w``
    gradient excluded). Each must pass ``memonger.verify_graph_equality``
    against the original proto, shrink the blob count, keep ``name_x/fc5``,
    and reproduce the original loss and ``fc1_w`` gradient.
    """
    m = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        h1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        h2 = brew.fc(m, h1, "fc2", dim_in=output_dim, dim_out=output_dim)
        h3 = brew.fc(m, h2, "fc3", dim_in=output_dim, dim_out=output_dim)
        h4 = brew.fc(m, h3, "fc4", dim_in=output_dim, dim_out=output_dim)
        h5 = brew.fc(m, h4, "fc5", dim_in=output_dim, dim_out=output_dim)
        (
            h5.Relu([], h5)
            .Softmax([], "pred")
            .LabelCrossEntropy(["label"], ["xent"])
            .AveragedLoss([], "loss")
        )
    input_to_grad = m.AddGradientOperators(["name_x/loss"])
    grad_name = str(input_to_grad["name_x/fc1_w"])

    blobs_before = count_blobs(m.net.Proto())

    # Variant 1: share gradient blobs only.
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(m.param_to_grad.values()),
        "name_x/",
        share_activations=False,
    )
    same_graph = memonger.verify_graph_equality(m.Proto(), optim_proto)
    self.assertTrue(same_graph)
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Variant 2: share activations too, keeping the fc1_w gradient intact.
    optim_proto_wacts = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(m.param_to_grad.values()),
        "name_x/",
        share_activations=True,
        dont_share_blobs={grad_name},
    )
    same_graph_wacts = memonger.verify_graph_equality(
        m.Proto(), optim_proto_wacts)
    self.assertTrue(same_graph_wacts)
    blobs_wact_optim = count_blobs(optim_proto_wacts)
    self.assertLessEqual(blobs_wact_optim, blobs_after)

    # Check that the last activations are not shared
    self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
    self.assertTrue(
        has_blob(optim_proto_wacts, "name_x/fc5"),
        "Dont remap final activation",
    )

    # Run the original net on a random batch to obtain reference values.
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss = workspace.FetchBlob("name_x/loss")
    grad = workspace.FetchBlob(grad_name)

    # Gradient-only variant must match the reference.
    workspace.RunNetOnce(optim_proto)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_name)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)

    # Overwrite the stored gradient before the next run; presumably so the
    # comparison cannot pass on a stale value.
    workspace.FeedBlob(grad_name, np.array([0.0]))

    # Run with the forward optimization
    workspace.RunNetOnce(optim_proto_wacts)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_name)
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)