def test_equiv(self):
    '''
    Check that, for a fixed total batch size, the model output is the same
    no matter how many devices the batch is split across.

    The comparison is done once with gpu=True (skipped unless GPU support
    is present and at least two CUDA devices are reported) and once with
    gpu=False.
    '''
    for use_gpu in (True, False):
        # GPU pass needs GPU support and a minimum of two devices.
        if use_gpu and not (
            workspace.has_gpu_support and workspace.NumCudaDevices() >= 2
        ):
            continue

        two_devices = self.run_model([0, 1], gpu=use_gpu)
        one_device = self.run_model([0], gpu=use_gpu)
        self.assertTrue(np.allclose(one_device, two_devices))

        # Larger splits always run with gpu=False; with gpu=True they run
        # only when enough CUDA devices are reported.
        for device_count in (4, 8, 16):
            if use_gpu and workspace.NumCudaDevices() < device_count:
                continue
            many_devices = self.run_model(
                list(range(device_count)), gpu=use_gpu)
            self.assertTrue(np.allclose(one_device, many_devices))
def test_sparse_shared_indices_gpu(self):
    '''
    Run the sparse model on 2 and 1 devices, and on 4 and 8 devices when
    that many CUDA devices are reported.

    Intended to check that the number of indices and gradient rows is
    independent of the device count for a fixed total batch size; the
    checks themselves are not in this method.
    '''
    vocab_size = 10000
    self.run_model(vocab_size, [0, 1])
    self.run_model(vocab_size, [0])
    for device_count in (4, 8):
        if workspace.NumCudaDevices() >= device_count:
            self.run_model(vocab_size, list(range(device_count)))
def get_device_option(gpu=None):
    """Build the `core.DeviceOption` for the CPU or for one CUDA device.

    :param int gpu: Index of the GPU to use, or None to select the CPU.
    :return: Instance of `core.DeviceOption`.
    :raises AssertionError: If a GPU is requested but the workspace has no
        GPU support, or the index is outside the available device range.
    """
    if gpu is None:
        return core.DeviceOption(caffe2_pb2.CPU)

    assert workspace.has_gpu_support, "Workspace does not support GPUs"
    assert 0 <= gpu < workspace.NumCudaDevices(), (
        "Workspace does not provide this gpu (%d). "
        "Number of GPUs is %d" % (gpu, workspace.NumCudaDevices())
    )
    return core.DeviceOption(caffe2_pb2.CUDA, gpu)
def _test_reshape(old_shape, new_shape, expected_shape=None, arg_shape=True,
                  in_place=False):
    """Run the Reshape operator on CPU (and CUDA device 0 if present).

    :param old_shape: Shape of the random float32 input that is generated.
    :param new_shape: Target shape handed to the operator.
    :param expected_shape: Shape the output is compared against; defaults
        to ``new_shape``.
    :param arg_shape: When true the shape is passed as the ``shape``
        operator argument, otherwise as a second input blob ``new_shape``.
    :param in_place: When true the output blob is the input blob.
    """
    if expected_shape is None:
        expected_shape = new_shape

    device_options = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumCudaDevices() > 0:
        device_options.append(core.DeviceOption(caffe2_pb2.CUDA, 0))

    input_name = 'X'
    output_name = input_name if in_place else input_name + '_out'

    for option in device_options:
        with core.DeviceScope(option):
            data = np.random.rand(*old_shape).astype(np.float32)

            if arg_shape:
                reshape_op = core.CreateOperator(
                    'Reshape',
                    [input_name],
                    [output_name, 'old_shape'],
                    shape=new_shape)
            else:
                reshape_op = core.CreateOperator(
                    'Reshape',
                    [input_name, 'new_shape'],
                    [output_name, 'old_shape'])
                # Shape comes from a blob in this mode, so feed it first.
                workspace.FeedBlob('new_shape', np.asarray(new_shape))

            workspace.FeedBlob(input_name, data)
            workspace.RunOperatorOnce(reshape_op)

            reshaped = workspace.FetchBlob(output_name)
            np.testing.assert_allclose(
                reshaped, data.reshape(expected_shape))
def test_shift_gpu(self):
    """Shift the fc4/fc5 activations to other devices and check placement.

    After the shift, init ops producing fc4/fc5 parameters must sit on
    device 4 (for the gpu_0 / gpu_1 scopes) or device 5 (any other scope),
    while fc1-fc3 parameters must stay on the device named by their scope.
    """
    model = self.create_model()
    data_parallel_model_utils.ShiftActivationDevices(
        model,
        activations=["fc4", "fc5"],
        shifts={0: 4, 1: 4, 2: 5, 3: 5},
    )

    shifted_params = set(['fc4_w', 'fc5_w', 'fc4_b', 'fc5_b'])
    pinned_params = set(['fc1_w', 'fc2_w', 'fc3_b', 'fc3_w'])

    for init_op in model.param_init_net.Proto().op:
        for blob_name in init_op.output:
            name_parts = blob_name.split("/")
            scope = name_parts[0]
            leaf = name_parts[-1]

            if leaf in shifted_params:
                if scope in ('gpu_0', 'gpu_1'):
                    self.assertEqual(init_op.device_option.cuda_gpu_id, 4)
                else:
                    self.assertEqual(init_op.device_option.cuda_gpu_id, 5)

            if leaf in pinned_params:
                scope_gpu = int(scope.split("_")[-1])
                self.assertEqual(
                    scope_gpu, init_op.device_option.cuda_gpu_id)

    # Test that we can run the net
    if workspace.NumCudaDevices() >= 6:
        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)
        workspace.RunNet(model.net.Proto().name)
def test_prepend_dim(self):
    """Run the forward/backward check on CPU and, if present, CUDA device 0."""
    device_options = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumCudaDevices() > 0:
        device_options.append(core.DeviceOption(caffe2_pb2.CUDA, 0))

    for option in device_options:
        with core.DeviceScope(option):
            self._test_fwd_bwd()
def test_equiv(self):
    '''
    Test that the model produces exactly same results given
    total batchsize, independent of number of GPUs.

    The single-GPU result is the reference; the 2-GPU run is always
    compared against it, and the 4- and 8-GPU runs are compared when that
    many CUDA devices are reported.
    '''
    result_2gpus = self.run_model([0, 1])
    result_1gpus = self.run_model([0])
    self.assertTrue(np.allclose(result_1gpus, result_2gpus))

    for num_gpus in (4, 8):
        if workspace.NumCudaDevices() >= num_gpus:
            # Pass a real list: under Python 3 a bare range() is not a
            # list, unlike the [0, 1] / [0] device lists used above.
            result = self.run_model(list(range(num_gpus)))
            self.assertTrue(np.allclose(result_1gpus, result))
class CudaProfileOpsTest(unittest.TestCase):
    """Smoke test for the CudaProfile* operators."""

    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU")
    def test_run(self):
        """Build a net that profiles one ConstantFill on CUDA device 0 and run it."""
        profiled_net = core.Net("net")
        profiled_net.CudaProfileInitialize(
            [], [], output="/tmp/cuda_profile_test")
        profiled_net.CudaProfileStart([], [])

        cuda_device = core.DeviceOption(caffe2_pb2.CUDA, 0)
        with core.DeviceScope(cuda_device):
            profiled_net.ConstantFill([], ["out"], shape=[1, 3, 244, 244])

        profiled_net.CudaProfileStop([], [])

        workspace.CreateNet(profiled_net)
        workspace.RunNet(profiled_net)
def _test_equiv_sparse(self, cpu_indices):
    '''
    Test that the model produces exactly same results given
    total batchsize, independent of number of GPUs.

    cpu_indices is forwarded unchanged to run_model. Elements 0 and 1 of
    each run's result are compared against the single-GPU run; the 4- and
    8-GPU comparisons run only when that many CUDA devices are reported.
    '''
    V = 10000
    result_2gpus = self.run_model(V, [0, 1], cpu_indices)
    result_1gpus = self.run_model(V, [0], cpu_indices)

    self.assertTrue(np.allclose(result_1gpus[0], result_2gpus[0]))
    self.assertTrue(np.allclose(result_1gpus[1], result_2gpus[1]))

    for num_gpus in (4, 8):
        if workspace.NumCudaDevices() >= num_gpus:
            # Pass a real list: under Python 3 a bare range() is not a
            # list, unlike the [0, 1] / [0] device lists used above.
            result = self.run_model(V, list(range(num_gpus)), cpu_indices)
            self.assertTrue(np.allclose(result_1gpus[0], result[0]))
            self.assertTrue(np.allclose(result_1gpus[1], result[1]))
def test_executor(self, executor, num_workers):
    """Compare `executor` against the "simple" executor on a ResNet-50 epoch.

    :param executor: Name of the executor under test.
    :param num_workers: Value written to the model proto's `num_workers`.
    """
    resnet_model = build_resnet50_dataparallel_model(
        num_gpus=workspace.NumCudaDevices(),
        batch_size=8,
        epoch_size=8,
    )
    resnet_model.Proto().num_workers = num_workers

    def run_one_epoch():
        run_resnet50_epoch(resnet_model, batch_size=8, epoch_size=8)

    self.compare_executors(
        resnet_model,
        ref_executor="simple",
        test_executor=executor,
        model_run_func=run_one_epoch,
    )
def test_timings(self):
    """Print the speedup of NCCLAllreduce over muji.Allreduce.

    For each device count n and for both the in-place and out-of-place
    variants, n random float32 vectors are fed to GPUs 0..n-1, both
    allreduce nets are benchmarked, and the time ratio is printed.
    Nothing is asserted.
    """
    # np.random.randn requires integer dimensions; the float literal 1e7
    # raises TypeError, so convert once up front.
    num_elements = int(1e7)

    # NOTE(review): range(2, N) stops at N - 1 devices, so the full device
    # count is never exercised -- confirm whether that is intended.
    for n in range(2, workspace.NumCudaDevices()):
        for in_place in [False, True]:
            xs = [
                np.random.randn(num_elements).astype(np.float32)
                for _ in range(n)
            ]
            inputs = [str("x_{}".format(i)) for i in range(n)]
            # Out-of-place outputs get an "o" prefix; in-place reuses the
            # input blob names.
            prefix = "" if in_place else "o"
            outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

            net = core.Net("test")
            net.NCCLAllreduce(inputs, outputs)
            net.RunAllOnGPU()
            for i, (name, x) in enumerate(zip(inputs, xs)):
                self.ws.create_blob(name).feed(x, gpu_device(i))
            self.ws.run(net)
            net_time = benchmark(self.ws, net)

            vanilla = core.Net("vanilla")
            muji.Allreduce(vanilla, inputs)
            vanilla_time = benchmark(self.ws, vanilla)
            print("Speedup for NCCL: {:.2f}".format(
                vanilla_time / net_time))
def getArgs():
    """Return command-line arguments.

    After parsing, the namespace is normalised:

    * ``dtype`` is converted from its string form to ``DataType.FLOAT16``
      or ``DataType.FLOAT``.
    * ``gpus`` is always a list of integer device ids and ``num_gpus`` its
      length. ``--all-gpus`` takes precedence over ``--gpus``, which takes
      precedence over ``--num_gpus``.
    """
    curdir = os.path.dirname(__file__)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--train-lmdb', help='Path to training LMDB',
                        default=os.path.join(curdir, 'cifar10_train_lmdb'))
    parser.add_argument('--test-lmdb', help='Path to test LMDB',
                        default=os.path.join(curdir, 'cifar10_test_lmdb'))
    parser.add_argument('--dtype', choices=['float', 'float16'],
                        default='float', help='Data type used for training')
    parser.add_argument('--gpus',
                        help='Comma separated list of GPU devices to use')
    parser.add_argument('--num_gpus', type=int, default=1,
                        help='Number of GPU devices (instead of --gpus)')
    parser.add_argument('--all-gpus', action='store_true',
                        help='Use all GPUs in the system')
    args = parser.parse_args()

    args.dtype = (DataType.FLOAT16 if args.dtype == 'float16'
                  else DataType.FLOAT)

    # Materialise range() so args.gpus has the same type (list) in every
    # branch; under Python 3 a bare range() is not a list.
    if args.all_gpus:
        args.num_gpus = workspace.NumCudaDevices()
        args.gpus = list(range(args.num_gpus))
    elif args.gpus is not None:
        args.gpus = [int(x) for x in args.gpus.split(',')]
        args.num_gpus = len(args.gpus)
    else:
        args.gpus = list(range(args.num_gpus))
    return args
def test_timings(self):
    """Print the speedup of NCCLAllreduce over muji.Allreduce.

    For each device count n and for both the in-place and out-of-place
    variants, n random float32 vectors are fed to GPUs 0..n-1 in the
    global workspace, both allreduce nets are benchmarked, and the time
    ratio is printed. Nothing is asserted.
    """
    # np.random.randn requires integer dimensions; the float literal 1e7
    # raises TypeError, so convert once up front.
    num_elements = int(1e7)

    # NOTE(review): range(2, N) stops at N - 1 devices, so the full device
    # count is never exercised -- confirm whether that is intended.
    for n in range(2, workspace.NumCudaDevices()):
        for in_place in [False, True]:
            xs = [
                np.random.randn(num_elements).astype(np.float32)
                for _ in range(n)
            ]
            inputs = [str("x_{}".format(i)) for i in range(n)]
            # Out-of-place outputs get an "o" prefix; in-place reuses the
            # input blob names.
            prefix = "" if in_place else "o"
            outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

            net = core.Net("test")
            net.NCCLAllreduce(inputs, outputs)
            net.RunAllOnGPU()
            for i, (name, x) in enumerate(zip(inputs, xs)):
                workspace.FeedBlob(
                    name, x, gpu_device(i).SerializeToString())
            workspace.RunNetOnce(net.Proto().SerializeToString())
            net_time = benchmark(net)

            vanilla = core.Net("vanilla")
            muji.Allreduce(vanilla, inputs)
            vanilla_time = benchmark(vanilla)
            print("Speedup for NCCL: {:.2f}".format(
                vanilla_time / net_time))
from __future__ import unicode_literals import errno import hypothesis.strategies as st from hypothesis import given import numpy as np import os import shutil import tempfile import unittest from caffe2.proto import caffe2_pb2 from caffe2.python import core, test_util, workspace if workspace.has_gpu_support: DEVICES = [caffe2_pb2.CPU, caffe2_pb2.CUDA] max_gpuid = workspace.NumCudaDevices() - 1 else: DEVICES = [caffe2_pb2.CPU] max_gpuid = 0 # Utility class for other loading tests, don't add test functions here # Inherit from this test instead. If you add a test here, # each derived class will inherit it as well and cause test duplication class TestLoadSaveBase(test_util.TestCase): def __init__(self, methodName, db_type='minidb'): super(TestLoadSaveBase, self).__init__(methodName) self._db_type = db_type @given(src_device_type=st.sampled_from(DEVICES),
def Train(args):
    """Build the data-parallel training (and optional test) model and train it.

    Steps, in order:

    1. Resolve the device list from ``args.gpus`` / ``args.num_gpus`` and
       validate batch and epoch sizes (``args.epoch_size`` is rounded down
       in place to a multiple of the global batch size).
    2. Set up the rendezvous for multi-shard runs (MPI environment, Redis
       store, or file store); single-shard runs use no rendezvous.
    3. Build the model selected by ``args.model`` on every device with
       ``data_parallel_model.Parallelize`` and initialise it.
    4. Optionally publish gradient key names/sizes to Redis when the
       ``GLOO_ALGORITHM`` environment variable equals ``PHUB``.
    5. Optionally build a test model when ``args.test_data`` is given.
    6. Optionally restore a checkpoint from ``args.load_model_path``.
    7. Run ``RunEpoch`` until ``args.num_epochs``, saving the model after
       every epoch and deleting the previous epoch's file.

    :param args: Parsed command-line namespace; mutated (``epoch_size``).
    :raises AssertionError: If the batch size is not divisible by the
        number of GPUs, or the epoch is smaller than one global batch.
    :raises NotImplementedError: If ``args.model`` names an unknown network.
    """
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Default to "no rendezvous" so the name is bound on every path,
    # including an mpirun launch that reports a world size of 1.
    rendezvous = None

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)
    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))
        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    def create_model_ops(model, loss_scale):
        """Forward-pass builder for the training net."""
        return create_model_ops_testable(model, loss_scale, is_test=False)

    def create_model_ops_test(model, loss_scale):
        """Forward-pass builder for the test net."""
        return create_model_ops_testable(model, loss_scale, is_test=True)

    # Model building functions
    def create_model_ops_testable(model, loss_scale, is_test=False):
        """Add the network chosen by ``args.model`` plus loss and accuracy.

        Returns a one-element list holding the scaled loss blob.
        """
        initializer = (PseudoFP16Initializer
                       if args.dtype == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            if args.model == "cifar10":
                if args.image_size != 32:
                    log.warning("Cifar10 expects a 32x32 image.")
                pred = models.cifar10.create_cifar10(
                    model,
                    "data",
                    image_channels=args.num_channels,
                    num_classes=args.num_labels,
                    image_height=args.image_size,
                    image_width=args.image_size,
                )
            elif args.model == "resnet32x32":
                if args.image_size != 32:
                    log.warning("ResNet32x32 expects a 32x32 image.")
                pred = models.resnet.create_resnet32x32(
                    model,
                    "data",
                    num_layers=args.num_layers,
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "resnet":
                if args.image_size != 224:
                    log.warning(
                        "ResNet expects a 224x224 image. input image = %d" %
                        args.image_size)
                pred = resnet.create_resnet50(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    no_bias=True,
                    no_loss=True,
                )
            elif args.model == "vgg":
                if args.image_size != 224:
                    log.warning("VGG expects a 224x224 image.")
                pred = vgg.create_vgg(model,
                                      "data",
                                      num_input_channels=args.num_channels,
                                      num_labels=args.num_labels,
                                      num_layers=args.num_layers,
                                      is_test=is_test)
            elif args.model == "googlenet":
                if args.image_size != 224:
                    log.warning("GoogLeNet expects a 224x224 image.")
                pred = googlenet.create_googlenet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnet":
                if args.image_size != 224:
                    log.warning("Alexnet expects a 224x224 image.")
                pred = alexnet.create_alexnet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnetv0":
                if args.image_size != 224:
                    log.warning("Alexnet v0 expects a 224x224 image.")
                pred = alexnet.create_alexnetv0(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            else:
                raise NotImplementedError("Network {} not found.".format(
                    args.model))

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        """Attach step-policy SGD (fp16 or multi-precision) and return it."""
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(model,
                                                      args.base_learning_rate,
                                                      momentum=0.9,
                                                      nesterov=1,
                                                      policy="step",
                                                      stepsize=stepsz,
                                                      gamma=0.1)
        print("info:===============================" + str(opt))
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":

        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(train_model,
                                    input_builder_fun=add_image_input,
                                    forward_pass_builder_fun=create_model_ops,
                                    optimizer_builder_fun=add_optimizer,
                                    post_sync_builder_fun=add_post_sync_ops,
                                    devices=gpus,
                                    rendezvous=rendezvous,
                                    optimize_gradient_memory=False,
                                    cpu_device=args.use_cpu,
                                    shared_model=args.use_cpu,
                                    combine_spatial_bn=args.use_cpu,
                                    use_nccl=args.use_nccl)

    if args.model_parallel:
        # Shift half of the activations to another GPU
        # NOTE(review): this uses args.num_gpus rather than the local
        # num_gpus/gpus derived from --gpus; the two differ when an explicit
        # device list is given -- confirm which is intended.
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g
                    for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    if os.environ.get("GLOO_ALGORITHM") == "PHUB":
        # PHub needs to know which elements require aggregation and their
        # sizes. At this stage only the key names and sizes are published.
        grad_names = list(reversed(train_model._grad_names))
        phubKeyNames = ["allreduce_{}_status".format(x) for x in grad_names]
        caffe2GradSizes = dict(
            zip([
                data_parallel_model.stripBlobName(name) + "_grad"
                for name in train_model._parameters_info.keys()
            ], [x.size for x in train_model._parameters_info.values()]))
        phubKeySizes = [str(caffe2GradSizes[x]) for x in grad_names]
        # Only shard 0 publishes. The local shard_id equals
        # rendezvous["shard_id"] whenever a rendezvous exists, and remains
        # usable when rendezvous is None (single shard).
        if shard_id == 0:
            r = redis.StrictRedis()
            joinedStr = ",".join(phubKeyNames)
            r.set("[PLink]IntegrationKeys", joinedStr)
            joinedStr = ",".join(phubKeySizes)
            r.set("[PLink]IntegrationKeySizes", joinedStr)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet50_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops_test,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    # NOTE(review): the experiment name uses args.num_gpus, which is not
    # updated when an explicit --gpus list is given -- confirm.
    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
from caffe2.python import ( brew, core, device_checker, gradient_checker, model_helper, test_util, workspace, ) from caffe2.python.gradient_checker import NetGradientChecker from caffe2.python.net_builder import ops, NetBuilder from caffe2.proto import caffe2_pb2 import unittest if workspace.has_gpu_support and workspace.NumCudaDevices() > 0: gpu_device_option = caffe2_pb2.DeviceOption() gpu_device_option.device_type = caffe2_pb2.CUDA cpu_device_option = caffe2_pb2.DeviceOption() gpu_device_checker = device_checker.DeviceChecker(0.01, [gpu_device_option]) device_checker = device_checker.DeviceChecker( 0.01, [gpu_device_option, cpu_device_option]) gpu_gradient_checkers = [ gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option, "gpu_checker_ws"), ] gradient_checkers = [ gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option, "gpu_checker_ws"), gradient_checker.GradientChecker(0.01, 0.05, cpu_device_option,
from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import unittest from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace, data_parallel_model, cnn, recurrent from caffe2.python.test_util import TestCase @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.") class GPUDataParallelModelTest(TestCase): def run_model(self, gpu_devices): ''' Helper function for test_equiv ''' def input_builder_fun(model): return None def model_build_fun(model, loss_scale): fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {})) fc_fl = model.FlattenToVec(fc, "fc_fl") sigm = model.Sigmoid(fc_fl, "sigm") sq = model.SquaredL2Distance([sigm, "label"], "sq") loss = model.AveragedLoss(sq, "loss") loss = model.Scale(loss, scale=loss_scale) return [loss]
class CopyOpsTest(unittest.TestCase):
    """Tests for the Copy / CopyCPUToGPU / CopyGPUToCPU operators and their gradients.

    NOTE(review): the nesting of the `with` blocks below was reconstructed
    from whitespace-flattened text -- confirm against the original file.
    """

    def tearDown(self):
        # Reset workspace after each test
        # Otherwise, the multi-GPU test will use previously created tensors,
        # which may have been placed on the wrong device
        workspace.ResetWorkspace()

    def run_test_copy_gradient(self, device_opt):
        """Copy x to y on `device_opt` and check values and gradients match.

        Asserts that blob y equals blob x and that the gradient blobs
        recorded for x and y in the gradient map are equal.
        """
        model = model_helper.ModelHelper(name="copy_test")
        with core.DeviceScope(device_opt):
            x = model.net.AddExternalInputs("x")
            y = model.Copy(x, "y")
            loss = model.AveragedLoss(y, "loss")
            gradient_map = model.AddGradientOperators([loss])
            workspace.FeedBlob(x, np.random.rand(32).astype(np.float32))
            workspace.RunNetOnce(model.param_init_net)
            workspace.RunNetOnce(model.net)
            self.assertTrue(
                np.array_equal(
                    workspace.FetchBlob(x),
                    workspace.FetchBlob(y),
                ))
            self.assertTrue(
                np.array_equal(
                    workspace.FetchBlob(gradient_map[x]),
                    workspace.FetchBlob(gradient_map[y]),
                ))

    def test_copy_gradient_cpu(self):
        """Run the copy-gradient check on CPU device 0."""
        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0))

    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
    def test_copy_gradient_gpu(self):
        """Run the copy-gradient check on CUDA device 0."""
        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CUDA, 0))

    @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPU.")
    def test_copy_gradient_multiple_gpus(self):
        """Copy CPU -> CUDA 0 -> CUDA 1 and check values, gradients and op devices."""
        model = model_helper.ModelHelper(name="copy_test")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            x_cpu = model.net.AddExternalInputs("x_cpu")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 1)):
            x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2")
            loss = model.AveragedLoss(x_gpu_2, "loss")
            gradient_map = model.AddGradientOperators([loss])

        workspace.FeedBlob("x_cpu", np.random.rand(32).astype(np.float32))
        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)

        self.assertTrue(
            np.array_equal(
                workspace.FetchBlob("x_gpu_1"),
                workspace.FetchBlob("x_gpu_2"),
            ))
        self.assertTrue(
            np.array_equal(
                workspace.FetchBlob(gradient_map["x_gpu_1"]),
                workspace.FetchBlob(gradient_map["x_gpu_2"]),
            ))

        def get_op_with_output(model, output_blob_name):
            # Return the first op in the net whose only output is
            # `output_blob_name`, or None when there is no such op.
            for op in model.net.Proto().op:
                if len(op.output) == 1 and op.output[0] == output_blob_name:
                    return op
            return None

        # The op producing each gradient blob must carry the expected
        # device option.
        self.assertEqual(
            get_op_with_output(model, "x_gpu_2_grad").device_option,
            core.DeviceOption(caffe2_pb2.CUDA, 1),
        )
        self.assertEqual(
            get_op_with_output(model, "x_cpu_grad").device_option,
            core.DeviceOption(caffe2_pb2.CUDA, 0),
        )

    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
        """Check that the gradient of `v` through a CPU Gather is a GradientSlice."""
        model = model_helper.ModelHelper(name="copy_test")
        v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
        # NOTE(review): `indices` is written to the same blob name "v" as the
        # line above -- looks like a copy/paste slip; confirm intent.
        indices = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)

        with core.DeviceScope(gpu_opt):
            vcpu = model.CopyGPUToCPU(v, "vcpu")

        with core.DeviceScope(cpu_opt):
            g = model.Gather([vcpu, indices], "g")

        with core.DeviceScope(gpu_opt):
            ggpu = model.CopyCPUToGPU(g, "ggpu")
            f = brew.fc(model, ggpu, "out", dim_in=4, dim_out=6)
            (softmax, loss) = model.SoftmaxWithLoss(
                [f, "label"],
                ["softmax", "loss"],
            )
        gradient_map = model.AddGradientOperators([loss])
        self.assertTrue("v" in gradient_map)
        self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))

    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_gradients(self):
        """Train one step of a CPU -> GPU -> CPU model and verify the update.

        Every parameter is updated with WeightedSum using weights ONE (1.0)
        and LR (-2.0); afterwards each parameter must equal its initial
        value minus 2.0 times its gradient.
        """
        model = model_helper.ModelHelper(name="copy_test")

        batch = 32
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)

        with core.NameScope("cpu"):
            with core.DeviceScope(cpu_opt):
                x_cpu = brew.fc(model, 'data', 'x_cpu', 16, 8)

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                x_gpu = model.CopyCPUToGPU(x_cpu, "x_gpu")
                pred_gpu = brew.fc(model, x_gpu, "pred_gpu", 8, 4)
                pred_cpu = model.CopyGPUToCPU(pred_gpu, "pred_cpu")

        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                (softmax, loss) = model.SoftmaxWithLoss(
                    [pred_cpu, "label"],
                    ["softmax", "loss"],
                )

        gradient_map = model.AddGradientOperators([loss])

        # Add param updates (for cpu and gpu)
        init_net = model.param_init_net
        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.DeviceScope(cpu_opt):
            workspace.FeedBlob(
                'cpu/data',
                np.random.rand(batch, 16).astype(np.float32),
            )
            workspace.FeedBlob(
                'cpu/label',
                np.random.randint(4, size=batch).astype(np.int32),
            )

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)

        # Snapshot parameters before and after a single run of the net.
        initial_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}
        workspace.RunNet(model.net.Proto().name)
        updated_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}

        for p in model.GetParams():
            g = gradient_map[p]
            expected = initial_params[p] - 2.0 * workspace.FetchBlob(g)
            actual = updated_params[p]
            self.assertTrue(
                np.array_equal(expected, updated_params[p]),
                "Mismatch: {}: {}, {}".format(p, expected, actual),
            )
for scoped_name, blob in restored_all_params.items(): unscoped_name = c2_utils.UnscopeName(scoped_name) np.testing.assert_array_equal(blob, orig_gpu_0_params[unscoped_name]) if __name__ == '__main__': workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) logger = utils.logging.setup_logging(__name__) logger.setLevel(logging.DEBUG) logging.getLogger('roi_data.loader').setLevel(logging.INFO) np.random.seed(cfg.RNG_SEED) output_dir = tempfile.mkdtemp() # Generate config for test cfg.MODEL.TYPE = 'generalized_rcnn' cfg.MODEL.CONV_BODY = 'FPN.add_fpn_ResNet50_conv5_body' cfg.MODEL.NUM_CLASSES = 81 cfg.MODEL.FASTER_RCNN = True cfg.FPN.FPN_ON = True cfg.FPN.MULTILEVEL_ROIS = True cfg.FPN.MULTILEVEL_RPN = True cfg.FAST_RCNN.ROI_BOX_HEAD = 'fast_rcnn_heads.add_roi_2mlp_head' cfg.FAST_RCNN.ROI_XFORM_METHOD = 'RoIAlign' cfg.OUTPUT_DIR = output_dir cfg.TRAIN.DATASETS = ('coco_2014_minival',) cfg.TRAIN.WEIGHTS = b'' for num_gpu in range(workspace.NumCudaDevices()): cfg.immutable(False) cfg.NUM_GPUS = num_gpu + 1 assert_and_infer_cfg() test_restore_checkpoint()
def bmuf_process(filestore_dir, process_id, shared_results, nesterov=False):
    """Run one shard of a two-shard BMUF test and record intermediate blobs.

    Process 0 uses GPUs 0 and 1, any other process uses GPUs 2 and 3. The
    function returns early (after logging) when there is no GPU support or
    fewer than four CUDA devices.

    :param filestore_dir: Path handed to FileStoreHandlerCreate for the
        rendezvous store.
    :param process_id: Shard id of this process; also selects the GPUs.
    :param shared_results: Mapping that receives this process's results
        dict under the key ``process_id``.
    :param nesterov: Forwarded to ``Parallelize_GPU_BMUF``.

    NOTE(review): statement nesting was reconstructed from
    whitespace-flattened text -- confirm against the original file.
    """
    # We need to import caffe2 in every process to initialize CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, workspace, dyndep
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not workspace.has_gpu_support:
        log.info('No GPU support test is Ignored.')
        return

    if workspace.NumCudaDevices() < 4:
        log.info('Not enough GPU support, test IGNORED')
        return

    model = cnn.CNNModelHelper(order="NHWC", name="test")
    gpu_ids = [0, 1] if process_id == 0 else [2, 3]

    def _model_build_fun(model, loss_scale):
        # FC -> flatten -> sigmoid -> squared L2 distance to "label" ->
        # averaged loss, scaled by loss_scale.
        fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                      ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        # Inputs are fed directly into the workspace by _generate_data.
        return None

    def _param_update_fun(model):
        # Update every parameter with WeightedSum over (param, ONE, grad, LR),
        # using a fixed learning-rate policy with base_lr -0.1.
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [],
            "ONE",
            shape=[1],
            value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(gpu_devices, process_id):
        # Seed depends on the process id, so each process gets its own data.
        np.random.seed(26 + process_id * 10)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            # Give each device its contiguous slice of the batch; each pass
            # of the outer loop overwrites the same blobs.
            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

    _generate_data(gpu_ids, process_id)

    workspace.RunOperatorOnce(
        core.CreateOperator("FileStoreHandlerCreate", [], ["store_handler"],
                            path=filestore_dir))
    rendezvous = dict(kv_handler="store_handler",
                      shard_id=process_id,
                      num_shards=2,
                      engine="GLOO",
                      exit_nets=None)

    data_parallel_model.Parallelize_GPU_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=gpu_ids,
        rendezvous=rendezvous,
        nesterov=nesterov,
        add_blobs_to_sync=["sync_num"],
    )

    data_parallel_model.RunInitNet(model)

    def _gpu_pid(gpu_id, pid):
        # Map a per-process GPU index (0 or 1) to the device id used by
        # that process: process 1 is offset by two.
        if pid == 1:
            return gpu_id + 2
        return gpu_id

    # The fc_w_v blob must be all zeros right after initialisation.
    np.testing.assert_equal(
        workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id))),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {}
    v_b_ = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w_ = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))

    results['v_b_'] = v_b_
    results['v_w_'] = v_w_

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_0_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_1_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    w_1_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))

    results['b_0_'] = b_0_
    results['w_0_'] = w_0_
    results['b_1_'] = b_1_
    results['w_1_'] = w_1_

    # Test sync
    if process_id == 0:
        workspace.FeedBlob(model._device_prefix + "_0/sync_num",
                           np.array([2603]).astype(np.float32),
                           device_option=core.DeviceOption(
                               model._device_type, 0))

    # Compute block gradients.
    b_g_ = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_g_ = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    results['b_g_'] = b_g_
    results['w_g_'] = w_g_
    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    # Values below are fetched after the global update net has run.
    v_b = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))
    w_g = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    b_g = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_0 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_0 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_1 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))
    b_1 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    results['v_b'] = v_b
    results['v_w'] = v_w
    results['w_g'] = w_g
    results['b_g'] = b_g
    results['w_0'] = w_0
    results['b_0'] = b_0
    results['w_1'] = w_1
    results['b_1'] = b_1

    # Test add_blobs_to_sync
    for j in model._devices:
        sync = workspace.FetchBlob(model._device_prefix +
                                   "_{}/sync_num".format(j))[0]
        results['sync_{}'.format(j)] = sync

    shared_results[process_id] = results
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
):
    '''
    Function to create a model that can run on many GPUs.

    The same input/forward operators are added once per device inside a
    "gpu_<id>" name scope; gradients are then all-reduced and the
    per-device parameter update operators are appended.

      model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper.
                        Must not contain any params yet (asserted).
      input_builder_fun:
                        Function that adds the input operators
                        Note: Remember to instantiate reader outside of this
                        function so all GPUs share same reader object.
                        Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient.
                        Signature: forward_pass_builder_fun(model)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying. Function is also passed the learning
                        rate scaling factor. You should multiply the learning
                        rate by the factor to maintain invariant of same
                        results with same total batch size, regardless of
                        number of gpus.
                        Signature: param_update_builder_fun(model, lr_scale)
                        If None, only the forward pass is built and the
                        function returns early.
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. Its 'num_shards' entry is
                        read here to compute lr_scale.
                        To create rendezvous, use <TBD>.
      net_type:         Network type, written to the net proto's `type` field.
      broadcast_computed_params:
                        If true, _BroadcastComputedParams is invoked before
                        the gradient all-reduce is added.
    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    # Worker-thread heuristic: 4 per device, plus 8 more when distributed.
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    # NOTE(review): these checks run after the model has already been
    # mutated above; a failing assert leaves those attributes set.
    assert isinstance(model_helper_obj, model_helper.ModelHelperBase)
    assert model_helper_obj.params == [], "Model needs to be empty"

    # Add input and model
    log.info("Create input and model training operators")

    # Maps device id -> whatever forward_pass_builder_fun returned for it.
    losses_by_gpu = {}
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params)
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    # Forward-only (e.g. test) nets stop here: no gradients, no syncs.
    if (param_update_builder_fun is None):
        log.info("Parameter update function not defined --> only forward")
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    # Params without an entry in param_to_grad are skipped.
    grads_ordered = [param_to_grad[p] for p in
                     model_helper_obj.params if p in param_to_grad]

    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)

    _AllReduceGradients(
        devices, model_helper_obj, rendezvous
    )

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # Scale by the total number of replicas: local devices times shards.
    lr_scale = 1.0 / (len(devices) * num_shards)
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj, lr_scale)

    _AnalyzeOperators(model_helper_obj)

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None):
        # param_init_net is deliberately passed twice, matching the
        # helper's two net arguments.
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)
def testAllreduceSingleGPU(self):
    """Exercise muji.Allreduce on every CUDA device, one device at a time."""
    num_gpus = workspace.NumCudaDevices()
    for gpu_id in range(num_gpus):
        single_device = [gpu_id]
        self.RunningAllreduceWithGPUs(single_device, muji.Allreduce)
class NCCLOpsTest(hu.HypothesisTestCase):
    """Reference checks for the NCCL collective operators.

    Each test builds one float32 input per GPU (``n`` GPUs, ``m`` elements),
    runs the NCCL operator through ``assertReferenceChecks`` and compares
    it against a NumPy reference implementation.
    """

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000),
           in_place=st.booleans())
    def test_nccl_allreduce(self, n, m, in_place):
        """NCCLAllreduce: every output equals the elementwise sum of inputs."""
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        # In-place mode reuses the input names as outputs.
        prefix = "" if in_place else "o"
        outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
        # Pin the i-th input blob to the i-th GPU.
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)
        }

        def allreduce(*args):
            assert len(args) == n
            output = np.sum(args, axis=0)
            return [output for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, xs, allreduce, input_device_options)
        # All replicas must agree bit-for-bit, not merely within tolerance.
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumCudaDevices() - 1))
    def test_nccl_broadcast(self, n, m, root):
        """NCCLBroadcast: every blob ends up equal to the root's input."""
        # root is drawn independently of n; discard invalid combinations.
        assume(root < n)
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)
        }

        def broadcast(*args):
            assert len(args) == n
            return [args[root] for _ in range(n)]

        self.assertReferenceChecks(
            hu.gpu_do, op, xs, broadcast, input_device_options)

    @given(
        n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
        m=st.integers(min_value=1, max_value=1000),
        # NCCL Reduce seems to deadlock for non-zero roots.
        root=st.integers(min_value=0, max_value=0),
        in_place=st.booleans())
    def test_nccl_reduce(self, n, m, root, in_place):
        """NCCLReduce: the single output is the elementwise sum of inputs."""
        assume(in_place is False or root == 0)
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator(
            "NCCLReduce", inputs,
            inputs[root] if in_place else b"o",
            root=root)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)
        }

        def reduce(*args):
            assert len(args) == n
            return [np.sum(args, axis=0)]

        self.assertReferenceChecks(
            hu.gpu_do, op, xs, reduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_allgather(self, n, m):
        """NCCLAllGather: every output is the stack of all inputs."""
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllGather", inputs, outputs)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)
        }

        def allgather(*args):
            assert len(args) == n
            return [np.stack(args, axis=0) for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, xs, allgather, input_device_options)
        # All replicas must agree bit-for-bit.
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_reduce_scatter(self, n, m):
        """NCCLReduceScatter: output i is row i of the summed (n, m) inputs."""
        xs = [np.random.randn(n, m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLReduceScatter", inputs, outputs)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)
        }

        def reduce_scatter(*args):
            assert len(args) == n
            reduced = sum(args)
            assert len(reduced.shape) > 1
            ref = [reduced[i, :] for i in range(n)]
            return ref

        self.assertReferenceChecks(
            hu.gpu_do, op, xs, reduce_scatter, input_device_options)

    # Leading underscore keeps the test runner from collecting this test.
    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=100000, max_value=100000),
           iters=st.integers(min_value=1, max_value=100),
           net_type=st.sampled_from(["dag", "async_dag", "simple"]))
    def _test_nccl_sync(self, n, m, iters, net_type):
        """Accumulate `iters` ones on each GPU, then reduce onto GPU 0."""
        inputs = [str("x_{}".format(i)) for i in range(n)]
        extra_inputs = [str("xe_{}".format(i)) for i in range(n)]
        net = core.Net("asdf")
        net.Proto().type = net_type
        net.Proto().num_workers = n
        for i in range(n):
            net.ConstantFill([], inputs[i], shape=[m], value=0.0,
                             device_option=gpu_device(i))
            net.ConstantFill([], extra_inputs[i], shape=[m], value=1.0,
                             device_option=gpu_device(i))
            # After this loop x_i holds `iters` in every element.
            for _ in range(iters):
                net.Sum([inputs[i], extra_inputs[i]], [inputs[i]],
                        device_option=gpu_device(i))
        net.NCCLReduce(inputs, [inputs[0]], device_option=gpu_device(0))
        self.ws.run(net)
        np.testing.assert_array_equal(
            self.ws.blobs[inputs[0]].fetch(),
            np.full(shape=(m, ), fill_value=iters * n, dtype=np.float32))

    @unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
    def test_timings(self):
        """Print the NCCLAllreduce speedup over muji.Allreduce per GPU count."""
        # Inclusive upper bound so the full device count is benchmarked too
        # (and a 2-GPU host runs at least one configuration).
        for n in range(2, workspace.NumCudaDevices() + 1):
            for in_place in [False, True]:
                # randn requires integer dimensions; the literal 1e7 is a
                # float and is rejected by NumPy.
                xs = [
                    np.random.randn(int(1e7)).astype(np.float32)
                    for _ in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
                # Run once before timing.
                self.ws.run(net)
                net_time = benchmark(self.ws, net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(self.ws, vanilla)
                print("Speedup for NCCL: {:.2f}".format(
                    vanilla_time / net_time))
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
):
    '''
    Function to create a model that can run on many GPUs.

    The same input/forward operators are added once per device inside a
    "gpu_<id>" name scope; gradients are then all-reduced and the
    per-device parameter update operators are appended.

      model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper.
                        Params already present before this call are treated
                        as non-data-parallel and handled separately.
      input_builder_fun:
                        Function that adds the input operators
                        Note: Remember to instantiate reader outside of this
                        function so all GPUs share same reader object.
                        Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient. Loss scale parameter
                        is passed, as you should scale the loss of your model
                        by 1.0 / the total number of gpus.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying.
                        Signature: param_update_builder_fun(model)
                        If None, only the forward pass is built and the
                        function returns early.
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. Its 'num_shards' entry is
                        read here and must be > 1 when training (asserted).
                        To create rendezvous, use <TBD>.
      net_type:         Network type, written to the net proto's `type` field.
      broadcast_computed_params:
                        If true, _BroadcastComputedParams is invoked before
                        the gradient all-reduce is added.
      optimize_gradient_memory:
                        If true, _OptimizeGradientMemory is invoked as the
                        last step.
    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    # Worker-thread heuristic: 4 per device, plus 8 more when distributed.
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    # NOTE(review): this check runs after the model has already been
    # mutated above; a failing assert leaves those attributes set.
    assert isinstance(model_helper_obj, model_helper.ModelHelperBase)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    # Maps device id -> whatever forward_pass_builder_fun returned for it.
    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # Scale by the total number of replicas: local devices times shards.
    loss_scale = 1.0 / (len(devices) * num_shards)

    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params, [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    # Forward-only (e.g. test) nets stop here: no gradients, no syncs.
    if (param_update_builder_fun is None):
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    # Params without an entry in param_to_grad are skipped.
    grads_ordered = [
        param_to_grad[p] for p in model_helper_obj.params if p in param_to_grad
    ]
    # NOTE(review): unlike grads_ordered this lookup is unguarded, so a
    # pre-existing param with no gradient raises KeyError here.
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(devices, grads_ordered, non_datapar_grads)
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)

    _AllReduceGradients(devices, model_helper_obj, rendezvous)

    log.info("Post-iteration operators for updating params")
    # Recomputed with the same expression as above; value is unchanged.
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # The following check is necessary for ring reduce to work
    if rendezvous is not None:
        assert num_shards > 1, \
            "Please use more than one shard for distributed training"
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None):
        # param_init_net is deliberately passed twice, matching the
        # helper's two net arguments.
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)

    if optimize_gradient_memory:
        _OptimizeGradientMemory(model_helper_obj, losses_by_gpu, devices)
def testGetCudaPeerAccessPattern(self):
    """Peer-access pattern must be a square 2-D ndarray sized by GPU count."""
    access = workspace.GetCudaPeerAccessPattern()
    self.assertEqual(type(access), np.ndarray)
    self.assertEqual(access.ndim, 2)
    # Safe to unpack: the rank was just asserted to be 2.
    num_rows, num_cols = access.shape
    self.assertEqual(num_rows, num_cols)
    self.assertEqual(num_rows, workspace.NumCudaDevices())
max_dim=4, dtype=np.float32, elements=None, **kwargs): dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim) return dims_.flatmap(lambda dims: st.lists( arrays(dims, dtype, elements), min_size=n, max_size=n)) cpu_do = caffe2_pb2.DeviceOption() gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA) device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) # Include device option for each GPU expanded_device_options = [cpu_do] + ([ caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) for i in range(workspace.NumCudaDevices()) ] if workspace.has_gpu_support else []) def device_checker_device_options(): return st.just(device_options) def gradient_checker_device_option(): return st.sampled_from(device_options) gcs = dict(gc=gradient_checker_device_option(), dc=device_checker_device_options()) gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do]))
# debugging and profiling parser.add_argument("--print-freq", type=int, default=1) parser.add_argument("--print-time", action="store_true", default=False) parser.add_argument("--debug-mode", action="store_true", default=False) parser.add_argument("--enable-profiling", action="store_true", default=False) parser.add_argument("--plot-compute-graph", action="store_true", default=False) args = parser.parse_args() ### some basic setup ### np.random.seed(args.numpy_rand_seed) np.set_printoptions(precision=args.print_precision) use_gpu = args.use_gpu if use_gpu: device_opt = core.DeviceOption(workspace.GpuDeviceType, 0) ngpus = workspace.NumCudaDevices() # 1 print("Using {} GPU(s)...".format(ngpus)) else: device_opt = core.DeviceOption(caffe2_pb2.CPU) print("Using CPU...") ### prepare training data ### ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") if args.data_generation == "dataset": # input and target data (nbatches, lX, lS_l, lS_i, lT, nbatches_test, lX_test, lS_l_test, lS_i_test, lT_test, ln_emb, m_den) = dc.read_dataset( args.data_set, args.mini_batch_size, args.data_randomize, args.num_batches, True, args.raw_data_file, args.processed_data_file) ln_bot[0] = m_den
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun=None,
    optimizer_builder_fun=None,
    post_sync_builder_fun=None,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
    use_nccl=False,
    max_concurrent_distributed_ops=4,
):
    '''
    Function to create a model that can run on many GPUs.

    The same input/forward operators are added once per device inside a
    "gpu_<id>" name scope; gradients are then all-reduced, parameter
    update operators are appended and initial parameter syncs are added.

      model_helper_obj: an object of ModelHelper, such as CNNModelHelper.
                        Params already present before this call are treated
                        as non-data-parallel and handled separately.
      input_builder_fun:
                        Function that adds the input operators
                        Note: Remember to instantiate reader outside of this
                        function so all GPUs share same reader object.
                        Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient. Loss scale parameter
                        is passed, as you should scale the loss of your model
                        by 1.0 / the total number of gpus.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying. This is called for each GPU separately.
                        Signature: param_update_builder_fun(model)
      optimizer_builder_fun:
                        Alternative to param_update_builder_fun, allows one
                        to add an optimizer for the whole model. Called only
                        once, without name or devicescope.
                        At most one of param_update_builder_fun and
                        optimizer_builder_fun may be given (asserted); if
                        neither is given only the forward pass is built.
      post_sync_builder_fun:
                        Function applied after initial parameter sync has been
                        completed, such as keeping multi-precision parameters
                        in sync.
                        Signature: post_sync_builder_fun(model)
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation, if None
                        then only one node is used. Its 'num_shards' entry is
                        read here. To create rendezvous, use <TBD>.
      net_type:         Network type, written to the net proto's `type` field.
      broadcast_computed_params:
                        If true, _BroadcastComputedParams is invoked before
                        the gradient all-reduce is added.
      optimize_gradient_memory:
                        whether to apply 'memonger' to share blobs in gradient
                        computation to reduce memory footprint
      use_nccl:         Forwarded to _BroadcastComputedParams and
                        _AllReduceGradients.
      max_concurrent_distributed_ops:
                        Clamped to at most num_workers - 1 and forwarded to
                        _AllReduceGradients.
    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    # Worker-thread heuristic: 4 per device, plus 8 more when distributed.
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    num_workers = len(devices) * 4 + extra_workers
    max_concurrent_distributed_ops =\
        min(max_concurrent_distributed_ops, num_workers - 1)
    model_helper_obj.net.Proto().num_workers = num_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    # NOTE(review): this check runs after the model has already been
    # mutated above; a failing assert leaves those attributes set.
    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    # Maps device id -> whatever forward_pass_builder_fun returned for it.
    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # Scale by the total number of replicas: local devices times shards.
    loss_scale = 1.0 / (len(devices) * num_shards)

    has_parameter_updates = param_update_builder_fun is not None or \
        optimizer_builder_fun is not None
    assert not (
        param_update_builder_fun is not None and
        optimizer_builder_fun is not None
    ), 'Can only specify one of param_update_builder_fun, optimizer_builder_fun'

    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if has_parameter_updates:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.GetComputedParams(''), [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    # Forward-only (e.g. test) nets stop here: no gradients, no syncs.
    if not has_parameter_updates:
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    # Params without an entry in param_to_grad are skipped.
    grads_ordered = [param_to_grad[p] for p in
                     model_helper_obj.params if p in param_to_grad]
    # NOTE(review): unlike grads_ordered this lookup is unguarded, so a
    # pre-existing param with no gradient raises KeyError here.
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
        non_datapar_grads
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()
    model_helper_obj._losses_by_gpu = losses_by_gpu

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous, use_nccl)

    # Skip the all-reduce entirely when no gradients were registered.
    if len(model_helper_obj._grad_names) > 0:
        _AllReduceGradients(
            devices,
            model_helper_obj,
            rendezvous,
            use_nccl,
            max_concurrent_distributed_ops,
        )
    else:
        log.info("NOTE: Param builder function did not create any parameters.")

    log.info("Post-iteration operators for updating params")
    # Recomputed with the same expression as above; value is unchanged.
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']

    if param_update_builder_fun is not None:
        # Per-device update ops, each under its own device and name scope.
        for device in devices:
            device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
            with core.DeviceScope(device_opt):
                with core.NameScope("gpu_{}".format(device)):
                    param_update_builder_fun(model_helper_obj)
    else:
        # Whole-model optimizer: called once, outside any scope.
        log.info("Calling optimizer builder function")
        optimizer_builder_fun(model_helper_obj)

    (sync_blobs, sync_names) = _ComputeBlobsToSync(model_helper_obj)
    sync_blobs_grouped = _GroupByDevice(
        devices,
        sync_blobs,
        [],
    )
    model_helper_obj._device_grouped_blobs.update(sync_blobs_grouped)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    # Cross-node sync is only added when there is more than one shard.
    if (rendezvous is not None and num_shards > 1):
        # param_init_net is deliberately passed twice, matching the
        # helper's two net arguments.
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
            sync_names,
        )

    _SyncParams(
        devices, model_helper_obj, model_helper_obj.param_init_net, sync_names
    )

    # Handle any operations that need to be done after parameter sync
    # i.e. making sure multi-precision copies of parameters are up-to-date
    if post_sync_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
            with core.DeviceScope(device_opt):
                with core.NameScope("gpu_{}".format(device)):
                    post_sync_builder_fun(model_helper_obj)

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(model_helper_obj, losses_by_gpu, devices)

    # Record the init net and main net on the model.
    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
    ]
    model_helper_obj._data_parallel_model_nets = [model_helper_obj.net]
def Train(args):
    """Build the data-parallel train (and optional test) model and run it.

    Steps, in order: resolve the GPU list, validate batch/epoch sizes, set
    up the optional distributed rendezvous (MPI, Redis or file store),
    build the parallelized training model, optionally build a test model,
    optionally restore a checkpoint, then run ``RunEpoch`` until
    ``args.num_epochs`` is reached, saving a model after every epoch and
    deleting the previous epoch's file.

    :param args: parsed command-line namespace; this function reads, among
        others, ``gpus``, ``num_gpus``, ``batch_size``, ``epoch_size``,
        ``num_shards``, ``shard_id``, ``model_config``, ``train_data``,
        ``test_data``, ``load_model_path`` and ``num_epochs``.
        ``args.epoch_size`` is overwritten with the rounded-down value.
    """
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="ban-pc-resnet50", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Default to single-node training. Without this, running under mpirun
    # with a world size of 1 left `rendezvous` unbound and the Parallelize
    # call below failed with a NameError.
    rendezvous = None

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    # Model configs for constructing model.
    # safe_load: the config is plain data, so the full loader's ability to
    # construct arbitrary Python objects is unnecessary (and unsafe); calling
    # yaml.load() without a Loader is also rejected by recent PyYAML.
    with open(args.model_config) as f:
        model_config = yaml.safe_load(f)

    # Model building functions
    def create_target_model_ops(model, loss_scale):
        """Add the forward pass and softmax loss; return ``[loss]``.

        ``loss_scale`` is accepted to satisfy the builder signature but is
        not used here.
        """
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = add_se_model(model, model_config, "data", is_test=False)

        # The loss is computed in fp32 even when the model runs in fp16.
        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        loss = add_softmax_loss(model, pred, 'label')
        brew.accuracy(model, ['softmax', 'label'], 'accuracy')
        return [loss]

    def add_optimizer(model):
        """Add weight decay and a multi-precision SGD optimizer.

        Momentum, nesterov, LR policy, power and max_iter come from the
        ``solver`` section of the model config.
        """
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            base_learning_rate=args.base_learning_rate,
            momentum=model_config['solver']['momentum'],
            nesterov=model_config['solver']['nesterov'],
            policy=model_config['solver']['lr_policy'],
            power=model_config['solver']['power'],
            max_iter=model_config['solver']['max_iter'],
        )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared with between all GPUS.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        """Add the training image input op for one device."""
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
            dtype=args.dtype,
            is_test=False,
        )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_target_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    if args.model_parallel:
        # Shift half of the activations to another GPU
        # NOTE(review): this uses args.num_gpus rather than the resolved
        # `gpus` list, so it presumably assumes --gpus was not given.
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="ban-pc-resnet50_test",
            arg_scope=test_arg_scope,
            init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            """Add the test image input op for one device."""
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        # NOTE(review): the forward builder is shared with training and
        # passes is_test=False to add_se_model -- confirm this is intended.
        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_target_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "log/{}/resnet50_gpu{}_b{}_L{}_lr{:.2f}_v2".format(
        args.dataset_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Load pretrained param_init_net
    load_init_net_multigpu(args)

    # Run the training one epoch a time
    best_accuracy = 0
    while epoch < args.num_epochs:
        epoch, best_accuracy = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog,
            best_accuracy,
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
def Parallelize_GPU_BMUF(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    block_learning_rate=1.0,
    block_momentum=None,
    devices=None,
    net_type='dag',
    master_gpu=None,
    optimize_gradient_memory=False,
    reset_momentum_sgd=False
):
    '''
    Function to create a model that runs on many GPUs and creates a net for
    parameter updates that can be run independently for a number of
    iterations, followed by another net that runs once to compute the final
    parameter updates according to the block-wise model update filtering
    rule described in: Scalable Training of Deep Learning Machines by
    Incremental Block Training with Intra-block Parallel Optimization and
    Blockwise Model-Update Filtering (ICASSP 2016).

    :param model_helper_obj: `model_helper.ModelHelper` instance to extend.
        The nets and bookkeeping built here are attached to it as
        underscore-prefixed attributes.
    :param input_builder_fun: Called as `input_builder_fun(model_helper_obj)`
        once per device.
    :param forward_pass_builder_fun: Called as
        `forward_pass_builder_fun(model_helper_obj, loss_scale)` once per
        device, with `loss_scale = 1.0 / len(devices)`. Its return value is
        stored as that device's loss.
    :param param_update_builder_fun: Called as
        `param_update_builder_fun(model_helper_obj)` once per device, after
        the gradient operators have been added.
    :param block_learning_rate: Scale applied to the difference between the
        averaged parameters and the global parameters.
    :param block_momentum: Scale applied to the `<param>_v` blobs. Defaults
        to `1.0 - 1.0 / len(devices)`.
    :param devices: Sequence of GPU ids. Defaults to every CUDA device
        reported by `workspace.NumCudaDevices()` at call time.
    :param net_type: Value assigned to the `type` field of each net proto.
    :param master_gpu: GPU id that holds the global model blobs. Defaults to
        `devices[0]`.
    :param optimize_gradient_memory: If true, call
        `_OptimizeGradientMemorySimple` on the model.
    :param reset_momentum_sgd: If true, the global update net also refills
        with zeros the second input blob of every `MomentumSGDUpdate`
        operator found in the training net.
    :raises ValueError: If `devices` is empty.
    '''
    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Resolve the default at call time: a `range(...)` default in the
    # signature would query the CUDA runtime once, when the module is
    # imported, and freeze that answer for every later call.
    if devices is None:
        devices = list(range(0, workspace.NumCudaDevices()))
    if len(devices) == 0:
        raise ValueError(
            "Parallelize_GPU_BMUF needs at least one device, got none"
        )

    if master_gpu is None:
        master_gpu = devices[0]

    model_helper_obj._devices = devices
    master_gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, master_gpu)

    num_workers = len(devices)
    num_worker_threads = 4 * num_workers
    loss_scale = 1.0 / num_workers
    if block_momentum is None:
        block_momentum = 1.0 - 1.0 / num_workers

    model_helper_obj.net.Proto().num_workers = num_worker_threads
    model_helper_obj.net.Proto().type = net_type

    # A net for initializing global model parameters. It is called once in
    # the same step as net parameters initialization.
    init_net = core.Net('global_model_init')
    init_net.Proto().type = net_type
    init_net.Proto().num_workers = num_worker_threads
    model_helper_obj._global_model_init_net = init_net

    # A net for computing final parameter updates. It will run once after
    # running net (local models updates) for `num_local_iterations` times.
    update_net = core.Net('global_model')
    update_net.Proto().type = net_type
    update_net.Proto().num_workers = num_worker_threads
    model_helper_obj._global_model_param_updates_net = update_net

    def _v(param):
        # Name of the block-momentum blob that accompanies `param`.
        return "{}_v".format(param)

    def _g(param):
        # Name of the global-model copy of `param`.
        return "{}_g".format(param)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)
    model_helper_obj._losses_by_gpu = {}

    def _InitializeModels(gpu_id):
        input_builder_fun(model_helper_obj)
        loss = forward_pass_builder_fun(model_helper_obj, loss_scale)
        model_helper_obj._losses_by_gpu[gpu_id] = loss
    _ForEachGPU(devices, _InitializeModels, scoped=True)

    grouped_blobs = _GroupByDevice(
        devices, model_helper_obj.params, non_datapar_params
    )
    model_helper_obj._device_grouped_blobs = grouped_blobs

    # Store a real list: on Python 3 `dict.keys()` is a live view that is
    # not indexable and changes whenever the dict does.
    model_helper_obj._param_names = list(grouped_blobs.keys())

    _AddGradientOperators(
        devices, model_helper_obj, model_helper_obj._losses_by_gpu
    )

    _InferBlobDevice(model_helper_obj)

    def _InitializeParamUpdate(gpu_id):
        param_update_builder_fun(model_helper_obj)
    _ForEachGPU(devices, _InitializeParamUpdate, scoped=True)

    # (Step-0) Initialize momentum parameters on master GPU:
    # param_v is zero-filled and param_g starts as a copy of param.
    for param_name in grouped_blobs:
        param = grouped_blobs[param_name][master_gpu]
        with core.DeviceScope(master_gpu_opt):
            init_net.ConstantFill(param, _v(param), value=0.0)
            init_net.Copy(param, _g(param))

    # (Step-1) Update models for num_local_iterations.

    # (Step-2) Compute post-local-updates average of the params.
    # Sum model params across GPUs.
    # NOTE(review): presumably _AllReduce leaves the sum in the master GPU's
    # param blob, since Step-3 divides that blob by num_workers - confirm.
    for param_name in grouped_blobs:
        with core.DeviceScope(master_gpu_opt):
            _AllReduce(devices, model_helper_obj, update_net, param_name)

    # (Step-3) Update momentum params, where param_g is the global model:
    # param_avg = param / num_workers
    # param_v = block_momentum * param_v
    #           + block_learning_rate * (param_avg - param_g)
    # param_g = param_g + param_v
    # param = param_g
    for param_name in grouped_blobs:
        param = grouped_blobs[param_name][master_gpu]
        with core.DeviceScope(master_gpu_opt):
            # TODO(ataei) : Stop building the graph here to get model average ?
            update_net.Scale(param, param, scale=1.0 / num_workers)
            update_net.Sub([param, _g(param)], param)
            update_net.Scale(param, param, scale=block_learning_rate)
            update_net.Scale(_v(param), _v(param), scale=block_momentum)
            update_net.Add([_v(param), param], _v(param))
            update_net.Add([_g(param), _v(param)], _g(param))
            update_net.Copy(_g(param), param)
            _Broadcast(devices, model_helper_obj, update_net, param_name)

    # Reset momentum-SGD parameters
    if reset_momentum_sgd:
        momentum_ops = [op for op in model_helper_obj.net.Proto().op
                        if op.type == 'MomentumSGDUpdate']
        for op in momentum_ops:
            # The second input of each MomentumSGDUpdate op is refilled with
            # zeros, on the device that op runs on.
            momentum_blob = op.input[1]
            with core.DeviceScope(op.device_option):
                update_net.ConstantFill(
                    [momentum_blob], momentum_blob, value=0.0
                )

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(
            model_helper_obj, model_helper_obj._losses_by_gpu, devices
        )

    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
        init_net
    ]
    # NOTE(review): the (net, 1) tuple presumably tells the runner to execute
    # the global update net once per round - confirm against the consumer of
    # _data_parallel_model_nets.
    model_helper_obj._data_parallel_model_nets = [
        model_helper_obj.net,
        (update_net, 1)
    ]