import numpy as np
from caffe2.python import core, device_checker, gradient_checker, workspace
from caffe2.proto import caffe2_pb2, caffe2_legacy_pb2
import sys
import unittest

if workspace.has_gpu_support and workspace.NumberOfGPUs() > 0:
    gpu_device_option = caffe2_pb2.DeviceOption()
    gpu_device_option.device_type = caffe2_pb2.CUDA
    cpu_device_option = caffe2_pb2.DeviceOption()
    device_checker = device_checker.DeviceChecker(
        0.01, [gpu_device_option, cpu_device_option])
    gradient_checkers = [
        gradient_checker.GradientChecker(
            0.005, 0.05, gpu_device_option, "gpu_checker_ws"),
        gradient_checker.GradientChecker(
            0.01, 0.05, cpu_device_option, "cpu_checker_ws"),
    ]
else:
    cpu_device_option = caffe2_pb2.DeviceOption()
    device_checker = device_checker.DeviceChecker(0.01, [cpu_device_option])
    gradient_checkers = [
        gradient_checker.GradientChecker(
            0.01, 0.05, cpu_device_option, "cpu_checker_ws")
    ]


class TestConvLegacyPooling(unittest.TestCase):
    def setUp(self):
    def testAllreduceSingleGPU(self):
        for i in range(workspace.NumberOfGPUs()):
            self.RunningAllreduceWithGPUs([i], muji.Allreduce)
    def testAllreduceFallback(self):
        self.RunningAllreduceWithGPUs(
            range(workspace.NumberOfGPUs()), muji.AllreduceFallback)
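    # The two tests above call a shared helper, RunningAllreduceWithGPUs,
    # that is not included in this fragment. A minimal sketch of what such a
    # helper might look like, assuming muji.OnGPU() for device placement and
    # a "_reduced" suffix on the output blobs (illustrative, not the verbatim
    # original):
    def RunningAllreduceWithGPUs(self, gpu_ids, allreduce_function):
        net = core.Net("mujitest")
        # Fill one blob per GPU with a distinct constant value.
        for gpu_id in gpu_ids:
            net.ConstantFill(
                [], "testblob_gpu_" + str(gpu_id),
                shape=[1, 2, 3, 4], value=float(gpu_id + 1),
                device_option=muji.OnGPU(gpu_id))
        # Let the allreduce implementation under test wire up the reduction.
        allreduce_function(
            net, ["testblob_gpu_" + str(i) for i in gpu_ids],
            "_reduced", gpu_ids)
        workspace.RunNetOnce(net.Proto().SerializeToString())
        # Every reduced blob should hold the sum of all per-GPU constants.
        target = sum(i + 1 for i in gpu_ids)
        for gpu_id in gpu_ids:
            reduced = workspace.FetchBlob(
                "testblob_gpu_" + str(gpu_id) + "_reduced")
            np.testing.assert_array_equal(reduced, target)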
import os
import unittest

import numpy as np
from hypothesis import assume, given
import hypothesis.strategies as st

from caffe2.python import core, workspace, muji
import caffe2.python.hypothesis_test_util as hu


class NCCLOpsTest(hu.HypothesisTestCase):
    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000),
           in_place=st.booleans())
    def test_nccl_allreduce(self, n, m, in_place):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        prefix = "" if in_place else "o"
        outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allreduce(*args):
            assert len(args) == n
            output = np.sum(args, axis=0)
            return [output for _ in range(n)]

        self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            allreduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumberOfGPUs() - 1))
    def test_nccl_broadcast(self, n, m, root):
        assume(root < n)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def broadcast(*args):
            assert len(args) == n
            return [args[root] for _ in range(n)]

        self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            broadcast, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumberOfGPUs() - 1),
           in_place=st.booleans())
    def test_nccl_reduce(self, n, m, root, in_place):
        assume(root < n)
        assume(in_place is False or root == 0)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator(
            "NCCLReduce", inputs,
            inputs[root] if in_place else b"o", root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def reduce(*args):
            assert len(args) == n
            return [np.sum(args, axis=0)]

        self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            reduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_allgather(self, n, m):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllGather", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allgather(*args):
            assert len(args) == n
            return [np.stack(args, axis=0) for _ in range(n)]

        self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            allgather, input_device_options)

    @unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
    def test_timings(self):
        for n in range(2, workspace.NumberOfGPUs()):
            for in_place in [False, True]:
                xs = [np.random.randn(int(1e7)).astype(np.float32)
                      for i in range(n)]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    workspace.FeedBlob(
                        inputs[i], xs[i], gpu_device(i).SerializeToString())
                workspace.RunNetOnce(net.Proto().SerializeToString())
                net_time = benchmark(net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(vanilla)
                print("Speedup for NCCL: {:.2f}".format(
                    vanilla_time / net_time))
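# The tests above also rely on two module-level helpers, gpu_device() and
# benchmark(), that are not included in this fragment. A minimal sketch of
# what they could look like (assumptions, not the verbatim originals):
import time

from caffe2.proto import caffe2_pb2


def gpu_device(i):
    # Build a DeviceOption that pins a blob to GPU i.
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CUDA
    device_option.cuda_gpu_id = i
    return device_option


def benchmark(net, warmups=5, iters=100):
    # Warm up, then report the average wall-clock time (seconds) per run.
    workspace.CreateNet(net)
    for _ in range(warmups):
        workspace.RunNet(net.Proto().name)
    start = time.time()
    for _ in range(iters):
        workspace.RunNet(net.Proto().name)
    return (time.time() - start) / iters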
    def testGetCudaPeerAccessPattern(self):
        pattern = workspace.GetCudaPeerAccessPattern()
        self.assertEqual(type(pattern), np.ndarray)
        self.assertEqual(pattern.ndim, 2)
        self.assertEqual(pattern.shape[0], pattern.shape[1])
        self.assertEqual(pattern.shape[0], workspace.NumberOfGPUs())
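# GetCudaPeerAccessPattern() returns a square boolean matrix whose [i, j]
# entry says whether GPU i can directly access GPU j's memory. A hedged
# sketch of how a caller might consume it to choose between a direct
# allreduce and a staged fallback (the helper name gpus_fully_connected is
# made up for illustration; it is not part of the Caffe2 API):
def gpus_fully_connected(gpu_ids):
    pattern = workspace.GetCudaPeerAccessPattern()
    return all(pattern[i, j] for i in gpu_ids for j in gpu_ids)

# Example use:
#   if gpus_fully_connected(list(range(workspace.NumberOfGPUs()))):
#       muji.Allreduce(net, blobs)
#   else:
#       muji.AllreduceFallback(net, blobs)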
import numpy as np
import unittest

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, test_util


@unittest.skipIf(not workspace.has_gpu_support
                 or workspace.NumberOfGPUs() == 0,
                 "No gpu support.")
class TestWorkspaceGPU(test_util.TestCase):

    def setUp(self):
        workspace.ResetWorkspace()
        self.net = core.Net("test-net")
        self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0)
        self.net.RunAllOnGPU()

    def testFetchBlobGPU(self):
        self.assertEqual(
            workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
        fetched = workspace.FetchBlob("testblob")
        # check if fetched is correct.
        self.assertEqual(fetched.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched, 1.0)
        fetched[:] = 2.0
        self.assertEqual(workspace.FeedBlob("testblob", fetched), True)
        fetched_again = workspace.FetchBlob("testblob")
        self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched_again, 2.0)

    def testDefaultGPUID(self):
        self.assertEqual(workspace.SetDefaultGPUID(0), True)
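# FeedBlob also accepts a device option, which places the blob directly on a
# chosen GPU instead of the CPU. A small hedged sketch (the blob name
# "x_gpu0" and the shape are arbitrary illustrations); the serialized proto
# form mirrors how the NCCL benchmark above feeds per-GPU inputs:
def feed_blob_on_gpu0():
    device_opt = caffe2_pb2.DeviceOption()
    device_opt.device_type = caffe2_pb2.CUDA
    device_opt.cuda_gpu_id = 0
    x = np.random.rand(2, 3).astype(np.float32)
    workspace.FeedBlob("x_gpu0", x, device_opt.SerializeToString())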
inputs["data"] = np.random.rand(4, 227, 227, 3).astype(np.float32) inputs["label"] = np.array([1, 2, 3, 4]).astype(np.int32) cpu_device = caffe2_pb2.DeviceOption() cpu_device.device_type = caffe2_pb2.CPU gpu_device = caffe2_pb2.DeviceOption() gpu_device.device_type = caffe2_pb2.CUDA checker = device_checker.DeviceChecker(1e-2, [cpu_device, gpu_device]) ret = checker.CheckNet( model.net.Proto(), inputs, # The indices sometimes may be sensitive to small numerical # differences in the input, so we ignore checking them. ignore=['_pool1_idx', '_pool2_idx', '_pool5_idx'] ) self.assertEqual(ret, True) def testMiniAlexNet(self): self._testMiniAlexNet("NCHW") self._testMiniAlexNet("NHWC") if __name__ == '__main__': if not workspace.has_gpu_support: print('No GPU support. Skipping gpu test.') elif workspace.NumberOfGPUs() == 0: print('No GPU device. Skipping gpu test.') else: unittest.main()