def load_save(self, src_device_type, src_gpu_id, dst_device_type, dst_gpu_id):
    workspace.ResetWorkspace()
    dtypes = [
        np.float16, np.float32, np.float64, np.bool_, np.int8, np.int16,
        np.int32, np.int64, np.uint8, np.uint16,
    ]
    arrays = [
        np.random.permutation(6).reshape(2, 3).astype(T) for T in dtypes
    ]
    assume(core.IsGPUDeviceType(src_device_type) or src_gpu_id == 0)
    assume(core.IsGPUDeviceType(dst_device_type) or dst_gpu_id == 0)
    src_device_option = core.DeviceOption(src_device_type, src_gpu_id)
    dst_device_option = core.DeviceOption(dst_device_type, dst_gpu_id)

    for i, arr in enumerate(arrays):
        self.assertTrue(workspace.FeedBlob(str(i), arr, src_device_option))
        self.assertTrue(workspace.HasBlob(str(i)))

    # Save the blobs to a local db.
    tmp_folder = self.make_tempdir()
    op = core.CreateOperator(
        "Save",
        [str(i) for i in range(len(arrays))], [],
        absolute_path=1,
        db=str(tmp_folder / "db"), db_type=self._db_type)
    self.assertTrue(workspace.RunOperatorOnce(op))

    # Reset the workspace so that anything we load is surely loaded
    # from the serialized proto.
    workspace.ResetWorkspace()
    self.assertEqual(len(workspace.Blobs()), 0)

    def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll):
        """A helper subfunction to test keep and not keep."""
        op = core.CreateOperator(
            "Load",
            [], blobs,
            absolute_path=1,
            db=str(tmp_folder / "db"), db_type=self._db_type,
            device_option=dst_device_option,
            keep_device=keep_device,
            load_all=loadAll)
        self.assertTrue(workspace.RunOperatorOnce(op))
        for i, arr in enumerate(arrays):
            self.assertTrue(workspace.HasBlob(str(i)))
            fetched = workspace.FetchBlob(str(i))
            self.assertEqual(fetched.dtype, arr.dtype)
            np.testing.assert_array_equal(workspace.FetchBlob(str(i)), arr)
            proto = caffe2_pb2.BlobProto()
            proto.ParseFromString(workspace.SerializeBlob(str(i)))
            self.assertTrue(proto.HasField('tensor'))
            self.assertEqual(proto.tensor.device_detail.device_type,
                             device_type)
            if core.IsGPUDeviceType(device_type):
                self.assertEqual(proto.tensor.device_detail.device_id,
                                 gpu_id)

    blobs = [str(i) for i in range(len(arrays))]
    # Load using the device option stored in the proto, i.e.
    # src_device_option.
    _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
    # Load again, but this time load into dst_device_option.
    _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)
    # Load back into src_device_option to see if both paths are able
    # to reallocate memory.
    _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
    # Reset the workspace, and load directly into dst_device_option.
    workspace.ResetWorkspace()
    _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)
    # Test load_all, which loads every blob in the db into the workspace.
    workspace.ResetWorkspace()
    _LoadTest(1, src_device_type, src_gpu_id, [], 1)
    # Load again, making sure that the overwrite functionality works.
    _LoadTest(1, src_device_type, src_gpu_id, [], 1)
    # Load again with a different device.
    _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
    workspace.ResetWorkspace()
    _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
    workspace.ResetWorkspace()
    _LoadTest(1, src_device_type, src_gpu_id, blobs, 1)
    workspace.ResetWorkspace()
    _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 1)
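# A minimal sketch of the Save/Load round trip the test above exercises,
# stripped of the test harness. The "minidb" db_type and the temporary
# directory are illustrative assumptions, not something the test itself
# mandates; the operator arguments mirror those used in the test.
import os
import tempfile

import numpy as np
from caffe2.python import core, workspace

tmp_dir = tempfile.mkdtemp()
workspace.FeedBlob("x", np.arange(6, dtype=np.float32).reshape(2, 3))

# Serialize the blob into an on-disk db.
save_op = core.CreateOperator(
    "Save", ["x"], [],
    absolute_path=1, db=os.path.join(tmp_dir, "db"), db_type="minidb")
workspace.RunOperatorOnce(save_op)

# Wipe the workspace, then restore every blob from the db.
workspace.ResetWorkspace()
load_op = core.CreateOperator(
    "Load", [], [],
    absolute_path=1, db=os.path.join(tmp_dir, "db"), db_type="minidb",
    load_all=1)
workspace.RunOperatorOnce(load_op)
print(workspace.FetchBlob("x"))  # the original 2x3 array, round-tripped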
def test_sum_reduce_fp16(self, gc, dc):
    assume(core.IsGPUDeviceType(gc.device_type))

    # Set broadcast and no axis, i.e. broadcasting the last dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(4, 5).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, device_option=gc)

    def ref_op(X, Y):
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=0)
        return [res]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)

    # Set broadcast with axis=0, i.e. broadcasting the first dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(2, 3).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0)

    def ref_op(X, Y):
        res = np.sum(X, axis=3)
        res = np.sum(res, axis=2)
        return [res]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)

    # Broadcasting intermediate dimensions.
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(3, 4).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1)

    def ref_op(X, Y):
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        return [res]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)

    # Broadcasting with single-element dimensions at both ends.
    X = np.random.rand(2, 3, 4, 5).astype(np.float16)
    Y = np.random.rand(1, 3, 4, 1).astype(np.float16)
    op = core.CreateOperator(
        "SumReduceLike", ["X", "Y"], "out", broadcast=1)

    def ref_op(X, Y):
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        return [res.reshape(Y.shape)]

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, Y],
        reference=ref_op,
        threshold=1e-3)
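# SumReduceLike reduces X down to the shape of Y by summing over the
# dimensions Y does not have; each ref_op closure above spells that out
# for one alignment. A numpy-only sketch of the first case (Y matches the
# trailing dims of X, no axis argument), shown here as an assumption-free
# restatement of the reference math rather than the operator itself:
import numpy as np

X = np.random.rand(2, 3, 4, 5).astype(np.float16)
Y = np.random.rand(4, 5).astype(np.float16)
# Sum away the two leading axes of X so the result takes Y's shape.
out = X.sum(axis=(0, 1))
assert out.shape == Y.shape  # (4, 5)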
def _run_test(self, n, m, k, transposed, multi_dim, dtype, engine, gc, dc):
    if dtype == np.float16:
        # fp16 is only supported with CUDA/HIP.
        assume(core.IsGPUDeviceType(gc.device_type))
        dc = [d for d in dc if core.IsGPUDeviceType(d.device_type)]

    if engine == 'TENSORCORE':
        # TensorCore only makes sense with CUDA.
        assume(gc.device_type == caffe2_pb2.CUDA)
        # Pad the dimensions to multiples of 8 so that the TensorCore
        # kernels can be called.
        m *= 8
        k *= 8
        n *= 8

    X = np.random.rand(m, k).astype(dtype) - 0.5
    if multi_dim:
        if transposed:
            W = np.random.rand(k, n, 1, 1).astype(dtype) - 0.5
        else:
            W = np.random.rand(n, k, 1, 1).astype(dtype) - 0.5
    else:
        if transposed:
            W = np.random.rand(k, n).astype(dtype) - 0.5
        else:
            W = np.random.rand(n, k).astype(dtype) - 0.5
    b = np.random.rand(n).astype(dtype) - 0.5

    def fc_op(X, W, b):
        return [np.dot(X, W.reshape(n, k).transpose()) + b.reshape(n)]

    def fc_transposed_op(X, W, b):
        return [np.dot(X, W.reshape(k, n)) + b.reshape(n)]

    op = core.CreateOperator(
        'FCTransposed' if transposed else 'FC',
        ['X', 'W', 'b'],
        'out',
        engine=engine,
    )

    if dtype == np.float16 and core.IsGPUDeviceType(gc.device_type):
        a = caffe2_pb2.Argument()
        a.i = 1
        a.name = "float16_compute"
        op.arg.extend([a])

    # Check against the numpy reference.
    # assertReferenceChecks is flaky on ROCm with a threshold of 1e-4 for
    # fp16, so relax it to 1e-3 there.
    threshold = (
        1e-3 if (gc.device_type == caffe2_pb2.HIP and dtype == np.float16)
        else 1e-4
    )
    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, W, b],
        reference=fc_transposed_op if transposed else fc_op,
        threshold=threshold)
    # Check over multiple devices.
    self.assertDeviceChecks(dc, op, [X, W, b], [0])

    # Gradient checks: fp16 needs a much looser threshold and a larger step.
    threshold = 0.5 if dtype == np.float16 else 0.005
    stepsize = 0.5 if dtype == np.float16 else 0.05
    for i in range(3):
        self.assertGradientChecks(gc, op, [X, W, b], i, [0],
                                  threshold=threshold, stepsize=stepsize)
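# The reference math the fc_op / fc_transposed_op closures above encode,
# in isolation: FC computes out = X . W^T + b with X of shape (m, k),
# W of shape (n, k), and b of shape (n); FCTransposed stores W as (k, n)
# and skips the transpose. A numpy-only sketch with illustrative shapes
# (not tied to the hypothesis-generated sizes the test uses):
import numpy as np

m, k, n = 4, 3, 2
X = np.random.rand(m, k).astype(np.float32)
W = np.random.rand(n, k).astype(np.float32)   # FC weight layout: (n, k)
b = np.random.rand(n).astype(np.float32)

out_fc = X.dot(W.T) + b        # shape (m, n)

Wt = W.T.copy()                # FCTransposed weight layout: (k, n)
out_fct = X.dot(Wt) + b

# Both layouts describe the same linear map, so the outputs agree.
assert np.allclose(out_fc, out_fct)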