def relu_test(self, inputs, gc, dc):
    """Check the onnxified "Relu" net against the "ReluFakeFp16" reference op.

    Args:
        inputs: sequence whose first element is the array fed as blob "X".
        gc: not used by this method.
        dc: not used by this method.

    Raises:
        AssertionError: if the "Relu" net is not turned into exactly one
            "Onnxifi" op, or if the two outputs are not ``np.allclose``.
    """
    x_data = inputs[0]
    # First dimension is the batch size
    print(x_data.shape)

    # Net that is handed to the onnxifi transformation.
    glow_net = caffe2_pb2.NetDef()
    glow_net.name = "pred"
    glow_net.external_input.extend(["X"])
    glow_net.external_output.append("Y")
    relu_op = core.CreateOperator("Relu", ["X"], ["Y"])
    glow_net.op.add().CopyFrom(relu_op)

    # Reference net; it writes to "Y_ref" so both results can coexist.
    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(["X"])
    ref_net.external_output.append("Y_ref")
    fake_relu_op = core.CreateOperator("ReluFakeFp16", ["X"], ["Y_ref"])
    ref_net.op.add().CopyFrom(fake_relu_op)

    onnxified_net = onnxifi_caffe2_net(
        glow_net,
        {"X": x_data.shape},
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    print(onnxified_net)
    onnxifi_op_count = sum(
        1 for op in onnxified_net.op if op.type == "Onnxifi"
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("X", x_data)
    workspace.CreateNet(ref_net)
    workspace.CreateNet(onnxified_net)
    workspace.FeedBlob("X", x_data)

    # Run caffe2 net
    workspace.RunNet(ref_net.name)
    y_ref = workspace.FetchBlob("Y_ref")

    # Run Glow net
    workspace.RunNet(onnxified_net.name)
    y_glow = workspace.FetchBlob("Y")

    # Results should be identical since we are comparing with the C2 emulation
    if np.allclose(y_ref, y_glow):
        return

    rel_err = np.abs((y_glow - y_ref) / (y_ref + kEpsilon))
    print_test_debug_info(
        "Relu",
        {
            "X": x_data,
            "Y_glow": y_glow,
            "Y_c2": y_ref,
            "diff": rel_err,
        },
    )
    assert 0
def _test_unary_op(self, opname):
    """Compare onnxified unary op `opname` with `opname + 'FakeFp16NNPI'`.

    The input is 10000 float32 values evenly spaced over [-20, 20]. On a
    mismatch the absolute difference is saved under /tmp and dumped via
    print_test_debug_info before the test fails.

    Args:
        opname: operator type name of the unary op under test.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if the two outputs are not ``np.allclose``.
    """
    workspace.ResetWorkspace()

    batch = 1
    num_points = 10000
    # NOTE(review): X is 1-D with `num_points` entries while the shape hint
    # below is (batch, num_points) -- confirm this mismatch is intended.
    x_data = np.linspace(-20, 20, num=num_points, dtype=np.float32)

    glow_net = caffe2_pb2.NetDef()
    glow_net.name = "pred"
    glow_net.external_input.append("X")
    glow_net.external_output.append("Y")
    glow_op = core.CreateOperator(opname, ["X"], ["Y"])
    glow_net.op.add().CopyFrom(glow_op)

    # The reference net writes to the same "Y" blob, so each result must be
    # fetched before the other net runs.
    reference_net = caffe2_pb2.NetDef()
    reference_net.name = "ref"
    reference_net.external_input.append("X")
    reference_net.external_output.append("Y")
    reference_op = core.CreateOperator(opname + "FakeFp16NNPI", ["X"], ["Y"])
    reference_net.op.add().CopyFrom(reference_op)

    onnxified_net = onnxifi_caffe2_net(
        glow_net,
        {"X": (batch, num_points)},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    print(onnxified_net)
    onnxifi_op_count = sum(
        1 for op in onnxified_net.op if op.type == "Onnxifi"
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("X", x_data)
    workspace.CreateNet(reference_net)
    workspace.CreateNet(onnxified_net)

    # Run Glow net
    workspace.RunNet(onnxified_net.name)
    y_glow = workspace.FetchBlob("Y")

    # Run caffe2 reference net
    workspace.RunNet(reference_net.name)
    y_ref = workspace.FetchBlob("Y")

    if np.allclose(y_ref, y_glow):
        return

    abs_err = np.abs(y_ref - y_glow)
    np.save("/tmp/" + opname + "diff", abs_err)
    print_test_debug_info(
        opname,
        {
            "X": x_data,
            "Y_c2": y_ref,
            "Y_glow": y_glow,
            "diff": abs_err,
            "maxdiff": np.max(abs_err),
        },
    )
    assert 0
def test_slws_fused_8bit_rowwise_all_same(self):
    """Test SparseLengthsWeightedSumFused8BitRowwise on a one-row table.

    The table has a single row whose entries are all 0.1, every index is 0,
    and the segment count, segment lengths and weights are random. The
    onnxified net is compared against
    SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if the two outputs are not ``np.allclose``.
    """
    # The seed is time based, so print it and report it on failure;
    # otherwise a failing run cannot be reproduced.
    # Replace with a constant for predictable debugging.
    seed = int(time.time())
    print(seed)
    np.random.seed(seed)
    workspace.ResetWorkspace()

    n = 1
    m = 2
    data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1

    max_segments = 5
    max_segment_length = 200
    num_lengths = np.random.randint(1, max_segments + 1)
    # number of segments to run
    lengths = np.random.randint(
        0, max_segment_length + 1, size=num_lengths
    ).astype(np.int32)
    num_indices = np.sum(lengths)
    # All indices point at row 0, the only row in `data`.
    indices = np.zeros(num_indices, dtype=np.int64)
    weights = np.random.uniform(
        low=-0.5, high=0.5, size=[len(indices)]
    ).astype(np.float32)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"]
    )
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        )
    )

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"]
    )
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        )
    )

    # Produce the "quantized_data" blob from the float table.
    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator(
            "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
        )
    )

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=max_segments,
        max_seq_size=max_segments * max_segment_length,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    # Both nets write blob "Y"; fetch each result before running the other.
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob("Y")

    if not np.allclose(Y_c2, Y_glow):
        print_test_debug_info(
            "slws_fused_8bit_rowwise",
            {
                "seed": seed,
                "indices": indices,
                "data": data,
                "lengths": lengths,
                "weights": weights,
                "Y_c2": Y_c2,
                "Y_glow": Y_glow,
                "diff": Y_glow - Y_c2,
                "rowwise_diff": (Y_glow - Y_c2)[:, 0],
            },
        )
        assert 0
def test_slws_fused_8bit_rowwise_acc32_nnpi(self):
    """Test SparseLengthsWeightedSumFused8BitRowwise with fp16 flags off.

    The onnxified net is compared against
    SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI on a random
    20000 x 6 table. Any non-zero relative error fails the test.

    Raises:
        AssertionError: if any output row has a non-zero relative error.
    """
    workspace.GlobalInit(
        [
            "caffe2",
            "--glow_global_fp16=0",
            "--glow_global_fused_scale_offset_fp16=0",
            "--glow_global_force_sls_fp16_accum=0",
        ]
    )
    # Replace with a constant for predictable debugging.
    seed = int(time.time() * 1000) % 2 ** 16
    print(seed)
    np.random.seed(seed)
    workspace.ResetWorkspace()

    n = 20000
    DIM = 6
    data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32)

    max_segments = 200
    max_segment_length = 200
    # NOTE(review): the lower bound is 0, so a run may have no segments at
    # all -- confirm the empty case is meant to be covered.
    num_lengths = np.random.randint(0, max_segments + 1)
    # number of segments to run
    lengths = np.random.randint(
        2, max_segment_length + 1, size=num_lengths
    ).astype(np.int32)
    num_indices = np.sum(lengths)
    indices = np.random.randint(
        low=0, high=n, size=num_indices, dtype=np.int64
    )
    weights = np.random.uniform(
        low=0.01, high=0.5, size=[len(indices)]
    ).astype(np.float32)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"]
    )
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        )
    )

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"]
    )
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        )
    )

    # Produce the "quantized_data" blob from the float table.
    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator(
            "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
        )
    )
    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=max_segments,
        max_seq_size=max_segments * max_segment_length,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    # Both nets write blob "Y"; fetch each result before running the other.
    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    max_err = np.max(diff, axis=1)
    num_offenders = (max_err > 0).sum()
    if num_offenders > 0:
        print_test_debug_info(
            "test_slws_fused_8bit_rowwise_acc32_nnpi",
            {
                # Reported so the failing run can be reproduced.
                "seed": seed,
                "indices": indices,
                "data": data.shape,
                "lengths": lengths,
                "weights": weights,
                "Y_glow": Y_glow,
                "Y_ref": Y_ref,
                "diff": diff,
                "rowwise_diff": max_err,
            },
        )
        assert 0
def test_small_sls_acc32(self):
    """Run the fused 8-bit weighted SLS test on a tiny 2 x 3 table.

    A single segment covers both rows. The onnxified net is compared with
    SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI, and any non-zero
    relative error fails the test.

    Raises:
        AssertionError: if any output row has a non-zero relative error.
    """
    glow_flags = [
        "caffe2",
        "--glow_global_fp16=0",
        "--glow_global_fused_scale_offset_fp16=0",
        "--glow_global_force_sls_fp16_accum=0",
    ]
    workspace.GlobalInit(glow_flags)

    seed = int(time.time() * 1000) % 2 ** 16
    print(seed)
    np.random.seed(seed)
    workspace.ResetWorkspace()

    num_rows = 2
    row_width = 3
    table = 4 * (
        np.random.random_sample((num_rows, row_width)) + 1
    ).astype(np.float32)
    # One segment that spans every row of the table, in order.
    segment_lengths = np.array([num_rows], dtype=np.int32)
    row_indices = np.array(range(num_rows), dtype=np.int64)
    row_weights = np.random.uniform(
        low=0.01, high=0.5, size=[num_rows]
    ).astype(np.float32)

    sls_inputs = ["quantized_data", "weights", "indices", "lengths"]

    glow_net = caffe2_pb2.NetDef()
    glow_net.name = "pred"
    glow_net.external_input.extend(sls_inputs)
    glow_net.external_output.append("Y")
    glow_op = core.CreateOperator(
        "SparseLengthsWeightedSumFused8BitRowwise", sls_inputs, ["Y"]
    )
    glow_net.op.add().CopyFrom(glow_op)

    reference_net = caffe2_pb2.NetDef()
    reference_net.name = "ref"
    reference_net.external_input.extend(sls_inputs)
    reference_net.external_output.append("Y")
    reference_op = core.CreateOperator(
        "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
        sls_inputs,
        ["Y"],
    )
    reference_net.op.add().CopyFrom(reference_op)

    # Produce the "quantized_data" blob from the float table.
    workspace.FeedBlob("data", table)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)
    quantized_table = workspace.FetchBlob("quantized_data")

    onnxified_net = onnxifi_caffe2_net(
        glow_net,
        {},
        max_batch_size=1,
        max_seq_size=num_rows,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    workspace.FeedBlob("indices", row_indices)
    workspace.FeedBlob("lengths", segment_lengths)
    workspace.FeedBlob("weights", row_weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(reference_net)

    # Both nets write blob "Y"; fetch each result before running the other.
    workspace.RunNet(onnxified_net.name)
    y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(reference_net.name)
    y_ref = workspace.FetchBlob("Y")

    rel_err = np.abs((y_ref - y_glow) / (y_ref + 1e-8))
    row_max_err = np.max(rel_err, axis=1)
    if (row_max_err > 0).sum() <= 0:
        return

    np.set_printoptions(precision=12)
    # Print both results rounded through float16.
    print(
        "ref",
        y_ref.astype(np.float16).astype(np.float32),
        "glow",
        y_glow.astype(np.float16).astype(np.float32),
    )
    print_test_debug_info(
        "test_small_sls_acc32",
        {
            "seed": seed,
            "indices": row_indices,
            "data": table,
            "quantized_data": quantized_table,
            "lengths": segment_lengths,
            "weights": row_weights,
            "Y_glow": y_glow,
            "Y_ref": y_ref,
            "diff": rel_err,
            "rowwise_diff": row_max_err,
        },
    )
    assert 0
def Test_SLS_NonQuantized_fp16(self):
    """Compare non-quantized SparseLengthsSum across three implementations.

    Checks that SparseLengthsSumFakeFP16AccFP16 is within 1e-3 (atol and
    rtol) of plain SparseLengthsSum, then that the onnxified
    SparseLengthsSum net is ``np.allclose`` to the fake-fp16 result.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if either comparison fails.
    """
    num_rows = 20000
    row_width = 64
    table = (
        4 * np.random.random_sample((num_rows, row_width)) + 1
    ).astype(np.float32)
    # 12 random row indices split into three segments of four.
    row_indices = np.random.randint(0, num_rows, size=12).astype(np.int64)
    segment_lengths = np.asarray([4, 4, 4]).astype(np.int32)
    workspace.FeedBlob("D", table)

    sls_inputs = ["D", "I", "L"]

    fp32_ref_net = core.Net("test_ref_c2")
    fp32_ref_net.SparseLengthsSum(sls_inputs, "ref_out")
    fp32_ref_net.Proto().external_input.extend(sls_inputs)
    fp32_ref_net.Proto().external_output.extend(["ref_out"])

    fake_fp16_net = core.Net("test_fp16_c2")
    fake_fp16_net.SparseLengthsSumFakeFP16AccFP16(sls_inputs, "fp16_out")

    glow_net = caffe2_pb2.NetDef()
    glow_net.name = "pred"
    glow_net.external_input.extend(sls_inputs)
    glow_net.external_output.append("glow_out")
    glow_op = core.CreateOperator("SparseLengthsSum", sls_inputs, ["glow_out"])
    glow_net.op.add().CopyFrom(glow_op)

    onnxified_net = onnxifi_caffe2_net(
        glow_net,
        {},
        max_batch_size=3,
        max_seq_size=16,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    onnxifi_op_count = sum(
        1 for op in onnxified_net.op if op.type == "Onnxifi"
    )
    print(onnxified_net)
    np.testing.assert_equal(onnxifi_op_count, 1)

    workspace.FeedBlob("I", row_indices)
    workspace.FeedBlob("L", segment_lengths)

    workspace.RunNetOnce(fp32_ref_net)
    fp32_out = workspace.FetchBlob("ref_out")

    workspace.RunNetOnce(fake_fp16_net)
    fake_fp16_out = workspace.FetchBlob("fp16_out")

    np.testing.assert_allclose(fake_fp16_out, fp32_out, atol=1e-3, rtol=1e-3)

    workspace.RunNetOnce(onnxified_net)
    glow_out = workspace.FetchBlob("glow_out")

    if np.allclose(glow_out, fake_fp16_out):
        return

    abs_err = np.abs(glow_out - fake_fp16_out)
    print_test_debug_info(
        "sls",
        {
            "indices": row_indices,
            "data": table,
            "lengths": segment_lengths,
            "Y_c2": fake_fp16_out,
            "Y_glow": glow_out,
            "diff": abs_err,
            "rowwise_diff": abs_err[:, 0],
        },
    )
    assert 0
def test_bn(self):
    """Test onnxified SpatialBN against a fake-fp16 SpatialBN reference op.

    The reference operator is SpatialBNFakeLoweredFp16NNPI when
    GLOW_LOWERED_BATCHNORM is truthy, otherwise SpatialBNFakeFp16NNPI. Both
    outputs are cast to float16 before being compared with np.allclose.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if the float16-cast outputs differ.
    """
    size = 30
    input_channels = 20
    batch_size = 40
    seed = int(time.time())
    np.random.seed(seed)

    order = "NCHW"
    epsilon = 1e-3

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "scale", "bias", "mean", "var"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SpatialBN",
            ["X", "scale", "bias", "mean", "var"],
            ["Y"],
            order=order,
            is_test=True,
            epsilon=epsilon,
        )
    )

    if GLOW_LOWERED_BATCHNORM:
        refopname = "SpatialBNFakeLoweredFp16NNPI"
    else:
        refopname = "SpatialBNFakeFp16NNPI"

    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred"
    pred_net_ref.external_input.extend(
        ["X", "scale", "bias", "mean", "var"]
    )
    # The reference op writes "Y", so "Y" is the net's external output.
    # This was previously declared as the input blob "X".
    pred_net_ref.external_output.append("Y")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            refopname,
            ["X", "scale", "bias", "mean", "var"],
            ["Y"],
            order=order,
            is_test=True,
            epsilon=epsilon,
        )
    )

    scale = np.random.rand(input_channels).astype(np.float32) + 0.5
    bias = np.random.rand(input_channels).astype(np.float32) - 0.5
    mean = np.random.randn(input_channels).astype(np.float32)
    # Adding 0.5 keeps every variance at or above 0.5.
    var = np.random.rand(input_channels).astype(np.float32) + 0.5
    X = np.random.rand(
        batch_size, input_channels, size, size
    ).astype(np.float32) - 0.5

    workspace.FeedBlob("scale", scale)
    workspace.FeedBlob("bias", bias)
    workspace.FeedBlob("mean", mean)
    workspace.FeedBlob("var", var)

    # Use for reference to debug
    # Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {
            "X": [batch_size, input_channels, size, size],
            "scale": [input_channels],
            "bias": [input_channels],
            "mean": [input_channels],
            "var": [input_channels],
        },
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    # Both nets write blob "Y"; fetch each result before running the other.
    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        diff = np.abs(Y_glow - Y_c2).astype(np.float16)
        print_test_debug_info(
            "bn",
            {
                "seed": seed,
                "scale": scale,
                "bias": bias,
                "mean": mean,
                "var": var,
                # Only shapes are reported for the two outputs.
                "Y_np": Y_c2.shape,
                "Y_glow": Y_glow.shape,
                "diff": diff,
                "rowwise_diff": np.max(np.abs(diff), -1),
            },
        )
        assert 0
def test_batch_matmul(self, M, K, N, trans_a, trans_b, run_ints, gc, dc):
    """Compare onnxified BatchMatMul with BatchMatMulFP16Acc16Fake.

    Args:
        M: row count of the left operand before any transpose.
        K: shared inner dimension.
        N: column count of the right operand before any transpose.
        trans_a: passed to both ops; when truthy X's last two axes are
            swapped before use.
        trans_b: passed to both ops; when truthy Y's last two axes are
            swapped before use.
        run_ints: when truthy, operands are integer-valued floats in {1, 2}
            with a leading batch axis of 1 and results must be
            ``np.allclose``. Otherwise operands are random floats and the
            per-row relative-error check is used.
        gc: not used by this method.
        dc: not used by this method.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if the selected comparison fails.
    """
    workspace.ResetWorkspace()
    C = 0  # TODO
    # With C == 0 this is an empty list, so float operands get no batch axes.
    batch_dims = np.random.randint(
        low=1, high=3, size=C, dtype=np.int64
    ).tolist()

    if run_ints:
        X = np.random.randint(low=1, high=3, size=(1, M, K)).astype(np.float32)
    else:
        lhs_shape = batch_dims + [M, K]
        X = 100 * (np.random.rand(*lhs_shape).astype(np.float32) - 0.5)
    if trans_a:
        X = X.swapaxes(-1, -2)

    if run_ints:
        Y = np.random.randint(low=1, high=3, size=(1, K, N)).astype(np.float32)
    else:
        rhs_shape = batch_dims + [K, N]
        Y = 100 * (np.random.rand(*rhs_shape).astype(np.float32) - 0.5)
    if trans_b:
        Y = Y.swapaxes(-1, -2)

    glow_net = caffe2_pb2.NetDef()
    glow_net.name = "pred"
    glow_net.external_input.extend(["X", "Y"])
    glow_net.external_output.append("out")
    bmm_op = core.CreateOperator(
        "BatchMatMul", ["X", "Y"], "out", trans_a=trans_a, trans_b=trans_b
    )
    glow_net.op.add().CopyFrom(bmm_op)

    reference_net = core.Net("pred_net_ref")
    reference_net.BatchMatMulFP16Acc16Fake(
        ["X", "Y"], ["out"], trans_a=trans_a, trans_b=trans_b
    )

    print("dims", batch_dims, X.shape, Y.shape)
    onnxified_net = onnxifi_caffe2_net(
        glow_net,
        {"X": X.shape, "Y": Y.shape},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    onnxifi_op_count = sum(
        1 for op in onnxified_net.op if op.type == "Onnxifi"
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(reference_net)

    # Run Glow net
    workspace.RunNet(onnxified_net.name)
    out_glow = workspace.FetchBlob("out")

    # Run caffe2 net
    workspace.RunNet(reference_net)
    out_c2_fakefp16 = workspace.FetchBlob("out")

    rel_err = np.abs((out_c2_fakefp16 - out_glow) / (out_c2_fakefp16 + 1e-8))
    row_max_err = np.max(rel_err, axis=1)

    if run_ints:
        failed = not np.allclose(out_glow, out_c2_fakefp16)
    else:
        offender_count = np.count_nonzero(
            row_max_err[row_max_err > GLOW_MATMUL_RTOL]
        )
        # Find the max difference per row, if more than 10% of the rows
        # are bigger, consider it a failure.
        failed = offender_count * 10 > row_max_err.shape[0]

    if failed:
        print_test_debug_info(
            "bmm",
            {
                "m": M,
                "k": K,
                "n": N,
                "X": X,
                "Y": Y,
                "out_glow": out_glow,
                "out_c2_fakefp16": out_c2_fakefp16,
                "diff": rel_err,
            },
        )
        assert 0
def _test_binary_op_graph(self, name):
    """Compare onnxified binary op `name` with `name + "FakeFp16"`.

    Inputs are random 4-D tensors with a leading dimension of 1 and values
    drawn uniformly from [-100, 100). After the nets are created, ten fresh
    input pairs are run through both and compared.

    Args:
        name: operator type name of the binary op under test.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if any iteration's outputs are not ``np.allclose``.
    """
    # First dimension is the batch size
    dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))

    def draw_operand():
        # One float32 tensor of shape `dims`, uniform on [-100, 100).
        return np.random.uniform(
            low=-100.0, high=100.0, size=dims
        ).astype(np.float32)

    lhs = draw_operand()
    rhs = draw_operand()
    print(lhs.shape, rhs.shape)

    glow_net = caffe2_pb2.NetDef()
    glow_net.name = "pred"
    glow_net.external_input.extend(["A", "B"])
    glow_net.external_output.append("C")
    glow_op = core.CreateOperator(name, ["A", "B"], ["C"])
    glow_net.op.add().CopyFrom(glow_op)

    # Reference net; it writes to "C_ref" so both results can coexist.
    reference_net = caffe2_pb2.NetDef()
    reference_net.name = "ref"
    reference_net.external_input.extend(["A", "B"])
    reference_net.external_output.append("C_ref")
    reference_op = core.CreateOperator(name + "FakeFp16", ["A", "B"], ["C_ref"])
    reference_net.op.add().CopyFrom(reference_op)

    onnxified_net = onnxifi_caffe2_net(
        glow_net,
        {"A": lhs.shape, "B": rhs.shape},
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    print(onnxified_net)
    onnxifi_op_count = sum(
        1 for op in onnxified_net.op if op.type == "Onnxifi"
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("A", lhs)
    workspace.FeedBlob("B", rhs)

    workspace.CreateNet(reference_net)
    workspace.CreateNet(onnxified_net)

    num_iterations = 10
    for _ in range(num_iterations):
        lhs = draw_operand()
        rhs = draw_operand()
        workspace.FeedBlob("A", lhs)
        workspace.FeedBlob("B", rhs)

        # Run caffe2 net
        workspace.RunNet(reference_net.name)
        y_ref = workspace.FetchBlob("C_ref")

        # Run Glow net
        workspace.RunNet(onnxified_net.name)
        y_glow = workspace.FetchBlob("C")

        # Results should be identical since we are comparing with the C2 emulation
        if np.allclose(y_ref, y_glow):
            continue

        rel_err = np.abs((y_glow - y_ref) / (y_ref + kEpsilon))
        print_test_debug_info(
            name,
            {
                "dims": dims,
                "A": lhs,
                "B": rhs,
                "Y_glow": y_glow,
                "Y_c2": y_ref,
                "diff": rel_err,
            },
        )
        assert 0
def test_fc_exercise(self):
    """Check that the onnxified FC net matches the plain FC net.

    Operands are integer-valued float32 entries in {1, 2}. The original
    note on this test says it checks that the matmul engine works and does
    not test precision.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if any iteration's outputs are not ``np.allclose``.
    """
    rows = np.random.randint(low=4, high=50)
    inner = np.random.randint(low=4, high=50)
    cols = np.random.randint(low=4, high=50)
    dtype = np.float32

    fc_net = caffe2_pb2.NetDef()
    fc_net.name = "pred"
    fc_net.external_input.extend(["X", "W0", "b0"])
    fc_net.external_output.append("Y")
    fc_op = core.CreateOperator("FC", ["X", "W0", "b0"], ["Y"])
    fc_net.op.add().CopyFrom(fc_op)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.ResetWorkspace()

    weight = np.random.randint(low=1, high=3, size=(cols, inner)).astype(dtype)
    bias = np.random.randint(low=1, high=3, size=cols).astype(dtype)
    workspace.FeedBlob("W0", weight)
    workspace.FeedBlob("b0", bias)

    onnxified_net = onnxifi_caffe2_net(
        fc_net,
        {"X": (rows, inner)},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    onnxifi_op_count = sum(
        1 for op in onnxified_net.op if op.type == "Onnxifi"
    )
    np.testing.assert_equal(onnxifi_op_count, 1)

    x_data = np.random.randint(low=1, high=3, size=(rows, inner)).astype(dtype)
    workspace.FeedBlob("X", x_data)
    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(fc_net)

    num_iterations = 2
    for _ in range(num_iterations):
        x_data = np.random.randint(
            low=1, high=3, size=(rows, inner)
        ).astype(dtype)
        workspace.FeedBlob("X", x_data)

        # Run Glow net
        workspace.RunNet(onnxified_net.name)
        y_glow = workspace.FetchBlob("Y")

        # Run caffe2 net
        workspace.RunNet(fc_net.name)
        y_ref = workspace.FetchBlob("Y")

        if np.allclose(y_ref, y_glow):
            continue

        print_test_debug_info(
            "fc",
            {
                "m": rows,
                "k": inner,
                "n": cols,
                "X": x_data,
                "W0": weight,
                "b0": bias,
                "Y_glow": y_glow,
                "Y_c2": y_ref,
                "diff": np.abs((y_ref - y_glow) / y_ref),
            },
        )
        assert 0
def test_fc_num0(self):
    """Test FC numerics against Fp16FCAcc16NNPI with random dimensions.

    The weight is randomly fed either as "W0" to "FC" or as "W_packed" to
    "FbFCPacked". Ten random inputs are run; an iteration fails when any
    row's maximum relative error exceeds GLOW_MATMUL_RTOL.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if any row exceeds the tolerance.
    """
    # The seed is time based; keep it so it can be reported on failure.
    seed = int(time.time())
    np.random.seed(seed)
    m = np.random.randint(low=4, high=50)
    k = np.random.randint(low=4, high=1000)
    n = np.random.randint(low=4, high=50)
    use_packed = np.random.randint(2)
    W = "W_packed" if use_packed else "W0"
    dtype = np.float32

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", W, "b0"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "FbFCPacked" if use_packed else "FC",
            ["X", W, "b0"],
            ["Y"],
        )
    )

    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred"
    pred_net_ref.external_input.extend(["X", W, "b0"])
    pred_net_ref.external_output.append("Y")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            "Fp16FCAcc16NNPI",
            ["X", W, "b0"],
            ["Y"],
        )
    )

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.ResetWorkspace()

    # Weights and bias are rounded through float16 before being fed.
    W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(
        np.float32
    )
    b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(np.float32)

    workspace.FeedBlob("W0", W0)
    workspace.FeedBlob("b0", b0)
    # "W_packed" is produced whether or not the packed variant was picked.
    workspace.RunOperatorOnce(
        core.CreateOperator(
            "FbGemmPack",
            ["W0"],
            ["W_packed"],
            no_packing=True,
        )
    )

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": (m, k)},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    X0 = np.random.rand(m, k).astype(dtype) - 0.5
    workspace.FeedBlob("X", X0)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    num_iterations = 10
    for iteration in range(num_iterations):
        X0 = 100 * (np.random.rand(m, k) - 0.5).astype(np.float16).astype(
            np.float32
        )
        workspace.FeedBlob("X", X0)

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")

        diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
        rowdiff = np.max(diff, axis=1)

        n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
        if n_offenders > 0:
            print_test_debug_info(
                "fc",
                {
                    "seed": seed,
                    "iter": iteration,
                    "m": m,
                    "k": k,
                    "n": n,
                    "X": X0,
                    "W0": W0,
                    "b0": b0,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "diff": diff,
                    "rowdiff": rowdiff,
                },
            )
            assert 0
def test_fc_numeric_cases(self):
    """Test FC numerics on fixed inputs, with Fp16FCAcc16NNPI as reference.

    The weight, bias and three input rows are hard-coded; the original note
    says they are examples found from the unit test. A case fails when a
    row's maximum relative error exceeds GLOW_MATMUL_RTOL.

    Raises:
        AssertionError: if the net is not turned into exactly one "Onnxifi"
            op, or if any case exceeds the tolerance.
    """
    m = 1
    k = 20
    n = 1
    dtype = np.float32

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "W0", "b0"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "FC",
            ["X", "W0", "b0"],
            ["Y"],
        )
    )

    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred"
    pred_net_ref.external_input.extend(["X", "W0", "b0"])
    pred_net_ref.external_output.append("Y")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            "Fp16FCAcc16NNPI",
            ["X", "W0", "b0"],
            ["Y"],
        )
    )

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.ResetWorkspace()

    W0 = np.array([[
        0.04882812, 0.21520996, 0.1027832, 0.04489136, -0.07635498,
        0.14587402, -0.06240845, 0.3918457, 0.46362305, -0.11657715,
        0.29174805, 0.02890015, 0.0680542, 0.4255371, -0.42895508,
        -0.4128418, -0.47973633, 0.33251953, 0.27807617, 0.3701172
    ]], dtype=np.float32)
    b0 = [0.47851562]
    b0 = np.array(b0, dtype=np.float32)

    workspace.FeedBlob("W0", W0)
    workspace.FeedBlob("b0", b0)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": (m, k)},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    # Random input fed before the nets are created; each case below
    # overwrites "X" before running.
    X0 = np.random.rand(m, k).astype(dtype) - 0.5
    workspace.FeedBlob("X", X0)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    X_inputs = [
        np.array([[
            -2.94921875e-01, -3.58642578e-01, -1.92871094e-01,
            2.81250000e-01, -1.30126953e-01, 2.32696533e-02,
            -4.55566406e-01, -2.31811523e-01, -1.95190430e-01,
            -7.76977539e-02, -1.29394531e-01, 2.94677734e-01,
            8.96453857e-04, 4.97314453e-01, -6.07604980e-02,
            2.55371094e-01, 3.49853516e-01, -1.37695312e-01,
            2.95410156e-01, -3.67187500e-01
        ]], dtype=np.float32),
        np.array([[
            -0.4494629, -0.22192383, -0.1640625, 0.11480713, -0.09851074,
            -0.02084351, 0.19091797, -0.17468262, -0.47485352, 0.07489014,
            0.03897095, 0.00197601, 0.02835083, -0.27294922, 0.26757812,
            -0.20996094, -0.31103516, -0.41601562, 0.09918213, -0.07696533
        ]], dtype=np.float32),
        np.array([[
            0.01150513, -0.20507812, 0.46704102, 0.00906372, 0.19848633,
            0.3720703, 0.46557617, -0.47436523, -0.35107422, -0.0362854,
            -0.20812988, 0.41918945, 0.09716797, 0.19897461, 0.3876953,
            -0.0165863, 0.23535156, 0.29956055, 0.24389648, -0.23486328
        ]], dtype=np.float32)
    ]

    for i, X_case in enumerate(X_inputs):
        workspace.FeedBlob("X", X_case)

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        # Run caffe2 reference net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")

        diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
        rowdiff = np.max(diff, axis=1)

        n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
        if n_offenders > 0:
            print_test_debug_info(
                "fc",
                {
                    "iter": i,
                    "m": m,
                    "k": k,
                    "n": n,
                    # Report the input that actually failed, not the
                    # random X0 fed before the nets were created.
                    "X": X_case,
                    "W0": W0,
                    "b0": b0,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "diff": diff,
                    "rowdiff": rowdiff,
                },
            )
            assert 0