def relu_test(self, inputs, gc, dc, seed):
    """Check the Onnxifi-lowered Relu net against the ReluFakeFp16 reference.

    The incoming ``inputs`` value is not used: a fresh random value is drawn
    after seeding NumPy with ``seed``. ``gc`` and ``dc`` are unused.
    Fails when the two outputs are not ``np.allclose``.
    """
    np.random.seed(seed)
    # Indexing the length-1 array yields a 0-d float32 value, so the shape
    # printed below is ().
    sample = np.random.rand(1).astype(np.float32)[0]
    print(sample.shape)

    # Net that gets lowered through Onnxifi.
    lowered_src = caffe2_pb2.NetDef()
    lowered_src.name = "pred"
    lowered_src.external_input.extend(["X"])
    lowered_src.external_output.append("Y")
    relu_op = core.CreateOperator("Relu", ["X"], ["Y"])
    lowered_src.op.add().CopyFrom(relu_op)

    # Reference net; it writes to a separate blob so both results coexist.
    reference = caffe2_pb2.NetDef()
    reference.name = "ref"
    reference.external_input.extend(["X"])
    reference.external_output.append("Y_ref")
    fake_fp16_op = core.CreateOperator("ReluFakeFp16", ["X"], ["Y_ref"])
    reference.op.add().CopyFrom(fake_fp16_op)

    hints = {"X": sample.shape}
    lowered = onnxifi_caffe2_net(
        lowered_src,
        hints,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    print(lowered)
    # The whole net must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("X", sample)
    workspace.CreateNet(reference)
    workspace.CreateNet(lowered)
    workspace.FeedBlob("X", sample)

    # Run caffe2 net
    workspace.RunNet(reference.name)
    Y_c2 = workspace.FetchBlob("Y_ref")

    # Run Glow net
    workspace.RunNet(lowered.name)
    Y_glow = workspace.FetchBlob("Y")

    # Results should be identical since we are comparing with the C2 emulation
    if np.allclose(Y_c2, Y_glow):
        return

    relative_error = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
    debug_payload = {
        "seed": seed,
        "X": sample,
        "Y_glow": Y_glow,
        "Y_c2": Y_c2,
        "diff": relative_error,
    }
    print_test_debug_info("Relu", debug_payload)
    assert 0
def test_logit(self, seed):
    """Compare the Onnxifi-lowered Logit op with LogitFakeFp16NNPI on [0, 1].

    Fails when the largest non-NaN absolute difference between the two
    outputs exceeds 9e-3. In that case the difference and the reference
    output are also written with ``np.save`` under /tmp.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    batch, width = 1, 15361
    samples = np.linspace(0, 1, num=width, dtype=np.float32)

    def single_op_net(net_name, op_type):
        # One-operator net reading blob X and writing blob Y.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.append("X")
        net.external_output.append("Y")
        net.op.add().CopyFrom(
            core.CreateOperator(op_type, ["X"], ["Y"], eps=1e-6)
        )
        return net

    pred_net = single_op_net("pred", "Logit")
    ref_net = single_op_net("ref", "LogitFakeFp16NNPI")
    print("REF NET = {}".format(ref_net))

    hints = {"X": (batch, width)}
    lowered = onnxifi_caffe2_net(
        pred_net,
        hints,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    # The whole net must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("X", samples)
    workspace.CreateNet(ref_net)
    workspace.CreateNet(lowered)

    # Run Glow net
    workspace.RunNet(lowered.name)
    Y_glow = workspace.FetchBlob("Y")

    # Run caffe2 reference net
    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob("Y")

    abs_error = np.abs(Y_c2 - Y_glow)
    worst = np.nanmax(abs_error)
    # Keep the comparison in this form so a NaN maximum does not fail.
    if not (worst > 9e-3):
        return

    np.save("/tmp/logit_diff", abs_error)
    np.save("/tmp/logit_result", Y_c2)
    debug_payload = {
        "X": samples,
        "Y_c2": Y_c2,
        "Y_glow": Y_glow,
        "diff": abs_error,
    }
    print_test_debug_info("Logit", debug_payload)
    assert 0
def _test_unary_op(self, opname, seed):
    """Compare a lowered unary op with its ``<opname>FakeFp16NNPI`` reference.

    The input is 10000 evenly spaced float32 values in [-20, 20]. Fails when
    the two outputs are not ``np.allclose``; the absolute difference is then
    saved under /tmp and dumped through ``print_test_debug_info``.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    batch, width = 1, 10000
    samples = np.linspace(-20, 20, num=width, dtype=np.float32)

    def single_op_net(net_name, op_type):
        # One-operator net reading blob X and writing blob Y.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.append("X")
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, ["X"], ["Y"]))
        return net

    pred_net = single_op_net("pred", opname)
    ref_net = single_op_net("ref", opname + "FakeFp16NNPI")

    hints = {"X": (batch, width)}
    lowered = onnxifi_caffe2_net(
        pred_net,
        hints,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    print(lowered)
    # The whole net must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("X", samples)
    workspace.CreateNet(ref_net)
    workspace.CreateNet(lowered)

    # Run Glow net
    workspace.RunNet(lowered.name)
    Y_glow = workspace.FetchBlob("Y")

    # Run caffe2 reference net
    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob("Y")

    if np.allclose(Y_c2, Y_glow):
        return

    abs_error = np.abs(Y_c2 - Y_glow)
    np.save("/tmp/" + opname + "diff", abs_error)
    debug_payload = {
        "X": samples,
        "Y_c2": Y_c2,
        "Y_glow": Y_glow,
        "diff": abs_error,
        "maxdiff": np.max(abs_error),
    }
    print_test_debug_info(opname, debug_payload)
    assert 0
def _test_unary_op(self, opname, X, rtol=1e-5, atol=1e-8):
    """Compare a lowered unary op with its ``<opname>FakeFp16NNPI`` reference.

    Args:
        opname: Caffe2 operator type to lower; the reference operator type is
            ``opname + 'FakeFp16NNPI'``.
        X: Input array fed as blob ``X``; its shape is used as the shape hint.
        rtol: Relative tolerance forwarded to ``np.allclose``.
        atol: Absolute tolerance forwarded to ``np.allclose``.

    Returns:
        The output fetched after running the Onnxifi-lowered net.

    Raises:
        AssertionError: if the lowered net does not contain exactly one
            Onnxifi op, or if the two outputs differ beyond the tolerances.
    """
    workspace.ResetWorkspace()

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.append("X")
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(core.CreateOperator(opname, ['X'], ['Y']))

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.append("X")
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(opname + 'FakeFp16NNPI', ['X'], ['Y']))
    print("REF NET = {}".format(ref_net))

    shape_hints = {"X": X.shape}
    pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                            shape_hints,
                                            debug=True,
                                            adjust_batch=False,
                                            use_onnx=False)
    # The whole net must have been folded into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("X", X)
    workspace.CreateNet(ref_net)
    workspace.CreateNet(pred_net_onnxified)

    # Run Glow net
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob('Y')

    # Run caffe2 reference net
    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob('Y')

    # Fix: the relative tolerance used to be passed as ``rtol=atol``, which
    # silently ignored the caller-supplied ``rtol``.
    if not np.allclose(Y_c2, Y_glow, rtol=rtol, atol=atol):
        diff = np.abs(Y_c2 - Y_glow)
        np.save('/tmp/' + opname + 'diff', diff)
        np.save('/tmp/' + opname + 'result', Y_c2)
        print_test_debug_info(opname, {
            "X": X,
            "Y_c2": Y_c2,
            "Y_glow": Y_glow,
            "diff": diff
        })
        assert (0)

    return Y_glow
def test_slws_fused_4bit_rowwise(self):
    """Compare lowered 4-bit fused SparseLengthsWeightedSum with its reference.

    The seed is derived from the wall clock and printed so a failing run can
    be reproduced. Any non-zero relative error in any output row fails.
    """
    # Comment out for predictable debugging
    seed = int(time.time() * 1000) % 2 ** 16
    print(seed)
    np.random.seed(seed)
    workspace.ResetWorkspace()

    num_rows, embedding_dim = 20000, 6
    segment_cap = 200
    segment_length_cap = 200

    # The random draws below must stay in this order to keep the generated
    # data identical for a given seed.
    data = (
        4 * np.random.random_sample((num_rows, embedding_dim)) + 1
    ).astype(np.float32)
    # number of segments to run
    segment_count = np.random.randint(0, segment_cap + 1)
    lengths = np.random.randint(
        2, segment_length_cap + 1, size=segment_count
    ).astype(np.int32)
    index_count = np.sum(lengths)
    indices = np.random.randint(
        low=0, high=num_rows, size=index_count, dtype=np.int64
    )
    weights = np.random.uniform(
        low=0.01, high=0.5, size=[len(indices)]
    ).astype(np.float32)

    blob_names = ["quantized_data", "weights", "indices", "lengths"]

    def single_op_net(net_name, op_type):
        # One-operator net over the four SLWS input blobs, writing blob Y.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(blob_names)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, blob_names, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused4BitRowwise"
    )
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI"
    )

    # Quantize the float table in the workspace before lowering.
    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused4BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=segment_cap,
        max_seq_size=segment_cap * segment_length_cap,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    relative_error = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    worst_per_row = np.max(relative_error, axis=1)
    offenders = (worst_per_row > 0).sum()
    if not offenders > 0:
        return

    debug_payload = {
        "indices": indices,
        "data": data.shape,
        "lengths": lengths,
        "weights": weights,
        "Y_glow": Y_glow,
        "Y_ref": Y_ref,
        "diff": relative_error,
        "rowwise_diff": np.max(relative_error, axis=1),
    }
    print_test_debug_info("slws_fused_4bit", debug_payload)
    assert 0
def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim,
                                 batch_size, max_weight):
    """Compare lowered 4-bit fused SparseLengthsWeightedSum with its reference.

    Segment lengths and indices are drawn from ``[1, num_rows)``, and weights
    uniformly from ``[0, max_weight)``. Fails when the two outputs are not
    ``np.allclose``.
    """
    workspace.ResetWorkspace()
    np.random.seed(seed)

    # The random draws below must stay in this order to keep the generated
    # data identical for a given seed.
    data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
    lengths = np.random.choice(
        np.arange(1, num_rows), batch_size
    ).astype(np.int32)
    drawn = [
        row
        for length in lengths
        for row in np.random.choice(np.arange(1, num_rows), length)
    ]
    indices = np.asarray(drawn).astype(np.int64)
    weights = np.random.uniform(
        low=0, high=max_weight, size=[len(indices)]
    ).astype(np.float32)

    blob_names = ["quantized_data", "weights", "indices", "lengths"]

    def single_op_net(net_name, op_type):
        # One-operator net over the four SLWS input blobs, writing blob Y.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(blob_names)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, blob_names, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused4BitRowwise"
    )
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI"
    )

    # Quantize the float table in the workspace before lowering.
    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused4BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)

    lowered = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=batch_size * np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    # The whole net must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(lowered)
    workspace.CreateNet(ref_net)

    workspace.RunNet(lowered.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob("Y")

    if np.allclose(Y_c2, Y_glow):
        return

    debug_payload = {
        "seed": seed,
        "indices": indices,
        "data": data,
        "lengths": lengths,
        "weights": weights,
        "Y_c2": Y_c2,
        "Y_glow": Y_glow,
        "diff": Y_glow - Y_c2,
        "rowwise_diff": (Y_glow - Y_c2)[:, 0],
    }
    print_test_debug_info("slws_fused_4bit_rowwise", debug_payload)
    assert 0
def test_deq_swish_quant(self):
    """Compare a Quantize->FC->Dequantize->Swish->Quantize chain on both paths.

    The reference net is built from the NNPI/Fake operator variants and run
    first. Its operator types are then rewritten in place to the plain
    variants, and the same proto is lowered through Onnxifi. Fails when any
    quantized output element differs by more than 1.
    """
    workspace.ResetWorkspace()

    width = 256
    X_fp32 = np.linspace(-20.5, 8., num=width).astype(np.float32)
    X_fp32 = X_fp32.reshape(1, width)
    Y_fp32 = self._swish(X_fp32)
    X_scale, X_zero_point = self._get_scale_zp(X_fp32)
    Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)
    # Identity weights and zero bias, so the FC adds no linear transform.
    W_fp32 = np.identity(width, dtype=np.float32)
    b_fp32 = np.zeros((width,), dtype=np.float32)

    for blob_name, blob in (("X", X_fp32), ("W", W_fp32), ("b", b_fp32)):
        workspace.FeedBlob(blob_name, blob)

    pack_op = core.CreateOperator(
        "Int8FCPackWeight",
        ["W"],
        ["W_int8"],
        engine="DNNLOWP",
        save_unpacked_weights=True,
        in_scale=X_scale,
    )
    workspace.RunOperatorOnce(pack_op)

    input_qparams = {"Y_scale": X_scale, "Y_zero_point": X_zero_point}
    output_qparams = {"Y_scale": Y_scale, "Y_zero_point": Y_zero_point}

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(["X"], ["X_int8"], **input_qparams)
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"], ["U_int8"], **input_qparams
    )
    ref_net.Int8DequantizeNNPI(["U_int8"], ["U_fp16"])
    ref_net.SwishFakeFp16NNPI(["U_fp16"], ["Y_fp16"])
    ref_net.Int8QuantizeNNPI(["Y_fp16"], ["Y"], **output_qparams)
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchInt8Blob("Y")

    # run onnxifi net
    plain_types = (
        "Int8Quantize",
        "Int8FC",
        "Int8Dequantize",
        "Swish",
        "Int8Quantize",
    )
    for position, plain_type in enumerate(plain_types):
        ref_net.Proto().op[position].type = plain_type

    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b"],
    )
    onnxifi_ops = [op for op in net_onnxified.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)
    # TODO: add an assertion to check the optimized net
    # fused Dequantize->Swish->Quantize to QuantizedSwish

    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchInt8Blob("Y")
    # These intermediates are fetched unconditionally; they are only used in
    # the failure dump below.
    Swish_Ips = workspace.FetchBlob("U_fp16")
    Swish_Ops = workspace.FetchBlob("Y_fp16")

    glow_values = Y_glow.data.astype(np.int32)
    fbgemm_values = Y_fbgemm.data.astype(np.int32)
    diff_Y = np.abs(glow_values - fbgemm_values)
    num_mismatches = np.count_nonzero(diff_Y)
    max_diff = np.max(diff_Y)

    # TODO: Debug the mismatch and make the test pass with max_diff == 0
    if not max_diff > 1:
        return

    debug_payload = {
        "X": X_fp32,
        "Swish_Ips": Swish_Ips,
        "Swish_Ops": Swish_Ops,
        "Y_fbgemm": Y_fbgemm,
        "Y_glow": Y_glow,
        "diff": diff_Y,
        "max_diff": max_diff,
        "num_mismatches": num_mismatches,
    }
    print_test_debug_info("QuantizedSwish", debug_payload)
    assert 0
def test_small_sls(self, seed):
    """Run a two-row, single-segment 8-bit fused SLWS on both paths.

    A 2x3 table is quantized and both rows are summed as one weighted
    segment. Any non-zero relative error between the lowered output and the
    reference output fails the test.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    num_rows, embedding_dim = 2, 3
    data = 4 * (
        np.random.random_sample((num_rows, embedding_dim)) + 1
    ).astype(np.float32)
    # One segment that covers every row exactly once.
    lengths = np.array([num_rows], dtype=np.int32)
    indices = np.array(range(num_rows), dtype=np.int64)
    weights = np.random.uniform(
        low=0.01, high=0.5, size=[num_rows]
    ).astype(np.float32)

    blob_names = ["quantized_data", "weights", "indices", "lengths"]

    def single_op_net(net_name, op_type):
        # One-operator net over the four SLWS input blobs, writing blob Y.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(blob_names)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, blob_names, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused8BitRowwise"
    )
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI"
    )

    # Quantize the float table in the workspace before lowering.
    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)
    quantized_data = workspace.FetchBlob("quantized_data")

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=1,
        max_seq_size=num_rows,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    relative_error = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    worst_per_row = np.max(relative_error, axis=1)
    offenders = (worst_per_row > 0).sum()
    if not offenders > 0:
        return

    np.set_printoptions(precision=12)
    # Print both results rounded through float16 for comparison.
    print(
        "ref",
        Y_ref.astype(np.float16).astype(np.float32),
        "glow",
        Y_glow.astype(np.float16).astype(np.float32),
    )
    debug_payload = {
        "seed": seed,
        "indices": indices,
        "data": data,
        "quantized_data": quantized_data,
        "lengths": lengths,
        "weights": weights,
        "Y_glow": Y_glow,
        "Y_ref": Y_ref,
        "diff": relative_error,
        "rowwise_diff": np.max(relative_error, axis=1),
    }
    print_test_debug_info("slws_fused_8bit_rowwise_inv_scale", debug_payload)
    assert 0
def test_slws_fused_8bit_rowwise(self, seed, num_rows, embedding_dim,
                                 batch_size, max_weight):
    """Compare lowered 8-bit fused SparseLengthsWeightedSum with its reference.

    Segment lengths and indices are drawn from ``[1, num_rows)``, and weights
    uniformly from ``[0, max_weight)``. Any non-zero relative error in any
    output row fails the test.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    # The random draws below must stay in this order to keep the generated
    # data identical for a given seed.
    data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
    lengths = np.random.choice(
        np.arange(1, num_rows), batch_size
    ).astype(np.int32)
    drawn = [
        row
        for length in lengths
        for row in np.random.choice(np.arange(1, num_rows), length)
    ]
    indices = np.asarray(drawn).astype(np.int64)
    weights = np.random.uniform(
        low=0, high=max_weight, size=[len(indices)]
    ).astype(np.float32)

    blob_names = ["quantized_data", "weights", "indices", "lengths"]

    def single_op_net(net_name, op_type):
        # One-operator net over the four SLWS input blobs, writing blob Y.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(blob_names)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, blob_names, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused8BitRowwise"
    )
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI"
    )

    # Quantize the float table in the workspace before lowering.
    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
    )
    workspace.RunOperatorOnce(quantize_op)

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=batch_size * np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    relative_error = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    worst_per_row = np.max(relative_error, axis=1)
    offenders = (worst_per_row > 0).sum()
    if not offenders > 0:
        return

    debug_payload = {
        "indices": indices,
        "data": data.shape,
        "lengths": lengths,
        "weights": weights,
        "Y_glow": Y_glow,
        "Y_ref": Y_ref,
        "diff": relative_error,
        "rowwise_diff": np.max(relative_error, axis=1),
    }
    print_test_debug_info("slws_fused_8bit_rowwise_inv_scale", debug_payload)
    assert 0
def test_batch_matmul(self, M, K, N, C, rand_seed, trans_a, trans_b,
                      run_ints):
    """Compare lowered BatchMatMul with the BatchMatMulFP16Acc32Fake reference.

    Operands are (C, M, K) and (C, K, N) before any transposition. With
    ``run_ints`` they hold integers drawn from {1, 2}; otherwise floats in
    [-50, 50). ``trans_a`` / ``trans_b`` swap the last two axes of the
    matching operand and are also forwarded to both operators.
    """
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()
    batch_dims = [C]

    def make_operand(rows, cols, transposed):
        # Draw one batched operand; X must be drawn before Y so the random
        # stream matches for a given seed.
        if run_ints:
            operand = np.random.randint(
                low=1, high=3, size=(C, rows, cols)
            ).astype(np.float32)
        else:
            operand = 100 * (
                np.random.rand(*(batch_dims + [rows, cols])).astype(
                    np.float32
                ) - 0.5
            )
        if transposed:
            operand = operand.swapaxes(-1, -2)
        return operand

    X = make_operand(M, K, trans_a)
    Y = make_operand(K, N, trans_b)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "Y"])
    pred_net.external_output.append("out")
    bmm_op = core.CreateOperator(
        "BatchMatMul", ["X", "Y"], "out", trans_a=trans_a, trans_b=trans_b
    )
    pred_net.op.add().CopyFrom(bmm_op)

    pred_net_ref = core.Net("pred_net_ref")
    # Reference updated to fp16 with fp32 accumulation
    pred_net_ref.BatchMatMulFP16Acc32Fake(
        ["X", "Y"], ["out"], trans_a=trans_a, trans_b=trans_b
    )

    print("dims", batch_dims, X.shape, Y.shape)
    hints = {"X": X.shape, "Y": Y.shape}
    lowered = onnxifi_caffe2_net(
        pred_net,
        hints,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    # The whole net must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.CreateNet(lowered)
    workspace.CreateNet(pred_net_ref)

    # Run Glow net
    workspace.RunNet(lowered.name)
    out_glow = workspace.FetchBlob("out")

    # Run caffe2 net
    workspace.RunNet(pred_net_ref)
    out_c2_fakefp16 = workspace.FetchBlob("out")

    abs_error = np.abs(out_c2_fakefp16 - out_glow)
    if np.allclose(out_glow, out_c2_fakefp16):
        return

    debug_payload = {
        "seed": rand_seed,
        "m": M,
        "k": K,
        "n": N,
        "X": X.shape,
        "Y": Y.shape,
        "trans_a": trans_a,
        "trans_b": trans_b,
        "run_ints": run_ints,
        "out_glow": out_glow,
        "out_c2_fakefp16": out_c2_fakefp16,
        "diff": abs_error,
    }
    print_test_debug_info("bmm", debug_payload)
    assert 0
def test_fc_exercise(self):
    """Smoke-test the lowered FC op against the plain Caffe2 FC op.

    Uses small integer-valued operands (drawn from {1, 2}) with random
    dimensions, so this checks that the matmul path works rather than its
    precision.
    """
    m = np.random.randint(low=4, high=50)
    k = np.random.randint(low=4, high=50)
    n = np.random.randint(low=4, high=50)
    dtype = np.float32

    fc_blobs = ["X", "W0", "b0"]
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(fc_blobs)
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(core.CreateOperator("FC", fc_blobs, ["Y"]))

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.ResetWorkspace()

    W0 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype)
    b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
    workspace.FeedBlob("W0", W0)
    workspace.FeedBlob("b0", b0)

    lowered = onnxifi_caffe2_net(
        pred_net,
        {"X": (m, k)},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    # The whole net must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    # Feed an initial X before the nets are created.
    X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
    workspace.FeedBlob("X", X0)
    workspace.CreateNet(lowered)
    workspace.CreateNet(pred_net)

    num_iterations = 2
    for _ in range(num_iterations):
        X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
        workspace.FeedBlob("X", X0)

        # Run Glow net
        workspace.RunNet(lowered.name)
        Y_glow = workspace.FetchBlob("Y")

        # Run caffe2 net
        workspace.RunNet(pred_net.name)
        Y_c2 = workspace.FetchBlob("Y")

        if np.allclose(Y_c2, Y_glow):
            continue

        debug_payload = {
            "m": m,
            "k": k,
            "n": n,
            "X": X0,
            "W0": W0,
            "b0": b0,
            "Y_glow": Y_glow,
            "Y_c2": Y_c2,
            "diff": np.abs((Y_c2 - Y_glow) / Y_c2),
        }
        print_test_debug_info("fc", debug_payload)
        assert 0
def test_fc_num0(self, seed):
    """Check lowered FC numerics against the Fp16FCAcc16NNPI reference.

    Dimensions are random, and the choice between plain ``FC`` on ``W0`` and
    ``FbFCPacked`` on ``W_packed`` is a random coin flip. Ten random inputs
    are run. Any row whose worst relative error exceeds
    ``GLOW_MATMUL_RTOL`` fails the test.
    """
    np.random.seed(seed)
    m = np.random.randint(low=4, high=50)
    k = np.random.randint(low=4, high=1000)
    n = np.random.randint(low=4, high=50)
    use_packed = np.random.randint(2)
    W = "W_packed" if use_packed else "W0"
    dtype = np.float32

    fc_blobs = ["X", W, "b0"]

    def single_op_net(op_type):
        # One-operator FC net; both nets are named "pred" as in the
        # original test.
        net = caffe2_pb2.NetDef()
        net.name = "pred"
        net.external_input.extend(fc_blobs)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, fc_blobs, ["Y"]))
        return net

    pred_net = single_op_net("FbFCPacked" if use_packed else "FC")
    pred_net_ref = single_op_net("Fp16FCAcc16NNPI")

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.ResetWorkspace()

    # Values are rounded through float16 and stored as float32.
    W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(
        np.float32)
    b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(
        np.float32)

    workspace.FeedBlob("W0", W0)
    workspace.FeedBlob("b0", b0)
    pack_op = core.CreateOperator(
        "FbGemmPack", ["W0"], ["W_packed"], no_packing=True
    )
    workspace.RunOperatorOnce(pack_op)

    lowered = onnxifi_caffe2_net(
        pred_net,
        {"X": (m, k)},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    # The whole net must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    # Feed an initial X before the nets are created.
    X0 = np.random.rand(m, k).astype(dtype) - 0.5
    workspace.FeedBlob("X", X0)
    workspace.CreateNet(lowered)
    workspace.CreateNet(pred_net_ref)

    num_iterations = 10
    for step in range(num_iterations):
        X0 = 100 * (np.random.rand(m, k) - 0.5).astype(
            np.float16).astype(np.float32)
        workspace.FeedBlob("X", X0)

        # Run Glow net
        workspace.RunNet(lowered.name)
        Y_glow = workspace.FetchBlob("Y")

        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y")

        relative_error = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
        rowdiff = np.max(relative_error, axis=1)
        n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
        if not n_offenders > 0:
            continue

        debug_payload = {
            "iter": step,
            "m": m,
            "k": k,
            "n": n,
            "X": X0,
            "W0": W0,
            "b0": b0,
            "Y_glow": Y_glow,
            "Y_c2": Y_c2,
            "diff": relative_error,
            "rowdiff": rowdiff,
        }
        print_test_debug_info("fc", debug_payload)
        assert 0
def test_fc_numeric_cases(self):
    """Check lowered FC numerics on fixed inputs found by earlier unit tests.

    A 1x20 weight row and a single bias value are fixed. Three fixed 1x20
    inputs are run through the Onnxifi-lowered ``FC`` net and the
    ``Fp16FCAcc16NNPI`` reference net.

    Raises:
        AssertionError: if the lowered net does not contain exactly one
            Onnxifi op, or if any row's worst relative error exceeds
            ``GLOW_MATMUL_RTOL``.
    """
    m = 1
    k = 20
    n = 1
    dtype = np.float32

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "W0", "b0"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "FC",
            ["X", "W0", "b0"],
            ["Y"],
        ))

    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred"
    pred_net_ref.external_input.extend(["X", "W0", "b0"])
    pred_net_ref.external_output.append("Y")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            "Fp16FCAcc16NNPI",
            ["X", "W0", "b0"],
            ["Y"],
        ))

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.ResetWorkspace()

    W0 = np.array([[
        0.04882812, 0.21520996, 0.1027832, 0.04489136, -0.07635498,
        0.14587402, -0.06240845, 0.3918457, 0.46362305, -0.11657715,
        0.29174805, 0.02890015, 0.0680542, 0.4255371, -0.42895508,
        -0.4128418, -0.47973633, 0.33251953, 0.27807617, 0.3701172
    ]], dtype=np.float32)
    b0 = [0.47851562]
    b0 = np.array(b0, dtype=np.float32)

    workspace.FeedBlob("W0", W0)
    workspace.FeedBlob("b0", b0)

    pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                            {"X": (m, k)},
                                            debug=True,
                                            adjust_batch=False,
                                            use_onnx=False)
    # The whole net must have been folded into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    # Feed a random placeholder X before the nets are created. It is
    # overwritten by each fixed input below and is never compared.
    X0 = np.random.rand(m, k).astype(dtype) - 0.5
    workspace.FeedBlob("X", X0)
    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    X_inputs = [
        np.array([[
            -2.94921875e-01, -3.58642578e-01, -1.92871094e-01,
            2.81250000e-01, -1.30126953e-01, 2.32696533e-02,
            -4.55566406e-01, -2.31811523e-01, -1.95190430e-01,
            -7.76977539e-02, -1.29394531e-01, 2.94677734e-01,
            8.96453857e-04, 4.97314453e-01, -6.07604980e-02,
            2.55371094e-01, 3.49853516e-01, -1.37695312e-01,
            2.95410156e-01, -3.67187500e-01
        ]], dtype=np.float32),
        np.array([[
            -0.4494629, -0.22192383, -0.1640625, 0.11480713, -0.09851074,
            -0.02084351, 0.19091797, -0.17468262, -0.47485352, 0.07489014,
            0.03897095, 0.00197601, 0.02835083, -0.27294922, 0.26757812,
            -0.20996094, -0.31103516, -0.41601562, 0.09918213, -0.07696533
        ]], dtype=np.float32),
        np.array([[
            0.01150513, -0.20507812, 0.46704102, 0.00906372, 0.19848633,
            0.3720703, 0.46557617, -0.47436523, -0.35107422, -0.0362854,
            -0.20812988, 0.41918945, 0.09716797, 0.19897461, 0.3876953,
            -0.0165863, 0.23535156, 0.29956055, 0.24389648, -0.23486328
        ]], dtype=np.float32)
    ]

    for i, X_case in enumerate(X_inputs):
        workspace.FeedBlob("X", X_case)

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob('Y')

        # Run caffe2 reference net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob('Y')

        diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
        rowdiff = np.max(diff, axis=1)

        n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
        if n_offenders > 0:
            # Fix: report the input that actually failed. The dump used to
            # show the random placeholder X0, which was never run.
            print_test_debug_info(
                "fc", {
                    "iter": i,
                    "m": m,
                    "k": k,
                    "n": n,
                    "X": X_case,
                    "W0": W0,
                    "b0": b0,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "diff": diff,
                    "rowdiff": rowdiff
                })
            assert (0)
def test_ParallelFC(self, m, k, n, scale, zp, rand_seed):
    """Compare a Quantize->Int8FC->Int8Relu->Dequantize chain on both paths.

    The reference net is built from the NNPI/Fake operator variants and run
    first. Its operator types are then rewritten in place to the plain
    variants, and the same proto is lowered through Onnxifi. ``scale`` and
    ``zp`` are not used. Fails when the two dequantized outputs are not
    ``np.allclose``.
    """
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    # Y = W_T * X + b
    X_fp32 = np.random.uniform(-1, 1, size=(m, k)).astype(np.float16)
    X_fp32 = X_fp32.astype(np.float32)
    W_fp32 = np.random.uniform(-1, 1, size=(n, k)).astype(np.float32)
    b_fp32 = np.zeros((n,), dtype=np.float32)

    X_scale, X_zero_point = self._get_scale_zp(X_fp32)

    for blob_name, blob in (("X", X_fp32), ("W", W_fp32), ("b", b_fp32)):
        workspace.FeedBlob(blob_name, blob)

    pack_op = core.CreateOperator(
        "Int8FCPackWeight",
        ["W"],
        ["W_int8"],
        engine="DNNLOWP",
        save_unpacked_weights=True,
        in_scale=X_scale,
    )
    workspace.RunOperatorOnce(pack_op)

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(
        ["X"], ["X_int8"], Y_scale=X_scale, Y_zero_point=X_zero_point
    )
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"],
        ["Y_int8"],
        Y_scale=X_scale,
        Y_zero_point=X_zero_point,
    )
    ref_net.Int8Relu(
        ["Y_int8"],
        ["Y_relu"],
        Y_zero_point=X_zero_point,
        Y_scale=X_scale,
    )
    ref_net.Int8DequantizeNNPI(["Y_relu"], ["Y"])
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchBlob("Y")

    # run onnxifi net
    plain_types = ("Int8Quantize", "Int8FC", "Int8Relu", "Int8Dequantize")
    for position, plain_type in enumerate(plain_types):
        ref_net.Proto().op[position].type = plain_type

    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b"],
    )
    onnxifi_ops = [op for op in net_onnxified.op if op.type == "Onnxifi"]
    print(net_onnxified)
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if np.allclose(Y_glow, Y_fbgemm):
        return

    diff_Y = np.abs(Y_glow - Y_fbgemm)
    debug_payload = {
        "seed": rand_seed,
        "n": n,
        "X": X_fp32,
        "W": W_fp32,
        "b": b_fp32,
        "Y_fbgemm": Y_fbgemm,
        "Y_glow": Y_glow,
        "diff": diff_Y,
        "maxdiff": diff_Y.max(axis=1),
    }
    print_test_debug_info("int8_fc", debug_payload)
    assert 0
def test_fused_ln_quantize(self, seed, batch_size, size, epsilon,
                           elementwise_affine):
    """Compare lowered LayerNorm+Int8Quantize with the fused reference op.

    The lowered net is ``LayerNorm`` followed by ``Int8Quantize``. The
    reference is the single ``LayerNormInt8QuantizeFakeNNPI`` op. Fails when
    the quantized data, scale or zero point of the two outputs differ.
    """
    np.random.seed(seed)

    # Reset the workspace
    workspace.ResetWorkspace()

    axis = 1

    dims = np.array(([batch_size, size]))
    X = np.random.uniform(size=dims).astype(np.float32) - 0.5
    gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
    beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

    # The output quantization parameters come from the class helpers.
    Y = self._layernorm_transform(X)
    scale, zp = self._get_scale_zp(Y)

    # gamma/beta are only operator inputs when the affine transform is on.
    if elementwise_affine:
        ln_inputs = ["X", "gamma", "beta"]
    else:
        ln_inputs = ["X"]

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "gamma", "beta"])
    pred_net.external_output.extend(["Y_q"])
    layer_norm_op = core.CreateOperator(
        "LayerNorm",
        ln_inputs,
        ["Y", "mean", "rstd"],
        axis=axis,
        epsilon=epsilon,
        elementwise_affine=elementwise_affine,
    )
    pred_net.op.add().CopyFrom(layer_norm_op)
    quantize_op = core.CreateOperator(
        "Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
    )
    pred_net.op.add().CopyFrom(quantize_op)

    print(pred_net)

    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred_ref"
    pred_net_ref.external_input.extend(["X", "gamma", "beta"])
    pred_net_ref.external_output.extend(["Y_q"])
    fused_op = core.CreateOperator(
        "LayerNormInt8QuantizeFakeNNPI",
        ln_inputs,
        ["Y_q", "mean", "rstd"],
        axis=axis,
        epsilon=epsilon,
        elementwise_affine=elementwise_affine,
        Y_scale=scale,
        Y_zero_point=zp,
    )
    pred_net_ref.op.add().CopyFrom(fused_op)

    shape_hints = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
    lowered = onnxifi_caffe2_net(
        pred_net,
        shape_hints,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    # Both ops must have been folded into exactly one Onnxifi op.
    onnxifi_ops = [op for op in lowered.op if op.type == "Onnxifi"]
    np.testing.assert_equal(len(onnxifi_ops), 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("gamma", gamma)
    workspace.FeedBlob("beta", beta)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(lowered)

    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchInt8Blob("Y_q")

    workspace.RunNet(lowered.name)
    Y_glow = workspace.FetchInt8Blob("Y_q")

    outputs_match = (
        np.allclose(Y_glow.data, Y_c2.data)
        and Y_glow.scale == Y_c2.scale
        and Y_glow.zero_point == Y_c2.zero_point
    )
    if outputs_match:
        return

    diff_Y = np.abs(
        Y_glow.data.astype(np.float32) - Y_c2.data.astype(np.float32)
    )
    debug_payload = {
        "seed": seed,
        "size": size,
        "batch_size": batch_size,
        "epsilon": epsilon,
        "gamma": gamma,
        "beta": beta,
        "elementwise_affine": elementwise_affine,
        "X": X,
        "Y_glow": Y_glow,
        "Y_c2": Y_c2,
        "diff_Y": diff_Y,
    }
    print_test_debug_info("layernorm", debug_payload)
    assert 0
def test_bn(self, seed):
    """Compare SpatialBN run through onnxifi against a fake-fp16 reference op.

    A ``SpatialBN`` prediction net (NCHW, ``is_test=True``) is transformed with
    ``onnxifi_caffe2_net`` and executed alongside a reference net holding
    either ``SpatialBNFakeLoweredFp16NNPI`` or ``SpatialBNFakeFp16NNPI``
    (chosen by ``GLOW_LOWERED_BATCHNORM``). The two ``Y`` outputs are compared
    after casting to float16.

    Changes from the previous version:
      * the reference net is named ``"pred_ref"`` rather than ``"pred"``, so it
        no longer collides with the prediction net when both are registered in
        the workspace;
      * the reference net declares ``"Y"`` (the blob the op writes) as its
        external output instead of the input ``"X"``.

    Args:
        seed: Seed for NumPy's global RNG.
    """
    workspace.ResetWorkspace()

    size = 30
    input_channels = 20
    batch_size = 40

    np.random.seed(seed)

    order = "NCHW"
    epsilon = 1e-3
    blob_names = ["X", "scale", "bias", "mean", "var"]

    # Net under test.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(blob_names)
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SpatialBN",
            blob_names,
            ["Y"],
            order=order,
            is_test=True,
            epsilon=epsilon,
        )
    )

    if GLOW_LOWERED_BATCHNORM:
        refopname = "SpatialBNFakeLoweredFp16NNPI"
    else:
        refopname = "SpatialBNFakeFp16NNPI"

    # Reference net: needs its own name so both nets can live in the workspace.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred_ref"
    pred_net_ref.external_input.extend(blob_names)
    pred_net_ref.external_output.append("Y")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            refopname,
            blob_names,
            ["Y"],
            order=order,
            is_test=True,
            epsilon=epsilon,
        )
    )

    # Draw order (scale, bias, mean, var, X) is fixed for seed reproducibility.
    scale = np.random.rand(input_channels).astype(np.float32) + 0.5
    bias = np.random.rand(input_channels).astype(np.float32) - 0.5
    mean = np.random.randn(input_channels).astype(np.float32)
    var = np.random.rand(input_channels).astype(np.float32) + 0.5
    X = np.random.rand(
        batch_size, input_channels, size, size).astype(np.float32) - 0.5

    # The per-channel parameters are fed before the onnxifi transform; X after.
    workspace.FeedBlob("scale", scale)
    workspace.FeedBlob("bias", bias)
    workspace.FeedBlob("mean", mean)
    workspace.FeedBlob("var", var)

    # Use for reference to debug
    # Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {
            "X": [batch_size, input_channels, size, size],
            "scale": [input_channels],
            "bias": [input_channels],
            "mean": [input_channels],
            "var": [input_channels],
        },
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    # Both nets write "Y"; fetch each result immediately after its run.
    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        diff = np.abs(Y_glow - Y_c2).astype(np.float16)
        print_test_debug_info(
            "bn",
            {
                "seed": seed,
                "scale": scale,
                "bias": bias,
                "mean": mean,
                "var": var,
                "Y_np": Y_c2.shape,
                "Y_glow": Y_glow.shape,
                "diff": diff,
                "rowwise_diff": np.max(np.abs(diff), -1),
            },
        )
        assert False
def test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine):
    """Compare LayerNorm run through onnxifi with ``LayerNormFakeFP16NNPI``.

    The reference net is run on the 2-D input ``X``. Before the onnxified net
    is run, ``X`` is re-fed reshaped with an extra leading dimension of 1. The
    ``Y`` outputs are compared after casting to float16.

    Args:
        seed: Seed for NumPy's global RNG.
        batch_size: Number of rows of ``X``.
        size: Number of columns of ``X`` (also the length of gamma/beta).
        epsilon: Forwarded to both layer-norm operators.
        elementwise_affine: When true, ``gamma`` and ``beta`` are wired in as
            operator inputs; otherwise only ``X`` is.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    axis = 1
    dims = np.array(([batch_size, size]))

    # Draw order (X, gamma, beta) is fixed so a given seed reproduces the data.
    X = np.random.uniform(size=dims).astype(np.float32) - 0.5
    gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
    beta = np.random.randn(*X.shape[axis:]).astype(np.float32)

    declared_inputs = ["X", "gamma", "beta"]
    declared_outputs = ["Y", "mean", "rstd"]
    op_inputs = declared_inputs if elementwise_affine else ["X"]

    def _single_op_net(net_name, op_type):
        # Both nets share the same wiring and differ only in name and op type.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(declared_inputs)
        net.external_output.extend(declared_outputs)
        net.op.add().CopyFrom(
            core.CreateOperator(
                op_type,
                op_inputs,
                declared_outputs,
                axis=axis,
                epsilon=epsilon,
                elementwise_affine=elementwise_affine,
            )
        )
        return net

    pred_net = _single_op_net("pred", "LayerNorm")
    pred_net_ref = _single_op_net("pred_ref", "LayerNormFakeFP16NNPI")

    shape_hints = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net, shape_hints, debug=True, adjust_batch=True, use_onnx=False
    )
    num_onnxified_ops = len(
        [op for op in pred_net_onnxified.op if op.type == "Onnxifi"]
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("gamma", gamma)
    workspace.FeedBlob("beta", beta)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")

    # The onnxified net is run on X with a leading dimension of 1 prepended.
    dims1 = np.array([1, *dims])
    X_glow = X.reshape(dims1)
    workspace.FeedBlob("X", X_glow)

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        return

    diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
    print_test_debug_info(
        "layernorm",
        {
            "seed": seed,
            "size": size,
            "batch_size": batch_size,
            "epsilon": epsilon,
            "gamma": gamma,
            "beta": beta,
            "elementwise_affine": elementwise_affine,
            "X": X,
            "Y_glow": Y_glow,
            "Y_c2": Y_c2,
            "diff_Y": diff_Y,
        },
    )
    assert False
def test_slws_fused_8bit_rowwise_all_same(self, seed):
    """Run SparseLengthsWeightedSumFused8BitRowwise on a constant 1x2 table.

    Every entry of ``data`` has the same value and every index is 0, so only
    the random ``lengths`` and ``weights`` vary between runs. The onnxified
    net's ``Y`` is compared against the output of
    ``SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI``.

    Args:
        seed: Seed for NumPy's global RNG.
    """
    # Comment out for predictable debugging
    np.random.seed(seed)
    workspace.ResetWorkspace()

    n = 1
    m = 2
    data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1

    max_segments = 5
    max_segment_length = 200

    # Draw order (num_lengths, lengths, weights) is fixed for reproducibility.
    num_lengths = np.random.randint(1, max_segments + 1)
    # number of segments to run
    lengths = np.random.randint(
        0, max_segment_length + 1, size=num_lengths
    ).astype(np.int32)
    num_indices = np.sum(lengths)
    indices = np.zeros(num_indices, dtype=np.int64)
    weights = np.random.uniform(
        low=-0.5, high=0.5, size=[len(indices)]
    ).astype(np.float32)

    op_inputs = ["quantized_data", "weights", "indices", "lengths"]

    def _single_op_net(net_name, op_type):
        # Both nets share the same wiring and differ only in name and op type.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(op_inputs)
        net.external_output.append("Y")
        net.op.add().CopyFrom(core.CreateOperator(op_type, op_inputs, ["Y"]))
        return net

    pred_net = _single_op_net("pred", "SparseLengthsWeightedSumFused8BitRowwise")
    ref_net = _single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI"
    )

    # Produce the quantized table before the onnxifi transform is invoked.
    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator(
            "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
        )
    )

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=max_segments,
        max_seq_size=max_segments * max_segment_length,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    num_onnxified_ops = len(
        [op for op in pred_net_onnxified.op if op.type == "Onnxifi"]
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    # Both nets write "Y"; fetch each result immediately after its run.
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob("Y")

    if np.allclose(Y_c2, Y_glow):
        return

    print_test_debug_info(
        "slws_fused_8bit_rowwise",
        {
            "seed": seed,
            "indices": indices,
            "data": data,
            "lengths": lengths,
            "weights": weights,
            "Y_c2": Y_c2,
            "Y_glow": Y_glow,
            "diff": Y_glow - Y_c2,
            "rowwise_diff": (Y_glow - Y_c2)[:, 0],
        },
    )
    assert False
def test_layernorm(self, seed, size, input_channels, batch_size, epsilon):
    """Compare LayerNorm run through onnxifi with ``LayerNormFakeFP16``.

    A 4-D input of shape ``(batch_size, input_channels, size, size)`` is run
    through the onnxified ``LayerNorm`` net and through a reference net holding
    ``LayerNormFakeFP16``. ``Y`` is compared after casting to float16; the
    ``mean`` and ``rstd`` blobs are fetched only for the failure report.

    Change from the previous version: the net that is created and run is now
    ``pred_net_onnxified``. Previously the untransformed ``pred_net`` was
    executed, so the transformed net was built and counted but never run, and
    the values reported as "glow" did not come from it.

    Args:
        seed: Seed for NumPy's global RNG.
        size: Extent of each of the two trailing dimensions of ``X``.
        input_channels: Extent of the second dimension of ``X``.
        batch_size: Extent of the leading dimension of ``X``.
        epsilon: Forwarded to both layer-norm operators.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    outputs = ["Y", "mean", "rstd"]

    # Net under test.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X"])
    pred_net.external_output.extend(outputs)
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNorm",
            ["X"],
            outputs,
            # axis=-1,
            epsilon=epsilon,
        )
    )

    # Reference net.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "pred_ref"
    pred_net_ref.external_input.extend(["X"])
    pred_net_ref.external_output.extend(outputs)
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNormFakeFP16",
            ["X"],
            outputs,
            # axis=-1,
            epsilon=epsilon,
        )
    )

    X = np.random.rand(
        batch_size, input_channels, size, size).astype(np.float32) - 0.5

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": [batch_size, input_channels, size, size]},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)

    # Register the transformed net, not the original one.
    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    # Both nets write the same blobs; fetch right after each run.
    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")
    mean_c2 = workspace.FetchBlob("mean")
    std_c2 = workspace.FetchBlob("rstd")

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")
    mean_glow = workspace.FetchBlob("mean")
    std_glow = workspace.FetchBlob("rstd")

    if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
        diff_std = np.abs(std_glow - std_c2).astype(np.float16)
        diff_mean = np.abs(mean_glow - mean_c2).astype(np.float16)
        print_test_debug_info(
            "layernorm",
            {
                "seed": seed,
                "size": size,
                "input_channels": input_channels,
                "batch_size": batch_size,
                "epsilon": epsilon,
                "X": X,
                "Y_glow": Y_glow,
                "mean_glow": mean_glow,
                "std_glow": std_glow,
                "Y_c2": Y_c2,
                "mean_c2": mean_c2,
                "std_c2": std_c2,
                "diff_Y": diff_Y,
                "diff_mean": diff_mean,
                "diff_std": diff_std,
            },
        )
        assert False
def Skip_test_SLS_NonQuantized_fp16(self):
    """Compare SparseLengthsSum across plain, fake-fp16 and onnxified nets.

    Three nets consume the blobs ``D`` (table), ``I`` (indices) and ``L``
    (lengths): a plain ``SparseLengthsSum``, a
    ``SparseLengthsSumFakeFP16AccFP16`` net, and an onnxified
    ``SparseLengthsSum``. The fake-fp16 output must be within 1e-3 of the
    plain one, and the onnxified output must satisfy ``np.allclose`` against
    the fake-fp16 one. No RNG seed is set by this method.
    """
    num_rows = 20000
    embedding_dim = 64

    # Draw order (table, then indices) matches the original implementation.
    table = (4 * np.random.random_sample((num_rows, embedding_dim)) + 1).astype(
        np.float32
    )
    indices = (np.random.randint(0, num_rows, size=12)).astype(np.int64)
    lengths = np.asarray([4, 4, 4]).astype(np.int32)

    blob_names = ["D", "I", "L"]

    # The table is fed before the onnxifi transform; indices/lengths after.
    workspace.FeedBlob("D", table)

    ref_c2_net = core.Net("test_ref_c2")
    ref_c2_net.SparseLengthsSum(blob_names, "ref_out")
    ref_c2_net.Proto().external_input.extend(blob_names)
    ref_c2_net.Proto().external_output.extend(["ref_out"])

    fp16_c2_net = core.Net("test_fp16_c2")
    fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(blob_names, "fp16_out")

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(blob_names)
    pred_net.external_output.append("glow_out")
    pred_net.op.add().CopyFrom(
        core.CreateOperator("SparseLengthsSum", blob_names, ["glow_out"])
    )

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=3,
        max_seq_size=16,
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = len(
        [op for op in onnxified_net.op if op.type == "Onnxifi"]
    )
    print(onnxified_net)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("I", indices)
    workspace.FeedBlob("L", lengths)

    workspace.RunNetOnce(ref_c2_net)
    ref_c2_out = workspace.FetchBlob("ref_out")

    workspace.RunNetOnce(fp16_c2_net)
    fp16_c2_out = workspace.FetchBlob("fp16_out")

    # Check the fake-fp16 op against the plain operator first.
    np.testing.assert_allclose(fp16_c2_out, ref_c2_out, atol=1e-3, rtol=1e-3)

    workspace.RunNetOnce(onnxified_net)
    fp16_glow_out = workspace.FetchBlob("glow_out")

    if np.allclose(fp16_glow_out, fp16_c2_out):
        return

    diff = np.abs(fp16_glow_out - fp16_c2_out)
    print_test_debug_info(
        "sls",
        {
            "indices": indices,
            "data": table,
            "lengths": lengths,
            "Y_c2": fp16_c2_out,
            "Y_glow": fp16_glow_out,
            "diff": diff,
            "rowwise_diff": diff[:, 0],
        },
    )
    assert False
def test_int8_fc(self, n, m, k, rand_seed, quantize_bias, f):
    """Compare a quantize -> int8 FC -> dequantize chain on two backends.

    The reference net is built from the ``*NNPI`` operator variants and run
    once. Its operator types are then rewritten in place to ``Int8Quantize``,
    ``Int8FC`` and ``Int8Dequantize``, and the same proto is passed through
    ``onnxifi_caffe2_net`` and run again. The two ``Y`` outputs are compared
    with ``np.allclose``.

    Args:
        n: Number of rows of the weight matrix (output features).
        m: Number of rows of the input ``X``.
        k: Shared inner dimension of ``X`` and ``W``.
        rand_seed: Seed for NumPy's global RNG.
        quantize_bias: When true, the bias is packed together with the weights
            into ``b_int32``; otherwise the float blob ``b`` is used.
        f: Half-width of the uniform range the random data is drawn from
            (converted with ``float``).
    """
    print(
        f"n={n}, m={m}, k={k}, rand_seed={rand_seed}, quantize_bias={quantize_bias}"
    )
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    bound = float(f)
    # Draw order (X, W, b) is fixed so a given seed reproduces the data.
    X_fp32 = np.random.uniform(-bound, bound, size=(m, k)).astype(np.float32)
    W_fp32 = np.random.uniform(-bound, bound, size=(n, k)).astype(np.float32)
    b_fp32 = np.random.uniform(-bound, bound, size=n).astype(np.float32)

    # Quantization parameters for the input and for the float FC result.
    X_scale, X_zero_point = self._get_scale_zp(X_fp32)
    Y_fp32 = np.dot(X_fp32, W_fp32.T) + b_fp32
    Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)

    workspace.FeedBlob("X", X_fp32)
    workspace.FeedBlob("W", W_fp32)
    workspace.FeedBlob("b", b_fp32)

    if quantize_bias:
        pack_inputs = ["W", "b"]
        pack_outputs = ["W_int8", "b_int32"]
        bias_blob = "b_int32"
    else:
        pack_inputs = ["W"]
        pack_outputs = ["W_int8"]
        bias_blob = "b"

    workspace.RunOperatorOnce(
        core.CreateOperator(
            "Int8FCPackWeight",
            pack_inputs,
            pack_outputs,
            engine="DNNLOWP",
            save_unpacked_weights=True,
            in_scale=X_scale,
        )
    )

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(
        ["X"], ["X_int8"], Y_scale=X_scale, Y_zero_point=X_zero_point
    )
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", bias_blob],
        ["Y_int8"],
        Y_scale=Y_scale,
        Y_zero_point=Y_zero_point,
    )
    ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchBlob("Y")

    # run onnxifi net: reuse the same proto with the op types swapped
    ref_proto = ref_net.Proto()
    for position, op_type in enumerate(
        ("Int8Quantize", "Int8FC", "Int8Dequantize")
    ):
        ref_proto.op[position].type = op_type

    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", bias_blob],
    )
    num_onnxified_ops = len(
        [op for op in net_onnxified.op if op.type == "Onnxifi"]
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if np.allclose(Y_glow, Y_fbgemm):
        return

    diff_Y = np.abs(Y_glow - Y_fbgemm)
    print_test_debug_info(
        "int8_fc",
        {
            "seed": rand_seed,
            "n": n,
            "m": m,
            "k": k,
            "X": X_fp32,
            "W": W_fp32,
            "b": b_fp32,
            "Y_fbgemm": Y_fbgemm,
            "Y_glow": Y_glow,
            "diff": diff_Y,
            "maxdiff": diff_Y.max(axis=1),
        },
    )
    assert False
def test_int8_quantize(self, n, rand_seed):
    """Exercise int8 quantization through an identity-weight, zero-bias FC.

    ``W`` is the ``n x n`` identity and ``b`` is zero, and the FC output uses
    the same scale and zero point as the quantized input. The reference net
    (``*NNPI`` operator variants) is run first; its operator types are then
    rewritten in place to the plain int8 operators and the proto is run via
    ``onnxifi_caffe2_net``. The test fails when more than 10% of the output
    elements differ between the two runs.

    Args:
        n: Side length of the square input and weight matrices.
        rand_seed: Seed for NumPy's global RNG.
    """
    print("n={}, rand_seed={}".format(n, rand_seed))
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    # Round the input through float16 before storing it as float32.
    X_fp32 = np.random.rand(n, n).astype(np.float16).astype(np.float32)
    W_fp32 = np.identity(n, dtype=np.float32)
    b_fp32 = np.zeros((n,), dtype=np.float32)

    X_scale, X_zero_point = self._get_scale_zp(X_fp32)

    for blob_name, blob_value in (("X", X_fp32), ("W", W_fp32), ("b", b_fp32)):
        workspace.FeedBlob(blob_name, blob_value)

    pack_op = core.CreateOperator(
        "Int8FCPackWeight",
        ["W"],
        ["W_int8"],
        engine="DNNLOWP",
        save_unpacked_weights=True,
        in_scale=X_scale,
    )
    workspace.RunOperatorOnce(pack_op)

    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(
        ["X"], ["X_int8"], Y_scale=X_scale, Y_zero_point=X_zero_point
    )
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"],
        ["Y_int8"],
        Y_scale=X_scale,
        Y_zero_point=X_zero_point,
    )
    ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchBlob("Y")

    # run onnxifi net: reuse the same proto with the op types swapped
    ref_proto = ref_net.Proto()
    for position, op_type in enumerate(
        ("Int8Quantize", "Int8FC", "Int8Dequantize")
    ):
        ref_proto.op[position].type = op_type

    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b"],
    )
    num_onnxified_ops = len(
        [op for op in net_onnxified.op if op.type == "Onnxifi"]
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    diff = Y_fbgemm - Y_glow
    # Tolerate mismatches in up to 10% of the elements.
    within_tolerance = not (np.count_nonzero(diff) * 10 > diff.size)
    if within_tolerance:
        return

    print_test_debug_info(
        "int8_fc",
        {
            "seed": rand_seed,
            "n": n,
            "X": X_fp32,
            "W": W_fp32,
            "b": b_fp32,
            "Y_fbgemm": Y_fbgemm,
            "Y_glow": Y_glow,
            "diff": diff,
            "maxdiff": diff.max(axis=1),
        },
    )
    assert False
def Skip_test_tanhquantize(self, scale, zp, size, rand_seed):
    """Compare Tanh + Int8Quantize run through onnxifi with the fused fake op.

    The prediction net chains ``Tanh`` and ``Int8Quantize`` and is transformed
    with ``onnxifi_caffe2_net``; the reference net is the single
    ``TanhQuantFakeFp16NNPI`` operator. Both write the int8 blob ``Y_q``, which
    must match exactly in data, scale and zero point.

    Change from the previous version: the prediction net is named ``"pred"``.
    Both nets used to be named ``"ref"``, so registering the reference net in
    the workspace collided with the already-registered onnxified net.

    Args:
        scale: Output scale passed to the quantizing operators.
        zp: Output zero point passed to the quantizing operators.
        size: Number of samples taken from ``linspace(-1, 1)``.
        rand_seed: Seed for NumPy's global RNG.
    """
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    # Net under test: Tanh -> Int8Quantize.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.append("X")
    pred_net.external_output.append("Y_q")
    pred_net.op.add().CopyFrom(core.CreateOperator("Tanh", ["X"], ["Y"]))
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
        )
    )

    # Round the samples through float16 before storing them as float32.
    X = np.linspace(-1, 1, size).astype(np.float16).astype(np.float32)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": X.shape},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.CreateNet(pred_net_onnxified)
    workspace.RunNet(pred_net_onnxified.name)
    # Fetch now: the reference net below overwrites the same blob.
    Y_glow = workspace.FetchInt8Blob("Y_q")

    # Reference net: one fused operator producing Y_q directly.
    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.append("X")
    ref_net.external_output.append("Y_q")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "TanhQuantFakeFp16NNPI",
            ["X"],
            ["Y_q"],
            Y_scale=scale,
            Y_zero_point=zp,
        )
    )

    workspace.CreateNet(ref_net)
    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchInt8Blob("Y_q")

    if not np.array_equal(Y_ref.data, Y_glow.data) or \
            not Y_ref.scale == Y_glow.scale or \
            not Y_ref.zero_point == Y_glow.zero_point:
        print_test_debug_info(
            "tanhfusion",
            {
                "scale": scale,
                "zp": zp,
                "input": X,
                "ideal nonquant": np.tanh(X),
                "Y_glow": Y_glow,
                "Y_c2": Y_ref,
            },
        )
        assert False
def _test_binary_op_graph(self, name):
    """Compare binary operator ``name`` via onnxifi with ``name + "FakeFp16"``.

    Operand shapes are random with a leading dimension of 1. After both nets
    are created, ten fresh pairs of operands are fed and the output blobs
    ``C`` (onnxified) and ``C_ref`` (reference) are compared with
    ``np.allclose`` on every iteration.

    Args:
        name: Operator type of the binary op under test; the reference
            operator type is formed by appending ``"FakeFp16"``.
    """
    # First dimension is the batch size
    dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))

    def _random_operand():
        # Uniform float32 tensor in [-100, 100) with the shared shape.
        return np.random.uniform(low=-100.0, high=100.0, size=dims).astype(
            np.float32
        )

    A = _random_operand()
    B = _random_operand()
    print(A.shape, B.shape)

    # Net under test.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["A", "B"])
    pred_net.external_output.append("C")
    pred_net.op.add().CopyFrom(core.CreateOperator(name, ["A", "B"], ["C"]))

    # Reference net writes to a separate blob so both results can coexist.
    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "ref"
    pred_net_ref.external_input.extend(["A", "B"])
    pred_net_ref.external_output.append("C_ref")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(name + "FakeFp16", ["A", "B"], ["C_ref"])
    )

    shape_hints = {"A": A.shape, "B": B.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net, shape_hints, debug=True, adjust_batch=True, use_onnx=False
    )
    print(pred_net_onnxified)
    num_onnxified_ops = len(
        [op for op in pred_net_onnxified.op if op.type == "Onnxifi"]
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("A", A)
    workspace.FeedBlob("B", B)

    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    num_iterations = 10
    for _ in range(num_iterations):
        A = _random_operand()
        B = _random_operand()
        workspace.FeedBlob("A", A)
        workspace.FeedBlob("B", B)

        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("C_ref")

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("C")

        # Results should be identical since we are comparing with the C2 emulation
        if np.allclose(Y_c2, Y_glow):
            continue

        diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
        print_test_debug_info(
            name,
            {
                "dims": dims,
                "A": A,
                "B": B,
                "Y_glow": Y_glow,
                "Y_c2": Y_c2,
                "diff": diff,
            },
        )
        assert False
def test_batch_matmul(self, M, K, N, trans_a, trans_b, run_ints, gc, dc):
    """Compare BatchMatMul via onnxifi with ``BatchMatMulFP16Acc16Fake``.

    With ``run_ints`` the operands are small integers stored as float32 with a
    leading dimension of 1, and the outputs must satisfy ``np.allclose``.
    Otherwise the operands are random floats, and the test fails when more
    than 10% of the per-row maximum relative differences exceed
    ``GLOW_MATMUL_RTOL``.

    Args:
        M: Rows of the left operand (before any transpose).
        K: Shared inner dimension.
        N: Columns of the right operand (before any transpose).
        trans_a: Whether the left operand is fed transposed; also passed to
            the operators.
        trans_b: Whether the right operand is fed transposed; also passed to
            the operators.
        run_ints: Selects integer-valued operands and the exact comparison.
        gc: Unused in this method.
        dc: Unused in this method.
    """
    workspace.ResetWorkspace()

    C = 0  # TODO
    batch_dims = np.random.randint(low=1, high=3, size=C, dtype=np.int64).tolist()

    # Left operand (drawn before the right one to keep the RNG sequence).
    if run_ints:
        X = np.random.randint(low=1, high=3, size=(1, M, K)).astype(np.float32)
    else:
        left_shape = batch_dims + [M, K]
        X = 100 * (np.random.rand(*left_shape).astype(np.float32) - 0.5)
    if trans_a:
        X = X.swapaxes(-1, -2)

    # Right operand.
    if run_ints:
        Y = np.random.randint(low=1, high=3, size=(1, K, N)).astype(np.float32)
    else:
        right_shape = batch_dims + [K, N]
        Y = 100 * (np.random.rand(*right_shape).astype(np.float32) - 0.5)
    if trans_b:
        Y = Y.swapaxes(-1, -2)

    # Net under test.
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "Y"])
    pred_net.external_output.append("out")
    matmul_op = core.CreateOperator(
        'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b
    )
    pred_net.op.add().CopyFrom(matmul_op)

    # Reference net.
    pred_net_ref = core.Net("pred_net_ref")
    pred_net_ref.BatchMatMulFP16Acc16Fake(
        ["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b
    )

    print("dims", batch_dims, X.shape, Y.shape)
    shape_hints = {"X": X.shape, "Y": Y.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net, shape_hints, debug=True, adjust_batch=False, use_onnx=False
    )
    num_onnxified_ops = len(
        [op for op in pred_net_onnxified.op if op.type == "Onnxifi"]
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    # Run Glow net
    workspace.RunNet(pred_net_onnxified.name)
    out_glow = workspace.FetchBlob('out')

    # Run caffe2 net
    workspace.RunNet(pred_net_ref)
    out_c2_fakefp16 = workspace.FetchBlob('out')

    diff = np.abs((out_c2_fakefp16 - out_glow) / (out_c2_fakefp16 + 1e-8))
    rowdiff = np.max(diff, axis=1)

    if run_ints:
        success = bool(np.allclose(out_glow, out_c2_fakefp16))
    else:
        # Find the max difference per row, if more than 10% of the rows
        # are bigger, consider it a failure.
        n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
        success = not (n_offenders * 10 > rowdiff.shape[0])

    if success:
        return

    print_test_debug_info(
        "bmm",
        {
            "m": M,
            "k": K,
            "n": N,
            "X": X,
            "Y": Y,
            "out_glow": out_glow,
            "out_c2_fakefp16": out_c2_fakefp16,
            "diff": diff,
        },
    )
    assert False