def test_get_ort_device_type_exc_2(self):
    """Check ``get_ort_device_type`` on valid devices and on invalid inputs.

    A device built from ``'cpu'`` must map to type ``0`` and one built from
    ``'cuda'`` to type ``1``; an empty string must raise ``ValueError`` and
    an integer must raise ``TypeError``.
    """
    # Valid devices: (name given to get_ort_device, expected device type).
    for device_name, expected_type in (('cpu', 0), ('cuda', 1)):
        device = get_ort_device(device_name)
        self.assertEqual(get_ort_device_type(device), expected_type)

    # Invalid inputs: (value, exception expected from get_ort_device_type).
    # NOTE(review): ``assertRaise(fct, exc)`` is not a stdlib unittest method;
    # it is presumably supplied by the test base class -- confirm.
    for bad_value, expected_error in (('', ValueError), (0, TypeError)):
        self.assertRaise(
            lambda value=bad_value: get_ort_device_type(value),
            expected_error)
def __init__(self, model_onnx, weights_to_train, loss_output_name='loss',
             max_iter=100, training_optimizer_name='SGDOptimizer',
             batch_size=10, eta0=0.01, alpha=0.0001, power_t=0.25,
             learning_rate='invscaling', device='cpu', verbose=0):
    """Store the training configuration on the instance.

    Every argument is kept under an attribute of the same name. Two of them
    are normalised first: ``learning_rate`` is lower-cased and ``device`` is
    converted with ``get_ort_device``.

    The hyper-parameter names (``eta0``, ``alpha``, ``power_t``,
    ``learning_rate``) follow scikit-learn's ``SGDRegressor``, see
    https://scikit-learn.org/stable/modules/generated/
    sklearn.linear_model.SGDRegressor.html
    """
    # Model and the pieces of it involved in training.
    self.model_onnx = model_onnx
    self.weights_to_train = weights_to_train
    self.loss_output_name = loss_output_name
    self.training_optimizer_name = training_optimizer_name

    # Iteration control and reporting.
    self.max_iter = max_iter
    self.batch_size = batch_size
    self.verbose = verbose

    # Learning-rate schedule hyper-parameters.
    self.eta0 = eta0
    self.alpha = alpha
    self.power_t = power_t

    # Normalised values; these two calls may raise, so they stay last.
    self.learning_rate = learning_rate.lower()
    self.device = get_ort_device(device)
def __init__(self, X, y, batch_size=20, device='cpu'):
    """Keep contiguous copies of the features and targets plus batch settings.

    :param X: array whose first dimension is the number of rows
    :param y: array with as many rows as *X*; a one-dimensional *y* is
        reshaped into a single column
    :param batch_size: stored unchanged as ``self.batch_size``
    :param device: device description converted with ``get_ort_device``
    :raises ValueError: if *X* and *y* do not have the same number of rows
    """
    # A flat target vector becomes an (n, 1) column.
    y = y.reshape((-1, 1)) if len(y.shape) == 1 else y

    n_rows = X.shape[0]
    if y.shape[0] != n_rows:
        raise ValueError(
            f"Shape mismatch X.shape={X.shape!r}, y.shape={y.shape!r}.")

    self.X = numpy.ascontiguousarray(X)
    self.y = numpy.ascontiguousarray(y)
    self.batch_size = batch_size
    self.device = get_ort_device(device)
def test_ort_device_to_string(self):
    """Round-trip device names through ``get_ort_device`` and back.

    Each case is either a single string, expected to come back unchanged,
    or a pair ``(input name, expected string)``.
    """
    cases = [
        'cpu',
        'cuda',
        ('gpu', 'cuda'),
        ('gpu:0', 'cuda'),
        ('cuda:0', 'cuda'),
        ('gpu:1', 'cuda:1'),
        'cuda:1',
    ]
    for value in cases:
        with self.subTest(device=value):
            # A bare string stands for the pair (value, value).
            given, expected = (
                (value, value) if isinstance(value, str) else value)
            round_trip = ort_device_to_string(get_ort_device(given))
            self.assertEqual(expected, round_trip)
def test_print_ortvalue(self):
    """Check the text produced by ``str_ortvalue`` for two CPU OrtValues.

    A small float32 matrix must be rendered in full; a 100-element int64
    vector must be abbreviated with ``'...'`` in the middle.
    """
    cpu = get_ort_device('cpu')

    # Small array: every value is shown.
    small = numpy.array([[0, 1, 4, 4.5]], dtype=numpy.float32)
    small_ort = C_OrtValue.ortvalue_from_numpy(small, cpu)
    small_text = ("device=Cpu dtype=dtype('float32') shape=(1, 4) "
                  "value=[0.0, 1.0, 4.0, 4.5]")
    self.assertEqual(small_text, str_ortvalue(small_ort))
    # The rendering is checked a second time on the same value.
    self.assertEqual(small_text, str_ortvalue(small_ort))

    # Long array: only the first and last five values are shown.
    long_values = numpy.arange(100).astype(numpy.int64)
    long_ort = C_OrtValue.ortvalue_from_numpy(long_values, cpu)
    long_text = ("device=Cpu dtype=dtype('int64') shape=(100,) "
                 "value=[0, 1, 2, 3, 4, '...', 95, 96, 97, 98, 99]")
    self.assertEqual(long_text, str_ortvalue(long_ort))
# Create a training session for `onx_train` in which the initializers named
# 'coefs' and 'intercept' are the weights to train, on the chosen device.
train_session = create_training_session(onx_train, ['coefs', 'intercept'],
                                        device=device)
print(train_session)

##########################################
# The coefficients.

state_tensors = train_session.get_state()
pprint(state_tensors)

######################################
# We can now check the coefficients are updated after one iteration.

# Every input is wrapped into an OrtValue created on `dev`: the first
# training sample, its label reshaped into a column, and the learning rate
# as a one-element float32 array.
dev = get_ort_device(device)
ortx = C_OrtValue.ortvalue_from_numpy(X_train[:1], dev)
orty = C_OrtValue.ortvalue_from_numpy(y_train[:1].reshape((-1, 1)), dev)
ortlr = C_OrtValue.ortvalue_from_numpy(
    numpy.array([0.01], dtype=numpy.float32), dev)

# `_iobinding` and `_sess` are underscore-prefixed attributes of the
# wrappers; they are used here to bind OrtValues directly by name.
bind = train_session.io_binding()._iobinding
bind.bind_ortvalue_input('X', ortx)
bind.bind_ortvalue_input('label', orty)
bind.bind_ortvalue_input('Learning_Rate', ortlr)
# The output named 'loss' is bound to the same device as the inputs.
bind.bind_output('loss', dev)
train_session._sess.run_with_iobinding(bind, None)
# Bring the bound outputs back to the CPU before printing them.
outputs = bind.copy_outputs_to_cpu()
pprint(outputs)

##########################################
def build_ort_op(op_version=14, save=None, **kwargs):  # opset=13, 14, ...
    """Build the ONNX graph ``Mul(Slice(Add(Slice(X), 1)), 2)`` and its runners.

    :param op_version: opset used for every node and for the conversion
    :param save: if not None, file name the serialized model is written to
    :param kwargs: must contain ``slices``, a pair with one entry per axis
        (axis 0, axis 1); each entry is either None (axis not sliced) or a
        sequence whose first two items are used as start and end
    :return: tuple ``(onx, ort_fct, npy_fct, ort_fct_gpu)``: the ONNX model,
        a function running it with a CPU ``InferenceSession``, the
        equivalent numpy function, and a function running it through a
        CUDA io-binding, or None when ``get_device()`` does not report GPU
    :raises AssertionError: if onnxruntime and numpy disagree on a random
        10x10 input
    """
    slices = kwargs['slices']
    first, second = slices
    slice1 = slice(*first) if first is not None else slice(0, None)
    slice2 = slice(*second) if second is not None else slice(0, None)

    # Only the axes that are really sliced are handed to the ONNX Slice nodes.
    kept_axes = [i for i in (0, 1) if slices[i] is not None]
    axes = numpy.array(kept_axes, dtype=numpy.int64)
    starts = numpy.array([slices[i][0] for i in kept_axes],
                         dtype=numpy.int64)
    ends = numpy.array([slices[i][1] for i in kept_axes], dtype=numpy.int64)

    # Graph: Slice -> Add 1 -> Slice -> Mul 2, output named 'Y'.
    sliced = OnnxSlice('X', starts, ends, axes, op_version=op_version)
    shifted = OnnxAdd(sliced, numpy.array([1], dtype=numpy.float32),
                      op_version=op_version)
    resliced = OnnxSlice(shifted, starts, ends, axes, op_version=op_version)
    scaled = OnnxMul(resliced, numpy.array([2], dtype=numpy.float32),
                     op_version=op_version, output_names=['Y'])
    onx = scaled.to_onnx(inputs=[('X', FloatTensorType([None, None]))],
                         target_opset=op_version)

    sess = InferenceSession(onx.SerializeToString(),
                            providers=["CPUExecutionProvider"])
    if save is not None:
        with open(save, "wb") as f:
            f.write(onx.SerializeToString())

    def npy_fct(x):
        # Same computation as the ONNX graph, written with numpy slicing.
        once = x[slice1, slice2] + 1
        return (once[slice1, slice2] * 2).copy()

    # Sanity check: both implementations must agree on a random input.
    rnd = numpy.random.randn(10, 10).astype(numpy.float32)
    expected = npy_fct(rnd)
    got = sess.run(None, {'X': rnd})[0]
    try:
        assert_almost_equal(expected, got)
    except AssertionError as e:
        details = (kwargs, slice1, slice2, expected.shape, got.shape,
                   rnd[slice1, slice2].shape)
        raise AssertionError(
            "kwargs=%r slice1=%r slice2=%r shapes=%r ? %r "
            "(x[slice1, slice2].shape)=%r" % details) from e

    run_gpu = None
    if get_device().upper() == 'GPU':
        sessg = InferenceSession(onx.SerializeToString(),
                                 providers=["CUDAExecutionProvider"])
        io_binding = sessg.io_binding()._iobinding
        device = get_ort_device('cuda:0')

        def run_gpu(x):
            # x must expose ``shape()`` and ``data_ptr()``
            # (presumably a C_OrtValue already on the GPU -- confirm).
            io_binding.bind_input('X', device, numpy.float32,
                                  x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            return sessg._sess.run_with_iobinding(io_binding, None)

    return onx, lambda x: sess.run(None, {'X': x}), npy_fct, run_gpu
def _timed_observation(label, fct, arrays, ctx, dim, shape, slices,
                       repeat, number):
    """Time ``loop_fct(fct, arrays)`` and return the tagged observation."""
    ctx['fct'] = fct
    obs = measure_time(lambda: loop_fct(fct, arrays),
                       div_by_number=True, context=ctx,
                       repeat=repeat, number=number)
    obs['dim'] = dim
    obs['fct'] = label
    obs['shape'] = ",".join(map(str, shape))
    obs['slices'] = str(slices)
    # NOTE(review): this update replaces the string stored under 'shape'
    # just above with the raw `shape` object; kept as is because the
    # returned dataframes expose that column -- confirm it is intended.
    obs.update(dict(shape=shape))
    return obs


def _profile_to_dataframe(sess, shape, slices):
    """Stop profiling on *sess* and load the trace it wrote as a DataFrame."""
    prof = sess.end_profiling()
    with open(prof, "r", encoding="utf-8") as f:
        js = json.load(f)
    dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
    dfprof['shape'] = ",".join(map(str, shape))
    dfprof['slices'] = str(slices)
    return dfprof


def benchmark_op(repeat=10, number=10, name="Slice", shape_slice_fct=None,
                 save=None, opset=14, repeat_profile=1500, verbose=1):
    """Benchmark numpy against onnxruntime (CPU, and GPU when available).

    For each dimension of a fixed list, ``shape_slice_fct(dim)`` gives the
    input shape and the slices, ``build_ort_op`` builds the model and the
    functions to compare, and each function is timed with ``measure_time``.
    The configuration of the last dimension is then profiled with
    onnxruntime and the timings are plotted.

    :param repeat: forwarded to ``measure_time``
    :param number: forwarded to ``measure_time``
    :param name: operator name used in the plot titles
    :param shape_slice_fct: callable ``dim -> (shape, slices)``, required
    :param save: forwarded to ``build_ort_op``
    :param opset: forwarded to ``build_ort_op`` as ``op_version``
    :param repeat_profile: number of runs executed while profiling
    :param verbose: print progress messages when truthy
    :return: tuple ``(dfprof, dfprofgpu, df, rs, ax)``: CPU profile,
        GPU profile or None, raw measures, pivoted timings with the
        ``numpy/...`` ratio columns, and the matplotlib axes
    :raises TypeError: if *shape_slice_fct* is None
    """
    if shape_slice_fct is None:
        # Fail early with a readable message instead of
        # "'NoneType' object is not callable" inside the loop.
        raise TypeError(
            "shape_slice_fct must be a callable returning (shape, slices) "
            "for a given dimension.")
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))

    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 600, 784, 800,
                     1000, 1024, 1200]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(
            save=save, op_version=opset, slices=slices)

        # Fewer arrays for the larger dimensions.
        n_arrays = 10 if dim >= 512 else 20
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        res.append(_timed_observation(
            'numpy', npy_fct, xs, ctx, dim, shape, slices, repeat, number))

        # onnxruntime
        res.append(_timed_observation(
            'ort', ort_fct, xs, ctx, dim, shape, slices, repeat, number))

        if ort_fct_gpu is not None:
            # onnxruntime on GPU: inputs are copied to the device beforehand.
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [C_OrtValue.ortvalue_from_numpy(x, dev) for x in xs]
            res.append(_timed_observation(
                'ort_gpu', ort_fct_gpu, ctx['xs'], ctx, dim, shape, slices,
                repeat, number))

    # profiling CPU (model, inputs and shape of the last dimension)
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(), so,
                            providers=["CPUExecutionProvider"])
    feeds = {'X': xs[-1]}
    for _ in range(repeat_profile):
        sess.run(None, feeds)
    dfprof = _profile_to_dataframe(sess, shape, slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(), so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        # ctx['xs'] holds OrtValues created on 'cuda:0', so their data
        # pointers must be bound with that device (as build_ort_op does),
        # not with a CPU device.
        device = get_ort_device('cuda:0')
        x = ctx['xs'][-1]
        for _ in range(repeat_profile):
            io_binding.bind_input('X', device, numpy.float32,
                                  x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)
        dfprofgpu = _profile_to_dataframe(sess, shape, slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # Dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    # Keyword arguments: positional ones are rejected by recent pandas.
    piv = df.pivot(index='shape', columns='fct', values='average')

    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs[f"numpy/{c}"] = rs['numpy'] / rs[c]
    # Among the columns built above, only 'numpy/numpy' contains "/numpy".
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(
        logx=True, logy=True, ax=ax[1],
        title=f"{name} Speedup, baseline=numpy\n{shape_name!r} higher better")
    # Reference lines at x0.5 and x2.
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return dfprof, dfprofgpu, df, rs, ax
################################# # With onnxruntime. sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"]) y_cpu = sess.run(None, {'X': x})[0] ####################################### # Execution on GPU # ++++++++++++++++ # # If available... if get_device().upper() == 'GPU': dev = get_ort_device('cuda:0') try: gx = C_OrtValue.ortvalue_from_numpy(x, dev) cuda = True except RuntimeError as e: print(e) cuda = False else: cuda = False if cuda: sessg = InferenceSession(onx.SerializeToString(), providers=["CUDAExecutionProvider"]) io_binding = sessg.io_binding()._iobinding io_binding.bind_input('X', dev, numpy.float32, gx.shape(), gx.data_ptr())