class AbsoluteLearningLoss(BaseLearningLoss):
    """
    Implements an absolute loss :math:`|Y - Z|`
    where *Y* is the output and *Z* the expected output.
    See @see fn _onnx_grad_loss_absolute_error for the ONNX
    implementation.
    """

    def __init__(self):
        BaseLearningLoss.__init__(self)

    def build_onnx_function(self, opset, device, weight_name):
        so = SessionOptions()
        so.log_severity_level = 4

        # loss_grad
        self.loss_grad_onnx_ = function_onnx_graph(
            "grad_loss_absolute_error", target_opset=opset,
            weight_name=weight_name)
        self.loss_grad_sess_ = InferenceSession(
            self.loss_grad_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.loss_grad_sess_bind_ = (
            self.loss_grad_sess_.io_binding()._iobinding)

        # score
        self.build_onnx_score_function(opset, device, weight_name)
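# A minimal numpy sketch of what the ONNX graph above computes: the
# (optionally weighted) sum of |Y - Z| and its gradient with respect
# to the prediction. The helper name is illustrative, not part of the
# library API.
import numpy


def absolute_loss_grad(expected, predicted, weight=None):
    "Returns sum(w * |predicted - expected|) and d(loss)/d(predicted)."
    diff = predicted - expected
    loss = numpy.abs(diff)
    grad = numpy.sign(diff)
    if weight is not None:
        loss = loss * weight
        grad = grad * weight
    return loss.sum(), grad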
class NegLogLearningLoss(BaseLearningLoss):
    """
    Implements a negative log loss
    :math:`loss(y_t, y_p) = -(1 - y_t) \\log(1 - y_p) - y_t \\log(y_p)`.
    This only works for binary classification where *yp* is the
    predicted probability and *yt* is the expected probability.
    *yt* is expected to be binary, *yp* is a matrix with two columns,
    every row summing to 1. However, this loss is usually applied
    after a softmax function and the gradient is directly computed
    from the loss to the raw score, before it goes through the
    softmax function (see class `Log
    <https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/
    linear_model/_sgd_fast.pyx#L236>`_).

    :param eps: clipping value for probabilities,
        avoids computing `log(0)`
    :param probability_function: function to convert raw scores into
        probabilities, default value is `sigmoid` for a logistic
        regression
    """

    def __init__(self, eps=1e-5, probability_function='sigmoid'):
        BaseLearningLoss.__init__(self)
        self.eps = eps
        self.probability_function = probability_function

    def build_onnx_function(self, opset, device, weight_name):
        so = SessionOptions()
        so.log_severity_level = 4

        # loss_grad
        fct_name = f"grad_{self.probability_function}_neg_log_loss_error"
        self.loss_grad_onnx_ = function_onnx_graph(
            fct_name, target_opset=opset,
            weight_name=weight_name, eps=self.eps)
        self.loss_grad_sess_ = InferenceSession(
            self.loss_grad_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.loss_grad_sess_bind_ = (
            self.loss_grad_sess_.io_binding()._iobinding)

        # score
        self.build_onnx_score_function(opset, device, weight_name)
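# A minimal numpy sketch of the sigmoid variant, assuming the gradient
# is taken with respect to the raw score (before the sigmoid), in which
# case it simplifies to sigmoid(score) - yt. The helper name is
# illustrative, not part of the library API.
import numpy


def sigmoid_neg_log_loss_grad(yt, score, eps=1e-5):
    """Loss = -(1-yt)log(1-p) - yt log(p) with p = sigmoid(score);
    the gradient is taken with respect to the raw score."""
    p = 1. / (1. + numpy.exp(-score))
    p = numpy.clip(p, eps, 1 - eps)  # avoids computing log(0)
    loss = -(1 - yt) * numpy.log(1 - p) - yt * numpy.log(p)
    grad = p - yt  # d(loss)/d(score)
    return loss.sum(), grad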
class ElasticLearningLoss(BaseLearningLoss):
    """
    Implements an elastic loss
    :math:`(Y - Z)^2 \\alpha + |Y - Z| \\beta`
    where *Y* is the output and *Z* the expected output,
    :math:`\\alpha` is *l2_weight* and :math:`\\beta` is *l1_weight*.

    :param l1_weight: weight of L1 norm
    :param l2_weight: weight of L2 norm

    See @see fn _onnx_grad_loss_elastic_error for the ONNX
    implementation.
    """

    def __init__(self, l1_weight=0.5, l2_weight=0.5):
        BaseLearningLoss.__init__(self)
        self.l1_weight = l1_weight
        self.l2_weight = l2_weight

    def build_onnx_function(self, opset, device, weight_name):
        so = SessionOptions()
        so.log_severity_level = 4

        # loss_grad
        self.loss_grad_onnx_ = function_onnx_graph(
            "grad_loss_elastic_error", target_opset=opset,
            weight_name=weight_name,
            l1_weight=self.l1_weight,
            l2_weight=self.l2_weight)
        self.loss_grad_sess_ = InferenceSession(
            self.loss_grad_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.loss_grad_sess_bind_ = (
            self.loss_grad_sess_.io_binding()._iobinding)

        # score
        self.build_onnx_score_function(opset, device, weight_name)
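# A minimal numpy sketch of the elastic loss and its gradient with
# respect to the prediction, following the convention of the docstring
# above. The helper name is illustrative, not part of the library API.
import numpy


def elastic_loss_grad(expected, predicted, l1_weight=0.5, l2_weight=0.5):
    "Loss = l2 * (Y - Z)^2 + l1 * |Y - Z|, gradient w.r.t. predicted."
    diff = predicted - expected
    loss = l2_weight * diff ** 2 + l1_weight * numpy.abs(diff)
    grad = l2_weight * 2 * diff + l1_weight * numpy.sign(diff)
    return loss.sum(), grad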
def build_ort_op(op_version=14, save=None, **kwargs):  # opset=13, 14, ...
    slices = kwargs['slices']
    slice1, slice2 = slices
    slice1 = slice(0, None) if slice1 is None else slice(*slice1)
    slice2 = slice(0, None) if slice2 is None else slice(*slice2)

    axes = []
    starts = []
    ends = []
    for i in [0, 1]:
        if slices[i] is None:
            continue
        axes.append(i)
        starts.append(slices[i][0])
        ends.append(slices[i][1])
    starts = numpy.array(starts, dtype=numpy.int64)
    ends = numpy.array(ends, dtype=numpy.int64)
    axes = numpy.array(axes, dtype=numpy.int64)

    node1 = OnnxSlice('X', starts, ends, axes, op_version=op_version)
    node2 = OnnxAdd(node1, numpy.array([1], dtype=numpy.float32),
                    op_version=op_version)
    node3 = OnnxSlice(node2, starts, ends, axes,
                      op_version=op_version)
    node4 = OnnxMul(node3, numpy.array([2], dtype=numpy.float32),
                    op_version=op_version, output_names=['Y'])
    onx = node4.to_onnx(inputs=[('X', FloatTensorType([None, None]))],
                        target_opset=op_version)

    sess = InferenceSession(onx.SerializeToString(),
                            providers=["CPUExecutionProvider"])
    if save is not None:
        with open(save, "wb") as f:
            f.write(onx.SerializeToString())

    def npy_fct(x):
        return ((x[slice1, slice2] + 1)[slice1, slice2] * 2).copy()

    rnd = numpy.random.randn(10, 10).astype(numpy.float32)
    expected = npy_fct(rnd)
    got = sess.run(None, {'X': rnd})[0]
    try:
        assert_almost_equal(expected, got)
    except AssertionError as e:
        raise AssertionError(
            "kwargs=%r slice1=%r slice2=%r shapes=%r ? %r "
            "(x[slice1, slice2].shape)=%r" % (
                kwargs, slice1, slice2, expected.shape, got.shape,
                rnd[slice1, slice2].shape)) from e

    if get_device().upper() == 'GPU':
        sessg = InferenceSession(onx.SerializeToString(),
                                 providers=["CUDAExecutionProvider"])
        io_binding = sessg.io_binding()._iobinding
        device = get_ort_device('cuda:0')

        def run_gpu(x):
            io_binding.bind_input(
                'X', device, numpy.float32, x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            return sessg._sess.run_with_iobinding(io_binding, None)

        return onx, lambda x: sess.run(None, {'X': x}), npy_fct, run_gpu
    else:
        return onx, lambda x: sess.run(None, {'X': x}), npy_fct, None
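# Example usage (a sketch, assuming a CPU-only run): build the graph
# computing (X[0:1, :] + 1)[0:1, :] * 2 and compare the onnxruntime
# output to the numpy reference returned alongside it.
import numpy
from numpy.testing import assert_almost_equal

onx, ort_fct, npy_fct, run_gpu = build_ort_op(
    op_version=14, slices=((0, 1), None))
x = numpy.random.randn(4, 4).astype(numpy.float32)
assert_almost_equal(npy_fct(x), ort_fct(x)[0])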
class BaseLearningLoss(BaseLearningOnnx):
    """
    Class handling the loss for class
    @see cl OrtGradientForwardBackwardOptimizer.
    All classes inheriting from this one create one ONNX function,
    returning the loss and the gradient of the loss against the
    outputs. Method `loss_gradient` is the main method, it computes
    the loss and the gradient defined by one ONNX graph and executed
    by an instance of :epkg:`InferenceSession`.
    """

    def __init__(self):
        BaseLearningOnnx.__init__(self)
        self.ro_ = RunOptions()

    def build_onnx_score_function(self, opset, device, weight_name):
        """
        Assuming the loss function was created, this one takes the
        onnx graph and generates the onnx graph for the method
        `loss_score`.
        """
        if not hasattr(self, 'loss_grad_onnx_'):
            raise RuntimeError(  # pragma: no cover
                "Missing attribute 'loss_grad_onnx_'. "
                "Method 'build_onnx_function' should be called first.")

        # score
        so = SessionOptions()
        so.log_severity_level = 4
        self.loss_score_onnx_ = unreduced_onnx_loss(
            self.loss_grad_onnx_, 'Y')  # pylint: disable=E1101
        self.loss_score_sess_ = InferenceSession(
            self.loss_score_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.loss_score_sess_bind_ = (
            self.loss_score_sess_.io_binding()._iobinding)

    def _call_iobinding(self, sess, bind):
        sess.run_with_iobinding(bind, self.ro_)

    def loss_gradient(  # pylint: disable=E1101
            self, device, expected, predicted, weight=None):
        """
        Returns the loss and the gradient as OrtValue.

        :param device: device where the training takes place
        :param expected: expected value
        :param predicted: predicted value
        :param weight: optional, training weights
            (same dimension as expected and predicted tensors)
        :return: loss and gradient
        """
        if (not hasattr(self, "loss_grad_sess_") or
                not hasattr(self, "loss_grad_sess_bind_")):
            raise RuntimeError(  # pragma: no cover
                "Attributes 'loss_grad_sess_bind_' or 'loss_grad_sess_' "
                "is missing. Method 'build_onnx_function' has not been "
                "called.")
        bind = self.loss_grad_sess_bind_
        if weight is not None:
            self._bind_input_ortvalue(
                "weight", bind, weight, device, cache=True)
        else:
            self.clear_binding_inputs("weight", bind, cache=True)
        self._bind_input_ortvalue("X1", bind, expected, device, cache=True)
        self._bind_input_ortvalue("X2", bind, predicted, device, cache=True)
        self.loss_grad_sess_bind_.bind_output('Y', device)
        self.loss_grad_sess_bind_.bind_output('Y_grad', device)
        self._call_iobinding(self.loss_grad_sess_._sess, bind)
        loss, grad = bind.get_outputs()
        return loss, grad

    def loss_scores(  # pylint: disable=E1101
            self, device, expected, predicted, weight=None):
        """
        Returns the weighted loss (or score) for every observation
        as OrtValue.

        :param device: device where the training takes place
        :param expected: expected value
        :param predicted: predicted value
        :param weight: optional, training weights
            (same dimension as expected and predicted tensors)
        :return: a score for every observation
        """
        if (not hasattr(self, "loss_score_sess_") or
                not hasattr(self, "loss_score_sess_bind_")):
            raise RuntimeError(  # pragma: no cover
                "Attributes 'loss_score_sess_bind_' or 'loss_score_sess_' "
                "is missing. Method 'build_onnx_function' has not been "
                "called.")
        bind = self.loss_score_sess_bind_
        if weight is not None:
            self._bind_input_ortvalue(
                "weight", bind, weight, device, cache=True)
        else:
            self.clear_binding_inputs("weight", bind, cache=True)
        self._bind_input_ortvalue("X1", bind, expected, device, cache=True)
        self._bind_input_ortvalue("X2", bind, predicted, device, cache=True)
        self.loss_score_sess_bind_.bind_output('Y', device)
        self._call_iobinding(self.loss_score_sess_._sess, bind)
        score = bind.get_outputs()
        return score[0]

    @staticmethod
    def select(class_name, **kwargs):
        """
        Returns an instance of a given class initialized with *kwargs*.

        :param class_name: an instance of @see cl BaseLearningLoss
            or a string among the following class names (see below)
        :return: instance of @see cl BaseLearningLoss

        Possible values for *class_name*:

        * `'square_error'`: see @see cl SquareLearningLoss
        * `'absolute_error'`: see @see cl AbsoluteLearningLoss
        * `'elastic_error'`: see @see cl ElasticLearningLoss
        * `'log'`: see @see cl NegLogLearningLoss
        """
        if isinstance(class_name, BaseLearningLoss):
            return class_name
        cls = {
            SquareLearningLoss: ['square_error', 'square'],
            AbsoluteLearningLoss: ['absolute_error', 'absolute'],
            ElasticLearningLoss: ['elastic_error', 'elastic'],
            NegLogLearningLoss: ['log', 'neglog', 'logloss'],
        }
        for cl, aliases in cls.items():
            if class_name == cl.__name__ or class_name in aliases:
                return cl(**kwargs)
        raise ValueError(  # pragma: no cover
            "Unexpected class name %r. It should be one of %r." % (
                class_name, list(map(lambda c: c.__name__, cls))))
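# Example usage of `select`: instantiating a loss by class name or by
# one of the aliases listed in the mapping above, forwarding keyword
# arguments to the constructor.
loss = BaseLearningLoss.select('elastic', l1_weight=0.1, l2_weight=0.9)
logloss = BaseLearningLoss.select('neglog', eps=1e-6)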
def benchmark_op(repeat=10, number=10, name="Slice",
                 shape_slice_fct=None, save=None, opset=14,
                 repeat_profile=1500, verbose=1):
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200, 256, 400, 512,
                     600, 784, 800, 1000, 1024, 1200]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(
            save=save, op_version=opset, slices=slices)

        n_arrays = 20
        if dim >= 512:
            n_arrays = 10
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape=shape)

        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        ctx['fct'] = npy_fct
        obs = measure_time(
            lambda: loop_fct(npy_fct, xs),
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(
            lambda: loop_fct(ort_fct, xs),
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        if ort_fct_gpu is not None:
            # onnxruntime on GPU
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [C_OrtValue.ortvalue_from_numpy(x, dev) for x in xs]
            ctx['fct'] = ort_fct_gpu
            obs = measure_time(
                lambda: loop_fct(ort_fct_gpu, ctx['xs']),
                div_by_number=True, context=ctx, repeat=repeat,
                number=number)
            obs['dim'] = dim
            obs['fct'] = 'ort_gpu'
            obs['shape'] = ",".join(map(str, shape))
            obs['slices'] = str(slices)
            obs.update(info)
            res.append(obs)

    # profiling CPU
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(), so,
                            providers=["CPUExecutionProvider"])
    for i in range(0, repeat_profile):
        sess.run(None, {'X': xs[-1]})
    prof = sess.end_profiling()
    with open(prof, "r") as f:
        js = json.load(f)
    dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
    dfprof['shape'] = ",".join(map(str, shape))
    dfprof['slices'] = str(slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(), so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        device = get_ort_device('cpu')
        for i in range(0, repeat_profile):
            x = ctx['xs'][-1]
            io_binding.bind_input(
                'X', device, numpy.float32, x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)
        prof = sess.end_profiling()
        with open(prof, "r") as f:
            js = json.load(f)
        dfprofgpu = DataFrame(OnnxWholeSession.process_profiling(js))
        dfprofgpu['shape'] = ",".join(map(str, shape))
        dfprofgpu['slices'] = str(slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # dataframes
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    piv = df.pivot(index='shape', columns='fct', values='average')

    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs[f"numpy/{c}"] = rs['numpy'] / rs[c]
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs.
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title=f"{name} benchmark\n{shape_name!r} lower better")
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title=f"{name} Speedup, baseline=numpy\n"
                  f"{shape_name!r} higher better")
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})

    return dfprof, dfprofgpu, df, rs, ax
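# Example usage (a sketch, assuming the helpers used inside
# benchmark_op, such as loop_fct and measure_time, are defined as in
# the surrounding script): benchmark a slice over rows [1:-1] only.
def shape_slice_fct(dim):
    # square matrices, slicing axis 0 only
    return (dim, dim), ((1, -1), None)


dfprof, dfprofgpu, df, rs, ax = benchmark_op(
    repeat=2, number=2, name="Slice", shape_slice_fct=shape_slice_fct,
    opset=14, repeat_profile=100, verbose=0)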
class ElasticLearningPenalty(BaseLearningPenalty):
    """
    Implements an L1 and L2 (elastic) regularization on weights.

    :param l1: weight of the L1 penalty
    :param l2: weight of the L2 penalty
    """

    def __init__(self, l1=0.5, l2=0.5):
        BaseLearningPenalty.__init__(self)
        self.l1 = l1
        self.l2 = l2

    def build_onnx_function(self, opset, device, n_tensors):
        so = SessionOptions()
        so.log_severity_level = 4

        # loss_grad
        self.penalty_onnx_ = function_onnx_graph(
            "n_penalty_elastic_error", target_opset=opset,
            n_tensors=n_tensors, loss_shape=None,
            l1_weight=self.l1, l2_weight=self.l2)
        self.penalty_sess_ = InferenceSession(
            self.penalty_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.penalty_sess_bind_ = (
            self.penalty_sess_.io_binding()._iobinding)
        self.names_ = [i.name for i in self.penalty_onnx_.graph.input]

        # weight updates
        self.penalty_grad_onnx_ = function_onnx_graph(
            "update_penalty_elastic_error", target_opset=opset,
            l1=self.l1, l2=self.l2)
        self.penalty_grad_sess_ = InferenceSession(
            self.penalty_grad_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.penalty_grad_sess_binds_ = [
            self.penalty_grad_sess_.io_binding()._iobinding
            for n in range(n_tensors)]

    def penalty_loss(self, device, *inputs):
        """
        Computes the penalty associated with every weight and adds
        them up to the loss.

        :param device: device where the training takes place
        :param inputs: loss without penalty and weights
        :return: loss + penalties
        """
        if (not hasattr(self, "penalty_onnx_") or
                not hasattr(self, "penalty_sess_bind_")):
            raise RuntimeError(  # pragma: no cover
                "Attributes 'penalty_sess_bind_' or 'penalty_onnx_' is "
                "missing. Method 'build_onnx_function' has not been "
                "called.")
        if len(self.names_) != len(inputs):
            raise RuntimeError(  # pragma: no cover
                f"Mismatched number of inputs: "
                f"{len(self.names_)} != {len(inputs)}.")

        for name, inp in zip(self.names_, inputs):
            self._bind_input_ortvalue(
                name, self.penalty_sess_bind_, inp, device, cache=True)
        self._bind_output_ortvalue(
            'Y', self.penalty_sess_bind_, inputs[0], cache=True)
        self._call_iobinding(
            self.penalty_sess_._sess, self.penalty_sess_bind_)
        return self.penalty_sess_bind_.get_outputs()[0]

    def update_weights(self, n_bind, device, statei):
        if (not hasattr(self, "penalty_grad_onnx_") or
                not hasattr(self, "penalty_grad_sess_binds_")):
            raise RuntimeError(  # pragma: no cover
                "Attributes 'penalty_grad_sess_binds_' or "
                "'penalty_grad_onnx_' is missing. Method "
                "'build_onnx_function' has not been called.")
        bind = self.penalty_grad_sess_binds_[n_bind]
        self._bind_input_ortvalue("X", bind, statei, device, cache=True)
        self._bind_output_ortvalue('Y', bind, statei, cache=True)
        self._call_iobinding(self.penalty_grad_sess_._sess, bind)
        return bind.get_outputs()[0]  # updated weights
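# A minimal numpy sketch of the two operations above, assuming the
# standard elastic-net penalty l1*|w| + l2*w^2 and its subgradient for
# the weight update. Helper names are illustrative, and the exact
# semantics of the ONNX functions may differ in detail.
import numpy


def penalty_loss(loss, *weights, l1=0.5, l2=0.5):
    "Adds sum over tensors of l1*|w| + l2*w^2 to the loss."
    total = loss
    for w in weights:
        total = total + l1 * numpy.abs(w).sum() + l2 * (w ** 2).sum()
    return total


def update_weights(w, l1=0.5, l2=0.5):
    "Applies the penalty subgradient: w - l1*sign(w) - 2*l2*w."
    return w - l1 * numpy.sign(w) - 2 * l2 * w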
if get_device().upper() == 'GPU':
    dev = get_ort_device('cuda:0')
    try:
        gx = C_OrtValue.ortvalue_from_numpy(x, dev)
        cuda = True
    except RuntimeError as e:
        print(e)
        cuda = False
else:
    cuda = False

if cuda:
    sessg = InferenceSession(onx.SerializeToString(),
                             providers=["CUDAExecutionProvider"])
    io_binding = sessg.io_binding()._iobinding
    io_binding.bind_input(
        'X', dev, numpy.float32, gx.shape(), gx.data_ptr())
    io_binding.bind_output('Y', dev)
    sessg._sess.run_with_iobinding(io_binding, None)
    y_gpu = io_binding.copy_outputs_to_cpu()[0]
    assert_almost_equal(y_cpu, y_gpu)

######################################
# Benchmark
# +++++++++

data = []
shapes = ([(n, n) for n in [10, 100, 1000]] +
          [(n, 100) for n in [10, 100, 1000, 10000]] +
          [(100, n) for n in [10, 100, 1000, 10000]])
class LearningRateSGDNesterov(LearningRateSGD):
    """
    Implements the learning rate and momentum updates the same way as
    :class:`sklearn.linear_model.SGDRegressor`.

    :param eta0: initial learning rate for the `'constant'`,
        `'invscaling'` or `'adaptive'` schedules.
    :param alpha: constant that multiplies the regularization term,
        the higher the value, the stronger the regularization.
        Also used to compute the learning rate when *learning_rate*
        is set to `'optimal'`.
    :param power_t: exponent for inverse scaling learning rate
    :param learning_rate: learning rate schedule:

        * `'constant'`: `eta = eta0`
        * `'optimal'`: `eta = 1.0 / (alpha * (t + t0))` where *t0* is
          chosen by a heuristic proposed by Leon Bottou, this number
          is multiplied by a constant C to make the first number
          equal to *eta0*
        * `'invscaling'`: `eta = eta0 / pow(t, power_t)`

    :param momentum: float, default=0.9
        Value of momentum used, must be larger than or equal to 0.
    :param nesterov: bool, default=True
        Whether to use Nesterov's momentum or not.
        Setting it to False is equivalent to class
        @see cl LearningRateSGD.

    Created attributes:

    * `eta0_`: initial eta0
    * `optimal_init_`: use when `learning_rate=='optimal'`
    * `value_`: value to be returned by property `value`

    ::

        updates = [
            self.momentum * velocity - self.learning_rate * grad
            for velocity, grad in zip(self.velocities, grads)]
        self.velocities = updates

        if self.nesterov:
            updates_nesterov = [
                self.momentum * velocity - self.learning_rate * grad
                for velocity, grad in zip(self.velocities, grads)]
            return updates, updates_nesterov  # new gradient and velocities
        else:
            return updates  # new gradient
    """

    def __init__(self, eta0=0.01, alpha=0.0001, power_t=0.25,
                 learning_rate='invscaling', momentum=0.9, nesterov=True):
        LearningRateSGD.__init__(
            self, eta0=eta0, alpha=alpha, power_t=power_t,
            learning_rate=learning_rate)
        self.momentum = momentum
        self.nesterov = nesterov

    @property
    def needs_grad(self):
        """
        Returns True if the gradient update needs to retain
        past gradients.
        """
        return True

    def init_learning_rate(self):
        """
        Initializes the learning rate at the beginning of training.

        :return: self
        """
        return LearningRateSGD.init_learning_rate(self)

    def update_learning_rate(self, t):
        """
        Updates the learning rate at the end of an iteration.

        :param t: iteration number
        :return: self
        """
        return LearningRateSGD.update_learning_rate(self, t)

    def build_onnx_function(self, opset, device, n_tensors):
        so = SessionOptions()
        so.log_severity_level = 4

        # axpyw
        if self.nesterov:
            self.axpyw_onnx_ = function_onnx_graph("axpyw2")
        else:
            self.axpyw_onnx_ = function_onnx_graph("axpyw")
        self.axpyw_sess_ = InferenceSession(
            self.axpyw_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.axpyw_sess_binds_ = [
            self.axpyw_sess_.io_binding()._iobinding
            for n in range(n_tensors)]

        self.alpha_ = numpy.array(
            [0], dtype=TENSOR_TYPE_TO_NP_TYPE[
                self.axpyw_onnx_.graph.input[0].type.tensor_type.elem_type])
        self.beta_ = numpy.array(
            [0], dtype=TENSOR_TYPE_TO_NP_TYPE[
                self.axpyw_onnx_.graph.input[0].type.tensor_type.elem_type])

    def update_weights(self, n_bind, device, statei, gradienti,
                       batch_size, velocity=None):
        if (not hasattr(self, "axpyw_onnx_") or
                not hasattr(self, "axpyw_sess_binds_")):
            raise RuntimeError(  # pragma: no cover
                "Attributes 'axpyw_sess_binds_' or 'axpyw_onnx_' is "
                "missing. Method 'build_onnx_function' has not been "
                "called.")
        if velocity is None:
            raise RuntimeError(  # pragma: no cover
                "Velocity must not be None for this way of "
                "updating weights.")
        bind = self.axpyw_sess_binds_[n_bind]
        self._bind_input_ortvalue("X1", bind, gradienti, device, cache=True)
        self._bind_input_ortvalue("X2", bind, statei, device, cache=True)
        self._bind_input_ortvalue("G", bind, velocity, device, cache=True)
        self.alpha_[0] = - self.value / batch_size  # pylint: disable=E1130
        self.beta_[0] = self.momentum
        ort_alpha = C_OrtValue.ortvalue_from_numpy(self.alpha_, device)
        ort_beta = C_OrtValue.ortvalue_from_numpy(self.beta_, device)
        self._bind_input_ortvalue("alpha", bind, ort_alpha, device,
                                  cache=True)
        self._bind_input_ortvalue("beta", bind, ort_beta, device,
                                  cache=True)
        self._bind_output_ortvalue('Y', bind, statei, cache=True)
        self._bind_output_ortvalue('Z', bind, velocity, cache=True)
        self._call_iobinding(self.axpyw_sess_._sess, bind)
        return bind.get_outputs()  # new weights, new velocity
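# A minimal numpy sketch of the axpyw / axpyw2 updates performed by
# the ONNX graphs above, with alpha = -learning_rate / batch_size and
# beta = momentum. Helper names are illustrative, and the exact
# formulas of the ONNX functions are assumed here.
import numpy


def axpyw(grad, weights, velocity, alpha, beta):
    "Plain momentum: Z = alpha*grad + beta*velocity, Y = weights + Z."
    z = alpha * grad + beta * velocity
    return weights + z, z


def axpyw2(grad, weights, velocity, alpha, beta):
    "Nesterov: Z as above, Y = weights + alpha*grad + beta*Z."
    z = alpha * grad + beta * velocity
    return weights + alpha * grad + beta * z, z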
class LearningRateSGD(BaseLearningRate):
    """
    Implements the learning rate schedule the same way as
    :class:`sklearn.linear_model.SGDRegressor`.

    :param eta0: initial learning rate for the `'constant'`,
        `'invscaling'` or `'adaptive'` schedules.
    :param alpha: constant that multiplies the regularization term,
        the higher the value, the stronger the regularization.
        Also used to compute the learning rate when *learning_rate*
        is set to `'optimal'`.
    :param power_t: exponent for inverse scaling learning rate
    :param learning_rate: learning rate schedule:

        * `'constant'`: `eta = eta0`
        * `'optimal'`: `eta = 1.0 / (alpha * (t + t0))` where *t0* is
          chosen by a heuristic proposed by Leon Bottou, this number
          is multiplied by a constant C to make the first number
          equal to *eta0*
        * `'invscaling'`: `eta = eta0 / pow(t, power_t)`

    Created attributes:

    * `eta0_`: initial eta0
    * `optimal_init_`: use when `learning_rate=='optimal'`
    * `value_`: value to be returned by property `value`
    """

    def __init__(self, eta0=0.01, alpha=0.0001, power_t=0.25,
                 learning_rate='invscaling'):
        BaseLearningRate.__init__(self)
        if learning_rate not in ('invscaling', 'optimal', 'constant'):
            raise ValueError(
                f"Unexpected value for learning_rate={learning_rate!r}.")
        self.eta0 = eta0
        self.alpha = alpha
        self.power_t = power_t
        self.learning_rate = learning_rate.lower()
        self.value_ = None

    @property
    def value(self):
        "Returns the current learning rate."
        if self.value_ is None:
            raise RuntimeError(  # pragma: no cover
                "Method init_learning_rate was never called.")
        return self.value_

    @property
    def needs_grad(self):
        """
        Returns True if the gradient update needs to retain
        past gradients.
        """
        return False

    def init_learning_rate(self):
        """
        Initializes the learning rate at the beginning of training.

        :return: self
        """
        self.eta0_ = self.eta0
        if self.learning_rate == "optimal":
            typw = numpy.sqrt(1.0 / numpy.sqrt(self.alpha))
            eta0 = typw / max(1.0, (1 + typw) * 2)
            self.optimal_init_ = 1.0 / (eta0 * self.alpha)
            eta = 1. / (self.alpha * self.optimal_init_)
            self.optimal_fact_ = self.eta0 / eta
        self.value_ = self.eta0_
        return self

    def update_learning_rate(self, t):
        """
        Updates the learning rate at the end of an iteration.

        :param t: iteration number
        :return: self
        """
        eta = self.value_
        if self.learning_rate == "optimal":
            eta = self.optimal_fact_ / (
                self.alpha * (self.optimal_init_ + t))
        elif self.learning_rate == "invscaling":
            eta = self.eta0_ / numpy.power(t + 1, self.power_t)
        self.value_ = eta
        return self

    def build_onnx_function(self, opset, device, n_tensors):
        so = SessionOptions()
        so.log_severity_level = 4
        self.axpy_onnx_ = function_onnx_graph("axpy")
        self.axpy_sess_ = InferenceSession(
            self.axpy_onnx_.SerializeToString(), so,
            providers=device_to_providers(device))
        self.axpy_sess_binds_ = [
            self.axpy_sess_.io_binding()._iobinding
            for i in range(n_tensors)]
        self.alpha_ = numpy.array(
            [0], dtype=TENSOR_TYPE_TO_NP_TYPE[
                self.axpy_onnx_.graph.input[0].type.tensor_type.elem_type])

    def update_weights(self, n_bind, device, statei,  # pylint: disable=W0237
                       gradienti, batch_size, velocity=None):
        if velocity is not None:
            raise RuntimeError(  # pragma: no cover
                "Velocity must be None for this way of updating weights.")
        if (not hasattr(self, "axpy_onnx_") or
                not hasattr(self, "axpy_sess_binds_")):
            raise RuntimeError(  # pragma: no cover
                "Attributes 'axpy_sess_binds_' or 'axpy_onnx_' is "
                "missing. Method 'build_onnx_function' has not been "
                "called.")
        bind = self.axpy_sess_binds_[n_bind]
        self._bind_input_ortvalue("X1", bind, gradienti, device, cache=True)
        self._bind_input_ortvalue("X2", bind, statei, device, cache=True)
        self.alpha_[0] = - self.value / batch_size  # pylint: disable=E1130
        ort_alpha = C_OrtValue.ortvalue_from_numpy(self.alpha_, device)
        self._bind_input_ortvalue("alpha", bind, ort_alpha, device,
                                  cache=True)
        self._bind_output_ortvalue('Y', bind, statei, cache=True)
        self._call_iobinding(self.axpy_sess_._sess, bind)
        new_weights = bind.get_outputs()[0]
        return new_weights
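# A minimal numpy sketch of the three schedules, mirroring
# init_learning_rate and update_learning_rate above. The helper name
# is illustrative, not part of the library API.
import numpy


def learning_rate(t, eta0=0.01, alpha=0.0001, power_t=0.25,
                  schedule='invscaling'):
    "Returns eta at iteration t for a given schedule."
    if schedule == 'constant':
        return eta0
    if schedule == 'invscaling':
        return eta0 / numpy.power(t + 1, power_t)
    # 'optimal': Bottou's heuristic, rescaled so the rate at t=0 is eta0
    typw = numpy.sqrt(1.0 / numpy.sqrt(alpha))
    optimal_init = 1.0 / ((typw / max(1.0, (1 + typw) * 2)) * alpha)
    fact = eta0 * (alpha * optimal_init)
    return fact / (alpha * (optimal_init + t))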