class GRNN(LazyLearning, BaseNetwork):
    """
    Generalized Regression Neural Network.

    Parameters
    ----------
    std : float
        Standard deviation for the PDF function. Defaults to ``0.1``.

    {Verbose.verbose}

    Methods
    -------
    {LazyLearning.train}

    {BaseSkeleton.predict}

    {BaseSkeleton.fit}

    Examples
    --------
    >>> from sklearn import datasets
    >>> from sklearn.cross_validation import train_test_split
    >>> from neupy import algorithms, estimators, environment
    >>>
    >>> environment.reproducible()
    >>>
    >>> dataset = datasets.load_diabetes()
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     dataset.data, dataset.target, train_size=0.7,
    ...     random_state=0
    ... )
    >>>
    >>> nw = algorithms.GRNN(std=0.1, verbose=False)
    >>> nw.train(x_train, y_train)
    >>> result = nw.predict(x_test)
    >>> estimators.rmsle(result, y_test)
    0.4245120142774001
    """
    std = BoundedProperty(default=0.1, minval=0)

    def train(self, input_train, target_train, copy=True):
        """
        Validate the training data and hand it to ``LazyLearning.train``.

        Parameters
        ----------
        input_train : array-like
        target_train : array-like
            Must have exactly one column after ``format_data``.
        copy : bool
            Forwarded to ``format_data``. Defaults to ``True``.

        Raises
        ------
        ValueError
            If the formatted target has more or fewer than one column.
        """
        features = format_data(input_train, copy=copy)
        targets = format_data(target_train, copy=copy)

        n_target_columns = targets.shape[1]
        if n_target_columns != 1:
            raise ValueError("Target value must be one dimensional array")

        LazyLearning.train(self, features, targets)

    def predict(self, input_data):
        """
        Predict a target value for every sample in ``input_data``.

        Raises
        ------
        ValueError
            If the number of input features differs from the number of
            features in the stored training data.
        """
        # The parent call is made for its side effects only; its return
        # value is not used here.
        super(GRNN, self).predict(input_data)

        samples = format_data(input_data)

        received_features = samples.shape[1]
        expected_features = self.input_train.shape[1]

        if received_features != expected_features:
            message = "Input data must contains {0} features, got {1}"
            raise ValueError(
                message.format(expected_features, received_features))

        # Kernel-weighted average of the stored targets.
        ratios = pdf_between_data(self.input_train, samples, self.std)
        weighted_targets = dot(self.target_train.T, ratios)
        normalizer = ratios.sum(axis=0)
        return (weighted_targets / normalizer).T
class Hessian(NoStepSelection, GradientDescent):
    """
    Hessian gradient descent optimization. This GD algorithm
    variation uses second derivative information to choose a better
    gradient direction and, as a consequence, a better weight update
    after each epoch.

    Parameters
    ----------
    penalty_const : float
        Inverse hessian could be singular matrix. For this reason
        algorithm include penalty that add to hessian matrix identity
        multiplied by defined constant. Defaults to ``1``.

    {GradientDescent.Parameters}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)

    def init_variables(self):
        """
        Create a square shared variable, one row and column per network
        parameter, initialised with zeros.
        """
        super(Hessian, self).init_variables()

        size = count_parameters(self)
        zero_matrix = asfloat(np.zeros((size, size)))

        # NOTE(review): the variable is labelled 'hessian_inverse', yet
        # ``init_train_updates`` writes the plain (non-inverted,
        # non-penalized) hessian matrix into it.
        self.variables.hessian = theano.shared(
            value=zero_matrix,
            name='hessian_inverse',
        )

    def init_train_updates(self):
        """
        Build the update list: a Newton-style step for all parameters
        plus a refresh of the stored hessian matrix.
        """
        size = count_parameters(self)
        parameters = list(iter_parameters(self))
        flat_parameters = parameters2vector(self)
        penalty = asfloat(self.penalty_const)

        hessian, gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters
        )

        # The identity matrix scaled by the penalty is added before the
        # inversion (see ``penalty_const`` description).
        penalized_hessian = hessian + penalty * T.eye(size)
        inverse = T.nlinalg.matrix_inverse(penalized_hessian)

        new_flat_parameters = flat_parameters - inverse.dot(gradient)
        updates = setup_parameter_updates(parameters, new_flat_parameters)
        updates.append((self.variables.hessian, hessian))

        return updates
class Hessian(NoStepSelection, GradientDescent):
    """
    Hessian gradient descent optimization. This GD algorithm
    variation uses second derivative information to choose a better
    gradient direction and, as a consequence, a better weight update
    after each epoch.

    Parameters
    ----------
    penalty_const : float
        Inverse hessian could be singular matrix. For this reason
        algorithm include penalty that add to hessian matrix identity
        multiplied by defined constant. Defaults to ``1``.

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)

    def init_train_updates(self):
        """
        Build the Newton-style update for all network parameters.
        """
        size = count_parameters(self)
        parameters = list(iter_parameters(self))
        flat_parameters = parameters2vector(self)
        penalty = asfloat(self.penalty_const)

        hessian, gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters
        )

        # The identity matrix scaled by the penalty is added before the
        # inversion (see ``penalty_const`` description).
        penalized_hessian = hessian + penalty * T.eye(size)
        inverse = T.nlinalg.matrix_inverse(penalized_hessian)

        new_flat_parameters = flat_parameters - inverse.dot(gradient)
        return setup_parameter_updates(parameters, new_flat_parameters)
class WeightDecay(WeightUpdateConfigurable):
    """
    Weight decay algorithm penalizes large weights. Also known as
    L2-regularization.

    Parameters
    ----------
    decay_rate : float
        Controls training penalties during the parameter updates.
        The larger the value the stronger effect regularization
        has during the training. Defaults to ``0.1``.

    Warns
    -----
    {WeightUpdateConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     decay_rate=0.1,
    ...     addons=[algorithms.WeightDecay]
    ... )

    See Also
    --------
    :network:`WeightElimination`
    """
    decay_rate = BoundedProperty(default=0.1, minval=0)

    def init_train_updates(self):
        """
        Take the parent updates and subtract the decay penalty from the
        new value of every variable that is a layer parameter. Updates
        for any other variable are passed through untouched.
        """
        base_updates = super(WeightDecay, self).init_train_updates()
        layer_parameters = [
            param for _, _, param in iter_parameters(self.layers)
        ]

        step = self.variables.step
        decay_rate = asfloat(self.decay_rate)

        def with_penalty(variable, new_value):
            # Only layer parameters are penalized.
            if variable in layer_parameters:
                return new_value - step * decay_rate * variable
            return new_value

        return [
            (variable, with_penalty(variable, new_value))
            for variable, new_value in base_updates
        ]
class WeightDecay(WeightUpdateConfigurable):
    """
    Weight decay algorithm penalizes large weights and limits the
    freedom in network. The algorithm is able to solve one of the
    possible problems of network's overfitting.

    Parameters
    ----------
    decay_rate : float
        Controls the effect of penalties on the update network weights.
        Defaults to ``0.1``.

    Warns
    -----
    {WeightUpdateConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     decay_rate=0.1,
    ...     addons=[algorithms.WeightDecay]
    ... )

    See Also
    --------
    :network:`WeightElimination`
    """
    decay_rate = BoundedProperty(default=0.1, minval=0)

    def init_param_updates(self, layer, parameter):
        """
        Extend the parent update for ``parameter`` with the decay term
        ``step * decay_rate * parameter``.
        """
        base_updates = super(WeightDecay, self).init_param_updates(
            layer, parameter
        )
        penalty = self.variables.step * self.decay_rate * parameter

        # Index the updates by variable so that the entry that belongs
        # to ``parameter`` can be replaced.
        new_values = dict(base_updates)
        new_values[parameter] = new_values[parameter] - penalty

        return list(new_values.items())
class Quickprop(GradientDescent):
    """
    Quickprop :network:`GradientDescent` algorithm optimization.

    Parameters
    ----------
    upper_bound : float
        Maximum possible value for weight update.
        Defaults to ``1``.

    {GradientDescent.Parameters}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qpnet = algorithms.Quickprop((2, 3, 1))
    >>> qpnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    upper_bound = BoundedProperty(default=1, minval=0)

    def init_param_updates(self, layer, parameter):
        """
        Build the Quickprop updates for a single parameter.

        Two zero-initialised shared variables with the parameter's
        shape are created to carry the previous delta and the previous
        gradient between updates.
        """
        step = self.variables.step
        shape = T.shape(parameter).eval()

        def zero_state(suffix):
            # Shared variable named "<parameter name>/<suffix>".
            return theano.shared(
                name="{}/{}".format(parameter.name, suffix),
                value=asfloat(np.zeros(shape)),
            )

        prev_delta = zero_state("prev-delta")
        prev_gradient = zero_state("prev-grad")

        gradient = T.grad(self.variables.error_func, wrt=parameter)
        grad_delta = T.abs_(prev_gradient - gradient)

        # Magnitude is limited to ``upper_bound`` in both directions.
        bounded_delta = T.clip(
            T.abs_(prev_delta) * gradient / grad_delta,
            -self.upper_bound,
            self.upper_bound
        )
        # On epoch 1 the raw gradient is used instead of the
        # Quickprop estimate.
        is_first_epoch = T.eq(self.variables.epoch, 1)
        parameter_delta = ifelse(is_first_epoch, gradient, bounded_delta)

        return [
            (parameter, parameter - step * parameter_delta),
            (prev_gradient, gradient),
            (prev_delta, parameter_delta),
        ]
class LeakStepAdaptation(SingleStepConfigurable):
    """
    Leak Learning Rate Adaptation algorithm is a step adaptation
    procedure in backpropagation algorithm.

    Parameters
    ----------
    leak_size : float
        Defaults to ``0.01``. This variable identified proportion,
        so it's always between 0 and 1. Typically this value is small.

    alpha : float
        The ``alpha`` is control total step update ratio.
        Defaults to ``0.001``. Typically this value is small.

    beta : float
        This similar to ``alpha``, but it control ratio only for update
        matrix norms. Defaults to ``20``.
        Typically this value is bigger than ``1``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     addons=[algorithms.LeakStepAdaptation]
    ... )

    References
    ----------
    [1] Noboru M. "Adaptive on-line learning in changing
        environments", 1997
    [2] LeCun, "Efficient BackProp", 1998
    """
    leak_size = ProperFractionProperty(default=0.01)
    alpha = BoundedProperty(default=0.001, minval=0)
    beta = BoundedProperty(default=20, minval=0)

    def init_variables(self):
        """
        Create the zero-initialised ``leak_average`` vector with one
        entry per parameter of the connection.
        """
        super(LeakStepAdaptation, self).init_variables()

        vector_size = count_parameters(self.connection)
        self.variables.leak_average = tf.Variable(
            tf.zeros(vector_size),
            name="leak-step-adapt/leak-average",
            dtype=tf.float32,
        )

    def init_train_updates(self):
        """
        Append the leak average and step adaptation to the parent
        updates.
        """
        updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha = asfloat(self.alpha)
        beta = asfloat(self.beta)
        leak_size = asfloat(self.leak_size)

        step = self.variables.step
        leak_average = self.variables.leak_average

        parameters = parameter_values(self.connection)
        gradients = tf.gradients(self.variables.error_func, parameters)
        flat_gradients = [flatten(gradient) for gradient in gradients]
        full_gradient = tf.concat(flat_gradients, axis=0)

        # Blend of the previous average and the current gradient,
        # weighted by ``leak_size``.
        new_leak_average = (
            (1 - leak_size) * leak_average + leak_size * full_gradient
        )

        step_correction = alpha * step * (
            beta * tf.norm(new_leak_average) - step
        )
        new_step = step + step_correction

        updates.extend([
            (leak_average, new_leak_average),
            (step, new_step),
        ])
        return updates
class LinearSearch(SingleStepConfigurable):
    """
    Linear search for the step selection. Basically this algorithm
    tries different steps and computes the prediction error for each
    of them; after a few iterations it chooses the one which was
    better.

    Parameters
    ----------
    tol : float
        Tolerance for termination, default to ``0.1``. Can be any
        number greater than zero.

    maxiter : int
        Iteration limit that is passed to the search only when
        ``search_method`` is ``brent``. Defaults to ``10``.

    search_method : 'golden', 'brent'
        Linear search method. Can be ``golden`` for golden search or
        ``brent`` for Brent's search, default to ``golden``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.cross_validation import train_test_split
    >>> from neupy import algorithms, layers, estimators, environment
    >>>
    >>> environment.reproducible()
    >>>
    >>> dataset = datasets.load_boston()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> data_scaler = preprocessing.MinMaxScaler()
    >>> target_scaler = preprocessing.MinMaxScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     data_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target),
    ...     train_size=0.85
    ... )
    >>>
    >>> cgnet = algorithms.ConjugateGradient(
    ...     connection=[
    ...         layers.Input(13),
    ...         layers.Sigmoid(50),
    ...         layers.Sigmoid(1),
    ...     ],
    ...     search_method='golden',
    ...     addons=[algorithms.LinearSearch],
    ...     verbose=False
    ... )
    >>>
    >>> cgnet.train(x_train, y_train, epochs=100)
    >>> y_predict = cgnet.predict(x_test).round(1)
    >>>
    >>> real = target_scaler.inverse_transform(y_test)
    >>> predicted = target_scaler.inverse_transform(y_predict)
    >>>
    >>> error = estimators.rmsle(real, predicted)
    >>> error
    0.20752676697596578

    See Also
    --------
    :network:`ConjugateGradient`
    """
    tol = BoundedProperty(default=0.1, minval=0)
    maxiter = BoundedProperty(default=10, minval=1)
    search_method = ChoiceProperty(choices=['golden', 'brent'],
                                   default='golden')

    def train_epoch(self, input_train, target_train):
        """
        Run one training epoch with the step picked by a scalar search.

        Each candidate step is scored by restoring the saved variable
        values, training one epoch with that step and measuring the
        prediction error afterwards. The epoch is then repeated once
        more with the step returned by ``minimize_scalar`` and the
        resulting error is returned.
        """
        run_epoch = self.methods.train_epoch
        compute_error = self.methods.prediction_error

        # Snapshot every variable that the training updates modify so
        # that each candidate step starts from the same state.
        tracked = [variable for variable, _ in self.init_train_updates()]
        snapshots = [variable.get_value() for variable in tracked]

        def error_for_step(candidate_step):
            for snapshot, variable in zip(snapshots, tracked):
                variable.set_value(snapshot)

            self.variables.step.set_value(asfloat(candidate_step))
            run_epoch(input_train, target_train)

            # The epoch call reports the error measured before its own
            # update, so the error has to be evaluated again afterwards.
            error = compute_error(input_train, target_train)

            # NaN is mapped to infinity before it reaches the search.
            return np.where(np.isnan(error), np.inf, error)

        search_options = {'xtol': self.tol}
        uses_brent = self.search_method == 'brent'
        if uses_brent:
            search_options['maxiter'] = self.maxiter

        result = minimize_scalar(
            error_for_step,
            tol=self.tol,
            method=self.search_method,
            options=search_options,
        )
        best_step = result.x

        return error_for_step(best_step)
class GRNN(BaseSkeleton):
    """
    Generalized Regression Neural Network (GRNN). Network applies
    only to the regression problems.

    Parameters
    ----------
    std : float
        Standard deviation for PDF function.
        If your input features have high values than standard deviation
        should also be high. For instance, if input features from range
        ``[0, 20]`` that standard deviation should be also a big value
        like ``10`` or ``15``. Small values will lead to bad prediction.

    {Verbose.verbose}

    Notes
    -----
    - GRNN Network is sensitive for cases when one input feature
      has higher values than the other one. Input data has to be
      normalized before training.

    - Standard deviation has to match the range of the input features
      Check ``std`` parameter description for more information.

    - The bigger training dataset the slower prediction.
      Algorithm is much more efficient for small datasets.

    - Network uses lazy learning which mean that network doesn't
      need iterative training. It just stores parameters
      and use them to make a predictions.

    Methods
    -------
    train(X_train, y_train, copy=True)
        Network just stores all the information about the data and use
        it for the prediction. Parameter ``copy`` copies input data
        before saving it inside the network.

    predict(X)
        Return prediction per each sample in the ``X``.

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms
    >>>
    >>> dataset = datasets.load_diabetes()
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     preprocessing.minmax_scale(dataset.data),
    ...     preprocessing.minmax_scale(dataset.target.reshape(-1, 1)),
    ...     test_size=0.3,
    ... )
    >>>
    >>> nw = algorithms.GRNN(std=0.1, verbose=False)
    >>> nw.train(x_train, y_train)
    >>>
    >>> y_predicted = nw.predict(x_test)
    >>> mse = np.mean((y_predicted - y_test) ** 2)
    >>> mse
    0.05280970704568171
    """
    std = BoundedProperty(minval=0)

    def __init__(self, std, verbose=False):
        self.std = std
        # Both stay ``None`` until ``train`` succeeds; ``predict``
        # uses ``X_train is None`` as the "not trained" marker.
        self.X_train = None
        self.y_train = None
        super(GRNN, self).__init__(verbose=verbose)

    def train(self, X_train, y_train, copy=True):
        """
        Trains network. GRNN doesn't actually train, it just stores
        input data and use it for prediction.

        The data is stored only after every check has passed, which
        means that a failed call leaves previously stored training
        data untouched.

        Parameters
        ----------
        X_train : array-like (n_samples, n_features)

        y_train : array-like (n_samples,)
            Target variable should be vector or matrix
            with one feature column.

        copy : bool
            If value equal to ``True`` than input matrices will
            be copied. Defaults to ``True``.

        Raises
        ------
        ValueError
            In case if something is wrong with input data.
        """
        X_train = format_data(X_train, copy=copy)
        y_train = format_data(y_train, copy=copy)

        if y_train.shape[1] != 1:
            raise ValueError("Target value must be one dimensional array")

        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError("Number of samples in the input and target "
                             "datasets are different")

        # Assign last: storing the data before the sample-count check
        # would leave the network holding mismatched inputs and
        # targets after a failed call.
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """
        Make a prediction from the input data.

        Parameters
        ----------
        X : array-like (n_samples, n_features)

        Raises
        ------
        NotTrained
            If ``train`` has not stored any data yet.

        ValueError
            In case if something is wrong with input data.

        Returns
        -------
        array-like
            One prediction per input sample.
        """
        if self.X_train is None:
            raise NotTrained(
                "Cannot make a prediction. Network hasn't been trained yet")

        X = format_data(X)

        if X.shape[1] != self.X_train.shape[1]:
            raise ValueError("Input data must contain {0} features, got {1}"
                             "".format(self.X_train.shape[1], X.shape[1]))

        # Kernel-weighted average of the stored targets.
        ratios = pdf_between_data(self.X_train, X, self.std)
        return (dot(self.y_train.T, ratios) / ratios.sum(axis=0)).T
class LevenbergMarquardt(NoStepSelection, GradientDescent):
    """
    Levenberg-Marquardt algorithm.

    Notes
    -----
    * Network minimizes only Mean Squared Error function.

    Parameters
    ----------
    mu : float
        Control inversion for J.T * J matrix, defaults to ``0.01``.

    mu_update_factor : float
        Factor to decrease the mu if update decrease the error, otherwise
        increase mu by the same factor. Defaults to ``5``.

    error: {{'mse'}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> lmnet = algorithms.LevenbergMarquardt(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> lmnet.train(x_train, y_train)

    Diabetes dataset example

    >>> import numpy as np
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.cross_validation import train_test_split
    >>> from neupy import algorithms, layers
    >>> from neupy.estimators import rmsle
    >>>
    >>> dataset = datasets.load_diabetes()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> data_scaler = preprocessing.MinMaxScaler()
    >>> target_scaler = preprocessing.MinMaxScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     data_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target),
    ...     train_size=0.85
    ... )
    >>>
    >>> # Network
    ... lmnet = algorithms.LevenbergMarquardt(
    ...     connection=[
    ...         layers.Sigmoid(10),
    ...         layers.Sigmoid(40),
    ...         layers.Output(1),
    ...     ],
    ...     mu_update_factor=2,
    ...     mu=0.1,
    ...     step=0.25,
    ...     show_epoch=10,
    ...     use_bias=False,
    ...     verbose=False
    ... )
    >>> lmnet.train(x_train, y_train, epochs=100)
    >>> y_predict = lmnet.predict(x_test)
    >>>
    >>> error = rmsle(target_scaler.inverse_transform(y_test),
    ...               target_scaler.inverse_transform(y_predict).round())
    >>> error
    0.47548200957888398

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=5, minval=1)
    error = ChoiceProperty(default='mse', choices={'mse': errors.mse})

    def init_variables(self):
        """
        Register the shared ``mu`` and ``last_error`` variables.
        """
        super(LevenbergMarquardt, self).init_variables()

        shared_mu = theano.shared(name='mu', value=asfloat(self.mu))
        # Starts as NaN and is overwritten at epoch start once a
        # previous error exists.
        shared_last_error = theano.shared(name='last_error', value=np.nan)

        self.variables.update(mu=shared_mu, last_error=shared_last_error)

    def init_train_updates(self):
        """
        Build the updates for ``mu`` and for all network parameters.
        """
        variables = self.variables
        network_output = variables.network_output
        prediction_func = variables.train_prediction_func
        last_error = variables.last_error
        error_func = variables.error_func
        mu = variables.mu

        # ``mu`` is multiplied by the factor when the stored last error
        # is lower than the current error and divided by it otherwise.
        error_increased = T.lt(last_error, error_func)
        new_mu = ifelse(
            error_increased,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        squared_difference = (network_output - prediction_func) ** 2
        mse_for_each_sample = T.mean(squared_difference, axis=1)

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        damped_matrix = J.T.dot(J) + new_mu * T.eye(n_params)
        damped_inverse = T.nlinalg.matrix_inverse(damped_matrix)
        param_delta = damped_inverse.dot(J.T).dot(mse_for_each_sample)
        updated_params = param_vector - param_delta

        parameter_updates = setup_parameter_updates(params, updated_params)
        return [(mu, new_mu)] + list(parameter_updates)

    def on_epoch_start_update(self, epoch):
        """
        Copy the most recent training error, when there is one, into
        the shared ``last_error`` variable.
        """
        super(LevenbergMarquardt, self).on_epoch_start_update(epoch)

        previous_error = self.errors.last()
        if previous_error is None:
            return

        self.variables.last_error.set_value(previous_error)
class WeightElimination(WeightUpdateConfigurable):
    """
    Weight Elimination algorithm penalizes large weights and limits
    the freedom in network. The algorithm is able to solve one of the
    possible problems of network overfitting.

    Parameters
    ----------
    decay_rate : float
        Controls the effect of penalties on the update network weights.
        Defaults to ``0.1``.

    zero_weight : float
        Second important parameter for weights penalization. Defaults
        to ``1``. Small value can make all weights close to zero. Big
        value will make less significant contribution in weight update.
        That mean with a big value ``zero_weight`` network allow higher
        values for the weights.

    Warns
    -----
    {WeightUpdateConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     addons=[algorithms.WeightElimination]
    ... )

    See Also
    --------
    :network:`WeightDecay` : Weight Decay penalty.

    Notes
    -----
    Before adding that regularization parameter carefully choose
    ``decay_rate`` and ``zero_weight`` parameters for the problem.
    Invalid parameters could significantly reduce weight sizes and norm
    could be near zero.

    .. [1] Weigend, A. S.; Rumelhart, D. E. & Huberman, B. A. (1991), \
        Generalization by Weight-Elimination with Application to Forecasting, \
        in Richard P. Lippmann; John E. Moody & David S. Touretzky, ed., \
        Advances in Neural Information Processing Systems, San Francisco, \
        CA: Morgan Kaufmann, pp. 875--882 .
    """
    decay_rate = BoundedProperty(default=0.1, minval=0)
    zero_weight = BoundedProperty(default=1, minval=0)

    def init_param_updates(self, layer, parameter):
        """
        Extend the parent update for ``parameter`` with the weight
        elimination penalty.
        """
        base_updates = super(WeightElimination, self).init_param_updates(
            layer, parameter
        )

        step = self.variables.step
        decay_koef = self.decay_rate * step
        zero_weight_square = self.zero_weight ** 2

        # penalty = decay_koef * (2w / w0^2) / (1 + w^2 / w0^2)^2
        numerator = 2 * parameter / zero_weight_square
        denominator = (1 + (parameter ** 2) / zero_weight_square) ** 2
        penalty = decay_koef * (numerator / denominator)

        # Index the updates by variable so that the entry that belongs
        # to ``parameter`` can be replaced.
        new_values = dict(base_updates)
        new_values[parameter] = new_values[parameter] - penalty

        return list(new_values.items())
class A(Configurable):
    # Single option declared with a lower limit of -1 and an upper
    # limit of 1.
    bounded_property = BoundedProperty(
        minval=-1,
        maxval=1,
    )
class RPROP(StepSelectionBuiltIn, GradientDescent):
    """
    RPROP :network:`GradientDescent` algorithm optimization.

    Parameters
    ----------
    minstep : float
        Minimum possible value for step. Defaults to ``0.1``.

    maxstep : float
        Maximum possible value for step. Defaults to ``50``.

    increase_factor : float
        Increase factor for step in case when gradient doesn't change
        sign compare to previous epoch. Defaults to ``1.2``.

    decrease_factor : float
        Decrease factor for step in case when gradient changes sign
        compare to previous epoch. Defaults to ``0.5``.

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> rpropnet = algorithms.RPROP(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> rpropnet.train(x_train, y_train)

    See Also
    --------
    :network:`IRPROPPlus` : iRPROP+ algorithm.
    :network:`GradientDescent` : GradientDescent algorithm.
    """

    # These properties define the lower and upper bounds for the steps.
    minstep = BoundedProperty(default=0.1, minval=0)
    maxstep = BoundedProperty(default=50, minval=0)

    # These properties scale the step: it is multiplied by
    # ``increase_factor`` or by ``decrease_factor``.
    increase_factor = BoundedProperty(minval=1, default=1.2)
    decrease_factor = ProperFractionProperty(default=0.5)

    def init_layers(self):
        """
        Attach per-parameter state to every layer parameter: the
        previous delta and previous gradient (zeros) and the individual
        step sizes (filled with ``self.step``).
        """
        super(RPROP, self).init_layers()

        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_delta = theano.shared(
                    name="prev_delta_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )
                parameter.prev_gradient = theano.shared(
                    name="prev_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )
                # The prefix used to be "steps_{}", which left a literal
                # "{}" (unused format placeholder) inside the name.
                parameter.steps = theano.shared(
                    name="steps_" + parameter.name,
                    value=asfloat(np.ones(parameter_shape) * self.step),
                )

    def init_prev_delta(self, parameter):
        """
        Return the previous delta that is applied when the gradient
        changes its sign. Kept as a separate method so that it can be
        overridden.
        """
        return parameter.prev_delta

    def init_param_updates(self, layer, parameter):
        """
        Build the RPROP updates for a single parameter.
        """
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        steps = parameter.steps
        prev_delta = self.init_prev_delta(parameter)
        prev_gradient = parameter.prev_gradient

        # Positive product: the sign is the same as before.
        # Negative product: the sign has changed.
        grad_product = prev_gradient * gradient
        negative_gradients = T.lt(grad_product, 0)

        updated_steps = T.clip(
            T.switch(
                T.gt(grad_product, 0),
                steps * self.increase_factor,
                T.switch(
                    negative_gradients,
                    steps * self.decrease_factor,
                    steps
                )
            ),
            self.minstep,
            self.maxstep,
        )
        gradient_signs = T.switch(T.lt(gradient, 0), -1, 1)
        # The stored value is the negated delta (see the last update
        # below), so using it here moves the parameter back.
        parameter_delta = T.switch(
            negative_gradients,
            prev_delta,
            gradient_signs * updated_steps
        )
        # The gradient is stored as zero after a sign change, which
        # makes the next product zero as well.
        updated_prev_gradient = T.switch(negative_gradients, 0, gradient)

        return [
            (parameter, parameter - parameter_delta),
            (steps, updated_steps),
            (prev_gradient, updated_prev_gradient),
            (parameter.prev_delta, -parameter_delta),
        ]
class LevenbergMarquardt(NoStepSelection, GradientDescent):
    """
    Levenberg-Marquardt algorithm.

    Notes
    -----
    * Network minimizes only Mean Squared Error function.

    Parameters
    ----------
    mu : float
        Control inversion for J.T * J matrix, defaults to ``0.01``.

    mu_update_factor : float
        Factor to decrease the mu if update decrease the error, otherwise
        increase mu by the same factor. Defaults to ``1.2``

    error: {{'mse'}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> lmnet = algorithms.LevenbergMarquardt(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> lmnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    error = ChoiceProperty(default='mse', choices={'mse': errors.mse})

    def init_variables(self):
        """
        Register the shared ``mu`` and ``last_error`` variables.
        """
        super(LevenbergMarquardt, self).init_variables()

        shared_mu = theano.shared(name='mu', value=asfloat(self.mu))
        # Starts as NaN and is overwritten at epoch start once a
        # previous error exists.
        shared_last_error = theano.shared(name='last_error', value=np.nan)

        self.variables.update(mu=shared_mu, last_error=shared_last_error)

    def init_train_updates(self):
        """
        Build the updates for ``mu`` and for all network parameters.
        """
        variables = self.variables
        network_output = variables.network_output
        prediction_func = variables.train_prediction_func
        last_error = variables.last_error
        error_func = variables.error_func
        mu = variables.mu

        # ``mu`` is multiplied by the factor when the stored last error
        # is lower than the current error and divided by it otherwise.
        error_increased = T.lt(last_error, error_func)
        new_mu = ifelse(
            error_increased,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        squared_difference = (network_output - prediction_func) ** 2
        mse_for_each_sample = T.mean(squared_difference, axis=1)

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        damped_matrix = J.T.dot(J) + new_mu * T.eye(n_params)
        damped_inverse = T.nlinalg.matrix_inverse(damped_matrix)
        param_delta = damped_inverse.dot(J.T).dot(mse_for_each_sample)
        updated_params = param_vector - param_delta

        parameter_updates = setup_parameter_updates(params, updated_params)
        return [(mu, new_mu)] + list(parameter_updates)

    def on_epoch_start_update(self, epoch):
        """
        Copy the most recent training error, when there is one, into
        the shared ``last_error`` variable.
        """
        super(LevenbergMarquardt, self).on_epoch_start_update(epoch)

        previous_error = self.errors.last()
        if previous_error is None:
            return

        self.variables.last_error.set_value(previous_error)
class Quickprop(GradientDescent):
    """
    Quickprop :network:`GradientDescent` algorithm optimization.

    Parameters
    ----------
    upper_bound : float
        Maximum possible value for weight update.
        Defaults to ``1``.

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qpnet = algorithms.Quickprop(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> qpnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    upper_bound = BoundedProperty(default=1, minval=0)

    def init_layers(self):
        """
        Attach zero-initialised ``prev_delta`` and ``prev_gradient``
        shared variables to every layer parameter.
        """
        super(Quickprop, self).init_layers()

        all_parameters = (
            parameter
            for layer in self.layers
            for parameter in layer.parameters
        )

        for parameter in all_parameters:
            shape = T.shape(parameter).eval()

            def zero_state(prefix):
                # Shared variable named "<prefix><parameter name>".
                return theano.shared(
                    name=prefix + parameter.name,
                    value=asfloat(np.zeros(shape)),
                )

            parameter.prev_delta = zero_state("prev_delta_")
            parameter.prev_gradient = zero_state("prev_grad_")

    def init_param_updates(self, layer, parameter):
        """
        Build the Quickprop updates for a single parameter.
        """
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        prev_delta = parameter.prev_delta
        prev_gradient = parameter.prev_gradient

        grad_delta = T.abs_(prev_gradient - gradient)

        # Magnitude is limited to ``upper_bound`` in both directions.
        bounded_delta = T.clip(
            T.abs_(prev_delta) * gradient / grad_delta,
            -self.upper_bound,
            self.upper_bound
        )
        # On epoch 1 the raw gradient is used instead of the
        # Quickprop estimate.
        is_first_epoch = T.eq(self.variables.epoch, 1)
        parameter_delta = ifelse(is_first_epoch, gradient, bounded_delta)

        return [
            (parameter, parameter - step * parameter_delta),
            (prev_gradient, gradient),
            (prev_delta, parameter_delta),
        ]
class HebbRule(BaseStepAssociative):
    """
    Hebbian Learning Unsupervised Neural Network.
    Network can learn associations from data and emulate similar
    behaviour as dog in Pavlov experiment.

    Notes
    -----
    * Network always generate weights which contains ``0`` weight for \
    conditioned stimulus and ``1`` otherwise. This setup helps you control \
    your default state for learning features. Other type of weight you can \
    setup as optional parameter ``weight`` in input layer.
    * No bias.

    Parameters
    ----------
    decay_rate : float
        Decay rate is control your network weights. It helps network
        'forget' information and control weight sizes. Without this
        parameter network weight will grow. Defaults to ``0.2``.

    {BaseAssociative.n_inputs}

    {BaseAssociative.n_outputs}

    {BaseStepAssociative.n_unconditioned}

    {BaseAssociative.weight}

    {BaseStepAssociative.bias}

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {BaseAssociative.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms, layers
    >>>
    >>> pavlov_dog_data = np.array([
    ...     [1, 0],  # food, no bell
    ...     [1, 1],  # food, bell
    ... ])
    >>> dog_test_cases = np.array([
    ...     [0, 0],  # no food, no bell
    ...     [0, 1],  # no food, bell
    ...     [1, 0],  # food, no bell
    ...     [1, 1],  # food, bell
    ... ])
    >>>
    >>> hebbnet = algorithms.HebbRule(
    ...     layers.Step(2) > layers.Output(1),
    ...     n_unconditioned=1,
    ...     step=0.1,
    ...     decay_rate=0.8,
    ...     verbose=False
    ... )
    >>> hebbnet.train(pavlov_dog_data, epochs=2)
    >>> hebbnet.predict(dog_test_cases)
    array([[-1],
           [ 1],
           [ 1],
           [ 1]])
    """
    decay_rate = BoundedProperty(default=0.2, minval=0)

    def weight_delta(self, input_row, layer_output):
        """
        Compute the weight change for the rows of the weight matrix
        that come after the first ``n_unconditioned`` ones.

        The result is the sum of a decay term, proportional to the
        current weights, and a learning term, proportional to the
        product of the matching input columns and the layer output.
        """
        first_conditioned = self.n_unconditioned

        conditioned_weight = self.weight[first_conditioned:, :]
        conditioned_input = input_row[:, first_conditioned:]
        association = conditioned_input.T.dot(layer_output)

        decay_term = -self.decay_rate * conditioned_weight
        learning_term = self.step * association

        return decay_term + learning_term
class Hessian(BaseOptimizer):
    """
    Hessian gradient descent optimization, also known as Newton's
    method. This algorithm uses second-order derivatives (the hessian
    matrix) in order to choose the correct step during the training
    iteration. Because of this, the method doesn't have a ``step``
    parameter.

    Parameters
    ----------
    penalty_const : float
        Inverse hessian could be a singular matrix. For this reason
        the algorithm adds to the hessian matrix an identity matrix
        multiplied by this constant. Defaults to ``1``.
    {BaseOptimizer.network}
    {BaseOptimizer.loss}
    {BaseOptimizer.regularizer}
    {BaseOptimizer.show_epoch}
    {BaseOptimizer.shuffle_data}
    {BaseOptimizer.signals}
    {BaseOptimizer.verbose}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Notes
    -----
    - Method requires all training data during propagation, which
      means it cannot be trained with mini-batches.
    - This method calculates the full hessian matrix, which means it
      will compute a matrix with NxN elements, where N = number of
      parameters in the network.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.Hessian(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)
    step = WithdrawProperty()

    def init_train_updates(self):
        # Build the update rule: params - (H + penalty * I)^-1 * grad
        damping = asfloat(self.penalty_const)

        # NOTE(review): the identity matrix is sized from
        # ``network.n_parameters`` while the hessian is built only
        # from trainable variables - confirm these always agree.
        n_parameters = self.network.n_parameters
        trainable = [
            variable
            for variable in self.network.variables.values()
            if variable.trainable
        ]
        flat_parameters = make_single_vector(trainable)

        hessian, gradient = find_hessian_and_gradient(
            self.variables.loss, trainable)

        # Solve the damped linear system instead of explicitly
        # inverting the hessian.
        damped_hessian = hessian + damping * tf.eye(n_parameters)
        gradient_column = tf.reshape(gradient, [-1, 1])
        newton_step = tf.matrix_solve(damped_hessian, gradient_column)

        new_flat_parameters = flat_parameters - flatten(newton_step)
        return setup_parameter_updates(trainable, new_flat_parameters)
class RPROP(StepSelectionBuiltIn, BaseGradientDescent):
    """
    Resilient backpropagation (RPROP) is an optimization algorithm
    for supervised learning.

    RPROP algorithm takes into account only the direction of the
    gradient and completely ignores its magnitude. Every weight value
    has a unique step size associated with it (by default all of them
    are equal to ``step``).

    The rule is the following: when the gradient direction changes
    (sign of the gradient) we decrease the step size for the specific
    weight by multiplying it by ``decrease_factor``, and if the sign
    stays the same then we increase the step size for this specific
    weight by multiplying it by ``increase_factor``.

    The step size is always bounded by ``minstep`` and ``maxstep``.

    Notes
    -----
    Algorithm doesn't work with mini-batches.

    Parameters
    ----------
    minstep : float
        Minimum possible value for step. Defaults to ``0.001``.
    maxstep : float
        Maximum possible value for step. Defaults to ``10``.
    increase_factor : float
        Increase factor for step in case when gradient doesn't change
        sign compared to previous epoch. Defaults to ``1.2``.
    decrease_factor : float
        Decrease factor for step in case when gradient changes sign
        compared to previous epoch. Defaults to ``0.5``.
    {BaseGradientDescent.Parameters}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> rpropnet = algorithms.RPROP((2, 3, 1))
    >>> rpropnet.train(x_train, y_train)

    See Also
    --------
    :network:`IRPROPPlus` : iRPROP+ algorithm.
    :network:`GradientDescent` : GradientDescent algorithm.
    """

    # These properties define lower and upper bounds for the
    # per-weight step sizes.
    minstep = BoundedProperty(default=0.001, minval=0)
    maxstep = BoundedProperty(default=10, minval=0)

    # These properties are the factors by which a per-weight step
    # size is multiplied when it has to grow or shrink.
    increase_factor = BoundedProperty(minval=1, default=1.2)
    decrease_factor = ProperFractionProperty(default=0.5)

    def update_prev_delta(self, prev_delta):
        # Hook that produces the delta used to roll back a weight
        # update after a gradient sign change. Here it returns the
        # previous delta unchanged.
        return prev_delta

    def init_train_updates(self):
        # Collects ``(variable, new_value)`` pairs for every
        # parameter yielded by ``iter_params_and_grads``.
        updates = []

        for layer, parameter, gradient in self.iter_params_and_grads():
            # NOTE(review): the original indentation was lost, so the
            # extent of this ``with`` block is reconstructed; only the
            # three state variables are assumed to live in the scope.
            with tf.variable_scope(parameter.op.name):
                steps = tf.Variable(
                    # Every weight starts with the same step size.
                    # ``prev_gradient_sign`` starts at zero, so on
                    # the first update the sign product is zero and
                    # neither the increase nor the decrease branch
                    # below applies; the steps stay unchanged.
                    # NOTE(review): an earlier comment described
                    # re-scaling the initial step to compensate for a
                    # first-iteration decrease; the code does no such
                    # re-scaling.
                    tf.ones_like(parameter) * self.step,
                    name="steps",
                    dtype=tf.float32,
                )
                prev_delta = tf.Variable(
                    tf.zeros(parameter.shape),
                    name="prev-delta",
                    dtype=tf.float32,
                )
                # We collect only signs since it ensures numerical
                # stability after multiplication when we deal with
                # small numbers.
                prev_gradient_sign = tf.Variable(
                    tf.zeros(parameter.shape),
                    name="prev-grad-sign",
                    dtype=tf.float32,
                )

            updated_prev_delta = self.update_prev_delta(prev_delta)
            gradient_sign = tf.sign(gradient)

            # Product of signs: 1 - same direction, -1 - direction
            # changed, 0 - at least one of the signs is zero.
            grad_sign_product = gradient_sign * prev_gradient_sign
            gradient_changed_sign = tf.equal(grad_sign_product, -1)

            # Grow the step when the sign is stable, shrink it when
            # the sign flipped, keep it otherwise; then clip the
            # result into the [minstep, maxstep] range.
            updated_steps = tf.clip_by_value(
                tf.where(
                    tf.equal(grad_sign_product, 1),
                    steps * self.increase_factor,
                    tf.where(
                        gradient_changed_sign,
                        steps * self.decrease_factor,
                        steps,
                    )
                ),
                self.minstep,
                self.maxstep,
            )
            parameter_delta = tf.where(
                gradient_changed_sign,
                # If we subtract previous negative weight update it
                # means that we will revert weight update that has
                # been applied in the previous iteration.
                -updated_prev_delta,
                updated_steps * gradient_sign,
            )
            # Making sure that during the next iteration sign, after
            # we multiplied by the new gradient, won't be negative.
            # Otherwise, the same roll back using previous delta
            # won't make much sense.
            clipped_gradient_sign = tf.where(
                gradient_changed_sign,
                tf.zeros_like(gradient_sign),
                gradient_sign,
            )

            updates.extend([
                (parameter, parameter - parameter_delta),
                (steps, updated_steps),
                (prev_gradient_sign, clipped_gradient_sign),
                (prev_delta, parameter_delta),
            ])

        return updates
class ModifiedRelaxation(BaseLinearNetwork):
    """
    Modified Relaxation Neural Network. Simple linear network. If the
    absolute summated output of the network reaches the defined
    limit, the weight is updated in the same way as in the
    :network:`LMS`; if it is below that limit, the update is made in
    proportion to the expected result.

    Parameters
    ----------
    dead_zone_radius : float
        Indicates the border between a stable network output and a
        weak one. Different update rules are applied depending on
        which side of the border the output falls.
        Defaults to ``0.1``.
    {BaseLinearNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}
    {BaseNetwork.plot_errors}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> input_data = np.array([[1, 0], [2, 2], [3, 3], [0, 0]])
    >>> target_data = np.array([[1], [0], [0], [1]])
    >>>
    >>> mrnet = algorithms.ModifiedRelaxation((2, 1), step=1, verbose=False)
    >>> mrnet.train(input_data, target_data, epochs=100)
    >>> mrnet.predict(np.array([[4, 4], [0, 0]]))
    array([[0],
           [1]])

    See Also
    --------
    :network:`LMS` : LMS Neural Network.
    """
    dead_zone_radius = BoundedProperty(default=0.1, minval=0)

    def init_layer_updates(self, layer):
        # Symbolic variables shared by the whole training graph.
        # NOTE(review): ``network_output`` is presumably the expected
        # (target) output variable - confirm against the base class.
        shared = self.variables
        predicted = shared.train_prediction_func
        expected = shared.network_output
        inputs = shared.network_input
        step = shared.step

        unit_inputs = inputs / inputs.norm(L=2)
        net_sum = inputs.dot(layer.weight) + layer.bias
        lms_error = predicted - expected

        # Outside of the dead zone use the LMS-style error, inside of
        # it use the ``network_output`` variable itself.
        outside_dead_zone = T.abs_(net_sum) >= self.dead_zone_radius
        correction = T.where(outside_dead_zone, lms_error, expected)

        new_weight = layer.weight - step * unit_inputs.T.dot(correction)
        new_bias = layer.bias - step * lms_error.sum(axis=0)

        return [
            (layer.weight, new_weight),
            (layer.bias, new_bias),
        ]
class PNN(LazyLearning, BaseNetwork):
    """
    Probabilistic Neural Network for classification.

    Parameters
    ----------
    std : float
        Standard deviation for PDF function, defaults to ``0.1``.
    {Verbose.verbose}

    Methods
    -------
    {LazyLearning.train}
    {BaseSkeleton.predict}
    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>>
    >>> from sklearn import datasets
    >>> from sklearn import metrics
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms, environment
    >>>
    >>> environment.reproducible()
    >>>
    >>> dataset = datasets.load_digits()
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     dataset.data, dataset.target, train_size=0.7
    ... )
    >>>
    >>> nw = algorithms.PNN(std=10, verbose=False)
    >>> nw.train(x_train, y_train)
    >>> result = nw.predict(x_test)
    >>> metrics.accuracy_score(y_test, result)
    0.98888888888888893
    """
    std = BoundedProperty(default=0.1, minval=0)

    def __init__(self, **options):
        super(PNN, self).__init__(**options)
        # ``None`` marks the network as not trained yet.
        self.classes = None

    def train(self, input_train, target_train, copy=True):
        """
        Store the training data and pre-compute per-class statistics.

        Parameters
        ----------
        input_train : array-like
            Training samples.
        target_train : array-like
            Class labels; after formatting it must have exactly one
            column.
        copy : bool
            Passed to ``format_data`` for both inputs.
            Defaults to ``True``.

        Raises
        ------
        ValueError
            If the formatted target has more than one column.
        """
        input_train = format_data(input_train, copy=copy)
        target_train = format_data(target_train, copy=copy)

        # Validate before any state is stored, so a failed call does
        # not leave freshly stored samples next to stale class data.
        if target_train.shape[1] != 1:
            raise ValueError("Target value must be in 1 dimention")

        LazyLearning.train(self, input_train, target_train)

        classes = self.classes = unique(target_train)
        number_of_classes = classes.size
        row_comb_matrix = self.row_comb_matrix = zeros(
            (number_of_classes, input_train.shape[0]))
        class_ratios = self.class_ratios = zeros(number_of_classes)

        for i, class_name in enumerate(classes):
            # Compare against the class label itself, not its index:
            # labels are not guaranteed to be 0, 1, ..., k - 1.
            class_val_positions = (target_train == class_name)
            row_comb_matrix[i, class_val_positions.ravel()] = 1
            class_ratios[i] = np_sum(class_val_positions)

    def predict_proba(self, input_data):
        """
        Return the raw per-class outputs normalized over classes,
        one row per input sample.
        """
        raw_output = self.predict_raw(input_data)
        total_output_sum = raw_output.sum(axis=0).reshape(
            (raw_output.shape[1], 1))
        return raw_output.T / total_output_sum

    def predict_raw(self, input_data):
        """
        Return the averaged PDF output per class. Rows correspond to
        classes in the order stored in ``self.classes``.

        Raises
        ------
        ValueError
            If the network hasn't been trained or the number of input
            features differs from the training data.
        """
        input_data = format_data(input_data)
        super(PNN, self).predict(input_data)

        if self.classes is None:
            raise ValueError("Train network before predict data")

        input_data_size = input_data.shape[1]
        train_data_size = self.input_train.shape[1]

        if input_data_size != train_data_size:
            raise ValueError("Input data must contains {0} features, got "
                             "{1}".format(train_data_size, input_data_size))

        class_ratios = self.class_ratios
        pdf_outputs = pdf_between_data(self.input_train, input_data, self.std)
        # Sum PDF values per class and divide by the class size.
        return dot(self.row_comb_matrix, pdf_outputs) / class_ratios.reshape(
            (class_ratios.size, 1))

    def predict(self, input_data):
        """
        Return, for every sample, the class with the highest raw
        output.
        """
        raw_output = self.predict_raw(input_data)
        return self.classes[raw_output.argmax(axis=0)]
class GRNN(LazyLearningMixin, BaseNetwork):
    """
    Generalized Regression Neural Network (GRNN). Network applies
    only to the regression problems.

    Parameters
    ----------
    std : float
        Standard deviation for PDF function, defaults to ``0.1``.
        If your input features have high values then the standard
        deviation should also be high. For instance, if input
        features are from range ``[0, 20]`` then the standard
        deviation should also be a big value like ``10`` or ``15``.
        Small values will lead to bad prediction.
    {Verbose.verbose}

    Notes
    -----
    - GRNN Network is sensitive for cases when one input feature has
      higher values than the other one. Before using it make sure
      that input values are normalized and have similar scales.
    - Make sure that the standard deviation is in the same range as
      the input features. Check ``std`` parameter description for
      more information.
    - The bigger the training dataset the slower the prediction.
      It's much more efficient for small datasets.
    {LazyLearningMixin.Notes}

    Methods
    -------
    {LazyLearningMixin.train}
    {BaseSkeleton.predict}
    {BaseSkeleton.fit}

    Examples
    --------
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms, estimators, environment
    >>>
    >>> environment.reproducible()
    >>>
    >>> dataset = datasets.load_diabetes()
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     preprocessing.minmax_scale(dataset.data),
    ...     preprocessing.minmax_scale(dataset.target.reshape((-1, 1))),
    ...     test_size=0.3,
    ... )
    >>>
    >>> nw = algorithms.GRNN(std=0.1, verbose=False)
    >>> nw.train(x_train, y_train)
    >>>
    >>> y_predicted = nw.predict(x_test)
    >>> estimators.rmse(y_predicted, y_test)
    0.2381013391408185
    """
    std = BoundedProperty(default=0.1, minval=0)

    def train(self, input_train, target_train, copy=True):
        """
        Trains network. GRNN doesn't actually train, it just stores
        the input data and uses it for prediction.

        Parameters
        ----------
        input_train : array-like (n_samples, n_features)

        target_train : array-like (n_samples,)
            Target variable should be a vector or a matrix
            with one feature column.

        copy : bool
            If value is equal to ``True`` then input matrices will
            be copied. Defaults to ``True``.

        Raises
        ------
        ValueError
            In case if something is wrong with input data.
        """
        features = format_data(input_train, copy=copy)
        targets = format_data(target_train, copy=copy)

        # Only a single target column is supported.
        if targets.shape[1] != 1:
            raise ValueError("Target value must be one dimensional array")

        LazyLearningMixin.train(self, features, targets)

    def predict(self, input_data):
        """
        Make a prediction from the input data.

        Parameters
        ----------
        input_data : array-like (n_samples, n_features)

        Raises
        ------
        NotTrained
            If the network hasn't been trained yet.

        ValueError
            In case if something is wrong with input data.

        Returns
        -------
        array-like
            One predicted value per input sample, arranged as a
            single column.
        """
        stored_inputs = self.input_train
        if stored_inputs is None:
            raise NotTrained(
                "Cannot make a prediction. "
                "Network hasn't been trained yet")

        samples = format_data(input_data)

        received_features = samples.shape[1]
        expected_features = stored_inputs.shape[1]

        if received_features != expected_features:
            raise ValueError(
                "Input data must contain {0} features, got {1}"
                "".format(expected_features, received_features))

        # Prediction is the average of the stored targets weighted
        # by the PDF values between stored and new samples.
        weights = pdf_between_data(stored_inputs, samples, self.std)
        weighted_targets = dot(self.target_train.T, weights)
        return (weighted_targets / weights.sum(axis=0)).T
class Hessian(StepSelectionBuiltIn, GradientDescent):
    """
    Hessian gradient descent optimization. This GD algorithm
    variation uses second derivative information, which helps to
    choose a better gradient direction and as a consequence a better
    weight update after each epoch.

    Parameters
    ----------
    penalty_const : float
        Inverse hessian could be a singular matrix. For this reason
        the algorithm adds to the hessian matrix an identity matrix
        multiplied by this constant. Defaults to ``1``.
    {GradientDescent.connection}
    {GradientDescent.error}
    {GradientDescent.show_epoch}
    {GradientDescent.shuffle_data}
    {GradientDescent.epoch_end_signal}
    {GradientDescent.train_end_signal}
    {GradientDescent.verbose}
    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Hessian((2, 3, 1))
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)
    step = WithdrawProperty()

    def init_train_updates(self):
        """
        Build the Newton update for all network parameters.

        The update is ``params - inv(H + penalty_const * I) * grad``
        where ``H`` and ``grad`` come from
        ``find_hessian_and_gradient``. The hessian of the current
        iteration is additionally stored in ``self.variables.hessian``.

        Returns
        -------
        list
            Pairs of ``(shared variable, new value)``.
        """
        n_parameters = count_parameters(self.connection)
        parameters = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in parameters])
        penalty_const = asfloat(self.penalty_const)

        # Keeps the most recent hessian matrix available for
        # inspection. The variable name is kept as it was, although
        # it stores the hessian itself and not its inverse.
        self.variables.hessian = theano.shared(
            value=asfloat(np.zeros((n_parameters, n_parameters))),
            name='hessian_inverse',
        )

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)

        # The penalty on the diagonal guards against a singular
        # hessian matrix.
        hessian_inverse = T.nlinalg.matrix_inverse(
            hessian_matrix + penalty_const * T.eye(n_parameters))
        updated_parameters = param_vector - hessian_inverse.dot(full_gradient)

        updates = list(setup_parameter_updates(parameters, updated_parameters))
        updates.append((self.variables.hessian, hessian_matrix))
        return updates
class LevenbergMarquardt(StepSelectionBuiltIn, BaseGradientDescent):
    """
    Levenberg-Marquardt algorithm is a variation of the Newton's
    method. It minimizes MSE error. The algorithm approximates the
    Hessian matrix using a dot product between two jacobian matrices.

    Notes
    -----
    - Method requires all training data during propagation, which
      means it's not allowed to use mini-batches.
    - Network minimizes only Mean Squared Error (MSE) loss function.
    - Efficient for small training datasets, because it computes
      gradient per each sample separately.
    - Efficient for small-sized networks.

    Parameters
    ----------
    {BaseGradientDescent.connection}
    mu : float
        Controls inversion for J.T * J matrix,
        defaults to ``0.01``.
    mu_update_factor : float
        Factor to decrease the mu if the update decreases the error,
        otherwise increase mu by the same factor.
        Defaults to ``1.2``
    error : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.
    {BaseGradientDescent.show_epoch}
    {BaseGradientDescent.shuffle_data}
    {BaseGradientDescent.epoch_end_signal}
    {BaseGradientDescent.train_end_signal}
    {BaseGradientDescent.verbose}
    {BaseGradientDescent.addons}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> lmnet = algorithms.LevenbergMarquardt((2, 3, 1))
    >>> lmnet.train(x_train, y_train)

    See Also
    --------
    :network:`BaseGradientDescent` : BaseGradientDescent algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    error = ChoiceProperty(default='mse', choices={'mse': errors.mse})

    step = WithdrawProperty()

    def init_variables(self):
        super(LevenbergMarquardt, self).init_variables()

        # Damping factor and the error from the previous epoch live
        # in the graph. The error starts as NaN, so the very first
        # comparison against it evaluates to false.
        damping = tf.Variable(self.mu, name='lev-marq/mu')
        previous = tf.Variable(np.nan, name='lev-marq/last-error')
        self.variables.update(mu=damping, last_error=previous)

    def init_train_updates(self):
        shared = self.variables
        expected = shared.network_output
        predicted = shared.train_prediction_func
        previous_error = shared.last_error
        current_error = shared.error_func
        mu = shared.mu

        # Increase damping when the error got worse than during the
        # previous epoch, otherwise relax it.
        error_increased = tf.less(previous_error, current_error)
        new_mu = tf.where(
            error_increased,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        sample_errors = flatten((expected - predicted) ** 2)

        params = parameter_values(self.connection)
        flat_params = make_single_vector(params)

        jacobian = compute_jacobian(sample_errors, params)
        jacobian_t = tf.transpose(jacobian)
        n_params = jacobian.shape[1]

        # Solve (J.T * J + mu * I) * delta = J.T * e
        approx_hessian = (
            tf.matmul(jacobian_t, jacobian) + new_mu * tf.eye(n_params.value))
        right_side = tf.matmul(jacobian_t, tf.expand_dims(sample_errors, 1))
        delta = tf.matrix_solve(approx_hessian, right_side)

        new_flat_params = flat_params - flatten(delta)
        return [(mu, new_mu)] + list(
            setup_parameter_updates(params, new_flat_params))

    def on_epoch_start_update(self, epoch):
        super(LevenbergMarquardt, self).on_epoch_start_update(epoch)

        # Push the most recent error into the graph so the next
        # update can compare against it.
        most_recent = self.errors.last()
        if most_recent is None:
            return

        self.variables.last_error.load(most_recent, tensorflow_session())
class PNN(BaseSkeleton):
    """
    Probabilistic Neural Network (PNN). Network applies only to
    the classification problems.

    Notes
    -----
    - PNN Network is sensitive for cases when one input feature
      has higher values than the other one. Input data has to be
      normalized before training.

    - Standard deviation has to match the range of the input
      features. Check ``std`` parameter description for more
      information.

    - The bigger the training dataset the slower the prediction.
      Algorithm is much more efficient for small datasets.

    - Network uses lazy learning which means that network doesn't
      need iterative training. It just stores parameters
      and uses them to make predictions.

    Parameters
    ----------
    std : float
        Standard deviation for the Probability Density Function
        (PDF). If your input features have high values then the
        standard deviation should also be high. For instance, if
        input features are from range ``[0, 20]`` then the standard
        deviation should also be a big value like ``10`` or ``15``.
        Small values will lead to bad prediction.

    batch_size : int or None
        Set up mini-batch size. The ``None`` value will ensure that
        all data samples will be propagated through the network at
        once. Defaults to ``128``.

    {Verbose.verbose}

    Methods
    -------
    train(X_train, y_train, copy=True)
        Network just stores all the information about the data and
        uses it for the prediction. Parameter ``copy`` copies input
        data before saving it inside the network.

        The ``y_train`` argument should be a vector or
        matrix with one feature column.

    predict(X)
        Return classes associated with each sample in the ``X``.

    predict_proba(X)
        Predict probabilities for each class.

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>>
    >>> from sklearn import datasets, metrics
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms
    >>>
    >>> dataset = datasets.load_digits()
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     dataset.data, dataset.target, test_size=0.3
    ... )
    >>>
    >>> pnn = algorithms.PNN(std=10, verbose=False)
    >>> pnn.train(x_train, y_train)
    >>>
    >>> y_predicted = pnn.predict(x_test)
    >>> metrics.accuracy_score(y_test, y_predicted)
    0.98888888888888893
    """
    std = BoundedProperty(minval=0)
    batch_size = IntProperty(default=128, minval=0, allow_none=True)

    def __init__(self, std, batch_size=128, verbose=False):
        self.std = std
        self.batch_size = batch_size

        # ``classes`` equal to ``None`` marks an untrained network.
        self.classes = None
        self.X_train = None
        self.y_train = None

        super(PNN, self).__init__(batch_size=batch_size, verbose=verbose)

    def train(self, X_train, y_train, copy=True):
        """
        Trains network. PNN doesn't actually train, it just stores
        input data and uses it for prediction.

        Parameters
        ----------
        X_train : array-like (n_samples, n_features)

        y_train : array-like (n_samples,)
            Target variable should be a vector or a matrix
            with one feature column.

        copy : bool
            If value is equal to ``True`` then input matrices will
            be copied. Defaults to ``True``.

        Raises
        ------
        ValueError
            In case if something is wrong with input data.
        """
        X_train = format_data(X_train, copy=copy)
        y_train = format_data(y_train, copy=copy, make_float=False)

        # Validate everything before touching the instance, so a
        # rejected dataset never leaves new samples stored next to
        # class statistics from a previous training.
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError(
                "Number of samples in the input and "
                "target datasets are different")

        if y_train.shape[1] != 1:
            raise ValueError(
                "Target value should be vector or "
                "matrix with only one column")

        self.X_train = X_train
        self.y_train = y_train

        classes = self.classes = np.unique(y_train)
        n_classes = classes.size
        n_samples = X_train.shape[0]

        class_ratios = self.class_ratios = np.zeros(n_classes)
        row_comb_matrix = self.row_comb_matrix = np.zeros(
            (n_classes, n_samples))

        for i, class_name in enumerate(classes):
            # Row ``i`` marks training samples of the i-th class and
            # ``class_ratios[i]`` stores how many of them there are.
            class_val_positions = (y_train == class_name)
            row_comb_matrix[i, class_val_positions.ravel()] = 1
            class_ratios[i] = np.sum(class_val_positions)

    def _raw_output_in_batches(self, X):
        # Runs ``predict_raw`` batch by batch and joins the per-batch
        # results along the sample axis (axis 1).
        outputs = iters.apply_batches(
            function=self.predict_raw,
            inputs=format_data(X),
            batch_size=self.batch_size,
            show_progressbar=self.logs.enable,
        )
        return np.concatenate(outputs, axis=1)

    def predict_proba(self, X):
        """
        Predict probabilities for each class.

        Parameters
        ----------
        X : array-like (n_samples, n_features)

        Returns
        -------
        array-like (n_samples, n_classes)
        """
        raw_output = self._raw_output_in_batches(X)

        # Normalize per sample so that class outputs sum up to one.
        total_output_sum = raw_output.sum(axis=0).reshape((-1, 1))
        return raw_output.T / total_output_sum

    def predict_raw(self, X):
        """
        Raw prediction.

        Parameters
        ----------
        X : array-like (n_samples, n_features)

        Raises
        ------
        NotTrained
            If network hasn't been trained.

        ValueError
            In case if something is wrong with input data.

        Returns
        -------
        array-like (n_classes, n_samples)
            Rows follow the order of ``self.classes``.
        """
        if self.classes is None:
            raise NotTrained(
                "Cannot make a prediction. Network hasn't been trained yet")

        if X.shape[1] != self.X_train.shape[1]:
            raise ValueError(
                "Input data must contain {0} features, got {1}"
                "".format(self.X_train.shape[1], X.shape[1]))

        class_ratios = self.class_ratios.reshape((-1, 1))
        pdf_outputs = pdf_between_data(self.X_train, X, self.std)

        # Sum PDF values per class and divide by the class size.
        return np.dot(self.row_comb_matrix, pdf_outputs) / class_ratios

    def predict(self, X):
        """
        Predicts class from the input data.

        Parameters
        ----------
        X : array-like (n_samples, n_features)

        Returns
        -------
        array-like (n_samples,)
        """
        raw_output = self._raw_output_in_batches(X)
        return self.classes[raw_output.argmax(axis=0)]
class WeightElimination(WeightUpdateConfigurable):
    """
    Weight Elimination algorithm penalizes large weights and limits
    the freedom in network. The algorithm is able to solve one of
    the possible problems of network overfitting.

    Parameters
    ----------
    decay_rate : float
        Controls the effect of penalties on the update of network
        weights. Defaults to ``0.1``.
    zero_weight : float
        Second important parameter for weights penalization.
        Defaults to ``1``. Small value can make all weights close
        to zero. Big value will make a less significant contribution
        to the weights update, which means that with a bigger value
        of the ``zero_weight`` parameter the network allows higher
        values for the weights.

    Warns
    -----
    {WeightUpdateConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     decay_rate=0.1,
    ...     addons=[algorithms.WeightElimination]
    ... )

    See Also
    --------
    :network:`WeightDecay` : Weight Decay penalty.

    Notes
    -----
    Before adding this regularization carefully choose the
    ``decay_rate`` and ``zero_weight`` parameters for the problem.
    Invalid parameters can make weights very close to the origin
    (all values become close to zero).

    References
    ----------
    [1] Weigend, A. S.; Rumelhart, D. E. & Huberman, B. A. (1991),
        Generalization by Weight-Elimination with Application to
        Forecasting, in Richard P. Lippmann; John E. Moody & David S.
        Touretzky, ed., Advances in Neural Information Processing
        Systems, San Francisco, CA: Morgan Kaufmann, pp. 875--882 .
    """
    decay_rate = BoundedProperty(default=0.1, minval=0)
    zero_weight = BoundedProperty(default=1, minval=0)

    def init_train_updates(self):
        base_updates = super(WeightElimination, self).init_train_updates()
        # Only updates that target these parameters get penalized.
        parameters = [param for _, _, param in iter_parameters(self.layers)]

        step = self.variables.step
        decay_koef = asfloat(self.decay_rate * step)
        zero_weight_square = asfloat(self.zero_weight**2)

        penalized_updates = []
        for parameter, updated in base_updates:
            if parameter in parameters:
                # Penalty term:
                # (2 * w / w0^2) / (1 + w^2 / w0^2)^2
                numerator = 2 * parameter / zero_weight_square
                denominator = tf.square(
                    1 + tf.square(parameter) / zero_weight_square)
                updated -= decay_koef * (numerator / denominator)

            # Updates for every other variable pass through as is.
            penalized_updates.append((parameter, updated))

        return penalized_updates
class HebbRule(BaseStepAssociative):
    """
    Neural Network with Hebbian Learning. It's an unsupervised
    algorithm. Network can learn associations from the data.

    Notes
    -----
    - Network always generates weights that contain ``0`` weight for
      the conditioned stimulus and ``1`` for the other. Such
      initialization helps to control the default state for the
      feature learning.

    Parameters
    ----------
    decay_rate : float
        Decay rate controls the network's weights. It helps the
        network to 'forget' information and control the weights'
        size. Without this parameter the network's weights will
        increase fast. Defaults to ``0.2``.

    {BaseStepAssociative.Parameters}

    Methods
    -------
    {BaseStepAssociative.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> pavlov_dog_data = np.array([
    ...     [1, 0],  # food, no bell
    ...     [1, 1],  # food, bell
    ... ])
    >>> dog_test_cases = np.array([
    ...     [0, 0],  # no food, no bell
    ...     [0, 1],  # no food, bell
    ...     [1, 0],  # food, no bell
    ...     [1, 1],  # food, bell
    ... ])
    >>>
    >>> hebbnet = algorithms.HebbRule(
    ...     n_inputs=2,
    ...     n_outputs=1,
    ...     n_unconditioned=1,
    ...     step=0.1,
    ...     decay_rate=0.8,
    ...     verbose=False
    ... )
    >>> hebbnet.train(pavlov_dog_data, epochs=2)
    >>> hebbnet.predict(dog_test_cases)
    array([[0],
           [1],
           [1],
           [1]])
    """
    decay_rate = BoundedProperty(default=0.2, minval=0)

    def weight_delta(self, input_row, layer_output):
        # Everything past the first ``n_unconditioned`` entries
        # belongs to the conditioned stimuli.
        conditioned = slice(self.n_unconditioned, None)

        weight = self.weight[conditioned, :]
        delta = input_row[:, conditioned].T.dot(layer_output)

        # Decay shrinks the current weights, the Hebbian term
        # (input-output correlation scaled by ``step``) grows them.
        forgetting = -self.decay_rate * weight
        learning = self.step * delta
        return forgetting + learning
class ErrDiffStepUpdate(SingleStepConfigurable):
    """
    This algorithm updates the step based on the error difference
    between epochs.

    Parameters
    ----------
    update_for_smaller_error : float
        The ``step`` is multiplied by this value if the error was
        less than in the previous epoch. Defaults to ``1.05``.
        Value can't be less than ``1``.
    update_for_bigger_error : float
        The ``step`` is multiplied by this value if the error grew
        by more than the ``error_difference`` ratio compared to the
        previous epoch. Defaults to ``0.7``.
    error_difference : float
        The value indicates by what ratio the error has to increase
        compared to the previous epoch in order to trigger the step
        reduction. Defaults to ``1.04``.
        Value can't be less than ``1``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     addons=[algorithms.ErrDiffStepUpdate]
    ... )
    """
    update_for_smaller_error = BoundedProperty(default=1.05, minval=1)
    update_for_bigger_error = ProperFractionProperty(default=0.7)
    error_difference = BoundedProperty(default=1.04, minval=1)

    def init_variables(self):
        # Both errors start as NaN. Comparisons against NaN are
        # false, so the step stays unchanged until real values
        # have been loaded.
        self.variables.update(
            last_error=tf.Variable(
                np.nan,
                name='err-diff-step-update/last-error',
            ),
            previous_error=tf.Variable(
                np.nan,
                name='err-diff-step-update/previous-error',
            ),
        )
        super(ErrDiffStepUpdate, self).init_variables()

    def init_train_updates(self):
        updates = super(ErrDiffStepUpdate, self).init_train_updates()

        step = self.variables.step
        last_error = self.variables.last_error
        previous_error = self.variables.previous_error

        # Three outcomes:
        # - error decreased: increase the step;
        # - error grew by more than ``error_difference`` times:
        #   reduce the step;
        # - otherwise: keep the step as it is.
        # The growth threshold has to be ``error_difference``;
        # comparing against ``update_for_bigger_error`` (a value
        # below one) would reduce the step on every non-improving
        # epoch and make the last outcome unreachable.
        step_update_condition = tf.where(
            last_error < previous_error,
            self.update_for_smaller_error * step,
            tf.where(
                last_error > self.error_difference * previous_error,
                self.update_for_bigger_error * step,
                step
            )
        )
        updates.append((step, step_update_condition))
        return updates

    def on_epoch_start_update(self, epoch):
        super(ErrDiffStepUpdate, self).on_epoch_start_update(epoch)

        previous_error = self.errors.previous()
        # Explicit ``None`` check: an error equal to zero is a valid
        # value and still has to be loaded into the graph.
        if previous_error is not None:
            session = tensorflow_session()
            last_error = self.errors.last()

            self.variables.last_error.load(last_error, session)
            self.variables.previous_error.load(previous_error, session)
class LevenbergMarquardt(BaseOptimizer):
    """
    Levenberg-Marquardt algorithm is a variation of the Newton's
    method. It minimizes MSE error. The algorithm approximates the
    Hessian matrix using a dot product between two jacobian matrices.

    Notes
    -----
    - Method requires all training data during propagation, which
      means it's not allowed to use mini-batches.

    - Network minimizes only Mean Squared Error (MSE) loss function.

    - Efficient for small training datasets, because it computes
      gradient per each sample separately.

    - Efficient for small-sized networks.

    Parameters
    ----------
    {BaseOptimizer.network}

    mu : float
        Controls inversion for J.T * J matrix,
        defaults to ``0.01``.

    mu_update_factor : float
        Factor to decrease the mu if the update decreases the error,
        otherwise increase mu by the same factor.
        Defaults to ``1.2``

    loss : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {BaseOptimizer.show_epoch}
    {BaseOptimizer.shuffle_data}
    {BaseOptimizer.signals}
    {BaseOptimizer.verbose}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.LevenbergMarquardt(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`BaseOptimizer` : BaseOptimizer algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    loss = ChoiceProperty(default='mse', choices={'mse': objectives.mse})

    step = WithdrawProperty()
    regularizer = WithdrawProperty()

    def init_functions(self):
        # State variables have to exist before the parent class
        # builds its functions. The stored error starts as NaN, so
        # the first comparison against it evaluates to false.
        mu_variable = tf.Variable(self.mu, name='lev-marq/mu')
        error_variable = tf.Variable(np.nan, name='lev-marq/last-error')
        self.variables.update(mu=mu_variable, last_error=error_variable)

        super(LevenbergMarquardt, self).init_functions()

    def init_train_updates(self):
        predictions = self.network.training_outputs
        stored_error = self.variables.last_error
        loss_value = self.variables.loss
        mu = self.variables.mu

        # Damping grows when the loss is above the stored error
        # and shrinks otherwise.
        new_mu = tf.where(
            tf.less(stored_error, loss_value),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        squared_errors = flatten((self.target - predictions)**2)

        trainable = [
            variable
            for variable in self.network.variables.values()
            if variable.trainable
        ]
        flat_trainable = make_single_vector(trainable)

        J = compute_jacobian(squared_errors, trainable)
        J_T = tf.transpose(J)
        n_params = J.shape[1]

        # Solve (J.T * J + mu * I) * delta = J.T * e
        left_side = tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value)
        right_side = tf.matmul(J_T, tf.expand_dims(squared_errors, 1))
        delta = tf.matrix_solve(left_side, right_side)

        updates = [(mu, new_mu)]
        updates.extend(
            setup_parameter_updates(trainable, flat_trainable - flatten(delta))
        )
        return updates

    def one_training_update(self, X_train, y_train):
        # Load the latest recorded training error (if there is one)
        # into the graph before running the update.
        train_errors = self.errors.train
        if train_errors:
            self.variables.last_error.load(
                self.errors.train[-1], tensorflow_session())

        parent = super(LevenbergMarquardt, self)
        return parent.one_training_update(X_train, y_train)
class LeakStepAdaptation(SingleStepConfigurable):
    """
    Leak Learning Rate Adaptation algorithm for the step adaptation
    procedure in the backpropagation algorithm. The step is adjusted
    using the norm of a leaky average of the full gradient.

    Parameters
    ----------
    leak_size : float
        Defaults to ``0.01``. This variable identifies a proportion,
        so it's always between 0 and 1. Usually this value is small.

    alpha : float
        The ``alpha`` controls the total step update ratio (it's
        similar to the step role in the weight update procedure).
        Defaults to ``0.001``. Typically this value is small.

    beta : float
        This is similar to ``alpha``, but it controls the ratio only
        for the update matrix norms. Defaults to ``20``.
        Typically this value is > 1.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     addons=[algorithms.LeakStepAdaptation]
    ... )

    References
    ----------
    .. [1] Noboru M. "Adaptive on-line learning in changing
        environments", 1997
    .. [2] LeCun, "Efficient BackProp", 1998
    """
    leak_size = ProperFractionProperty(default=0.01)
    alpha = BoundedProperty(default=0.001, minval=0)
    beta = BoundedProperty(default=20, minval=0)

    def init_variables(self):
        """
        Create the ``leak_average`` shared variable, a zero vector
        with one entry per network parameter.
        """
        super(LeakStepAdaptation, self).init_variables()

        initial_average = asfloat(np.zeros(count_parameters(self)))
        self.variables.leak_average = theano.shared(
            value=initial_average, name='leak_average')

    def init_train_updates(self):
        """
        Extend the parent updates with the updates for the leaky
        gradient average and for the step.

        Returns
        -------
        list
            Parent updates followed by the ``leak_average`` and
            ``step`` updates.
        """
        updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha, beta = self.alpha, self.beta
        leak_size = self.leak_size

        step = self.variables.step
        leak_average = self.variables.leak_average

        # Gradient of the error with respect to every parameter,
        # flattened into one vector.
        flat_gradients = [
            gradient.flatten()
            for gradient in T.grad(
                self.variables.error_func,
                wrt=list(iter_parameters(self)),
            )
        ]
        full_gradient = T.concatenate(flat_gradients)

        new_leak_average = (
            (1 - leak_size) * leak_average + leak_size * full_gradient)

        average_norm = new_leak_average.norm(L=2)
        new_step = step + alpha * step * (beta * average_norm - step)

        updates.append((leak_average, new_leak_average))
        updates.append((step, new_step))

        return updates
class PNN(BaseNetwork, LazyLearningMixin, MinibatchTrainingMixin):
    """
    Probabilistic Neural Network (PNN). Network applies only to
    the classification problems.

    Notes
    -----
    - PNN Network is sensitive for cases when one input feature
      has higher values than the other one. Before use it make
      sure that input values are normalized and have similar scales.

    - Make sure that standard deviation in the same range as
      input features. Check ``std`` parameter description for
      more information.

    - The bigger training dataset the slower prediction.
      It's much more efficient for small datasets.

    {LazyLearningMixin.Notes}

    Parameters
    ----------
    std : float
        Standard deviation for the Probability Density Function (PDF).
        Defaults to ``0.1``. If your input features have high values
        than standard deviation should also be high. For instance,
        if input features from range ``[0, 20]`` that standard
        deviation should be also a big value like ``10`` or ``15``.
        Small values will lead to bad prediction.

    {MinibatchTrainingMixin.batch_size}

    {BaseNetwork.verbose}

    Methods
    -------
    {LazyLearningMixin.train}
        The ``target_train`` argument should be a vector or
        matrix with one feature column.

    {BaseSkeleton.predict}

    predict_proba(input_data)
        Predict probabilities for each class.

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>>
    >>> from sklearn import datasets, metrics
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms, environment
    >>>
    >>> environment.reproducible()
    >>>
    >>> dataset = datasets.load_digits()
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     dataset.data, dataset.target, train_size=0.7
    ... )
    >>>
    >>> pnn = algorithms.PNN(std=10, verbose=False)
    >>> pnn.train(x_train, y_train)
    >>>
    >>> y_predicted = pnn.predict(x_test)
    >>> metrics.accuracy_score(y_test, y_predicted)
    0.98888888888888893
    """
    std = BoundedProperty(default=0.1, minval=0)

    def __init__(self, **options):
        super(PNN, self).__init__(**options)
        # ``None`` marks the network as not trained yet.
        self.classes = None

    def train(self, input_train, target_train, copy=True):
        """
        Trains network. PNN doesn't actually train, it just stores
        input data and use it for prediction.

        Parameters
        ----------
        input_train : array-like (n_samples, n_features)

        target_train : array-like (n_samples,)
            Target variable should be vector or matrix
            with one feature column.

        copy : bool
            If value equal to ``True`` than input matrices will
            be copied. Defaults to ``True``.

        Raises
        ------
        ValueError
            In case if something is wrong with input data.
        """
        input_train = format_data(input_train, copy=copy)
        target_train = format_data(target_train, copy=copy, make_float=False)

        # Check the target shape before anything is stored on the
        # instance, so that a rejected target cannot leave new training
        # data next to class information from a previous training.
        if target_train.shape[1] != 1:
            raise ValueError("Target value should be a vector or a "
                             "matrix with one column")

        LazyLearningMixin.train(self, input_train, target_train)

        classes = np.unique(target_train)
        n_classes = classes.size
        n_samples = input_train.shape[0]

        # Number of training samples per class.
        class_ratios = np.zeros(n_classes)
        # Row ``i`` has ones at the positions of the training samples
        # that belong to class ``i``.
        row_comb_matrix = np.zeros((n_classes, n_samples))

        for i, class_name in enumerate(classes):
            class_val_positions = (target_train == class_name)
            row_comb_matrix[i, class_val_positions.ravel()] = 1
            class_ratios[i] = np.sum(class_val_positions)

        self.classes = classes
        self.class_ratios = class_ratios
        self.row_comb_matrix = row_comb_matrix

    def _predict_raw_in_batches(self, input_data):
        """
        Run ``predict_raw`` over the input in batches and join the
        per-batch outputs along the second axis.
        """
        outputs = self.apply_batches(
            function=self.predict_raw,
            input_data=format_data(input_data),
            description='Prediction batches',
            show_progressbar=True,
            show_error_output=False,
        )
        return np.concatenate(outputs, axis=1)

    def predict_proba(self, input_data):
        """
        Predict probabilities for each class.

        Parameters
        ----------
        input_data : array-like (n_samples, n_features)

        Returns
        -------
        array-like (n_samples, n_classes)
        """
        raw_output = self._predict_raw_in_batches(input_data)

        # Normalize per sample, so that class outputs sum up to one.
        total_output_sum = raw_output.sum(axis=0).reshape((-1, 1))
        return raw_output.T / total_output_sum

    def predict_raw(self, input_data):
        """
        Raw prediction.

        Parameters
        ----------
        input_data : array-like (n_samples, n_features)

        Raises
        ------
        NotTrained
            If network hasn't been trained.

        ValueError
            In case if something is wrong with input data.

        Returns
        -------
        array-like (n_classes, n_samples)
        """
        if self.classes is None:
            raise NotTrained("Cannot make a prediction. Network "
                             "hasn't been trained yet")

        input_data_size = input_data.shape[1]
        train_data_size = self.input_train.shape[1]

        if input_data_size != train_data_size:
            raise ValueError("Input data must contain {0} features, got "
                             "{1}".format(train_data_size, input_data_size))

        class_ratios = self.class_ratios.reshape((-1, 1))
        pdf_outputs = pdf_between_data(self.input_train, input_data, self.std)

        # Sum the PDF outputs per class and average them by the number
        # of training samples in that class.
        return np.dot(self.row_comb_matrix, pdf_outputs) / class_ratios

    def predict(self, input_data):
        """
        Predicts class from the input data.

        Parameters
        ----------
        input_data : array-like (n_samples, n_features)

        Returns
        -------
        array-like (n_samples,)
        """
        raw_output = self._predict_raw_in_batches(input_data)
        return self.classes[raw_output.argmax(axis=0)]