class Dropout(BaseLayer):
    """
    Dropout layer.

    Multiplies the input by a random binary mask and divides the
    result by the probability of keeping a unit.

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between 0 and 1.
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        # ``proba`` is accepted positionally and forwarded to the
        # parent class together with the rest of the options.
        init_options = dict(options, proba=proba)
        super(Dropout, self).__init__(**init_options)

    @property
    def size(self):
        # Layer size is taken from the layer this one relates to.
        related_layer = self.relate_to_layer
        return related_layer.size

    def output(self, input_value):
        # The seed for the Theano stream is drawn from NumPy, so that
        # seeding NumPy makes the Theano code easily reproducible.
        seed_upper_bound = 4e9
        random_seed = np.random.randint(seed_upper_bound)
        random_stream = T.shared_randomstreams.RandomStreams(random_seed)

        keep_proba = (1.0 - self.proba)
        keep_mask = random_stream.binomial(
            n=1,
            p=keep_proba,
            size=input_value.shape,
            dtype=input_value.dtype,
        )
        masked_input = keep_mask * input_value
        return masked_input / keep_proba

    def __repr__(self):
        classname = self.__class__.__name__
        return "{name}(proba={proba})".format(name=classname,
                                              proba=self.proba)
class Dropout(BaseLayer):
    """
    Dropout layer.

    Acts as an identity when the layer is not in the training state.

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between ``0`` and ``1``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        super(Dropout, self).__init__(proba=proba, **options)

    def output(self, input_value):
        if self.training_state:
            # TensorFlow expects probability of keeping a unit,
            # which is the complement of the drop probability.
            keep_proba = (1.0 - self.proba)
            return tf.nn.dropout(input_value, keep_prob=keep_proba)

        return input_value

    def __repr__(self):
        layer_type = self.__class__
        return "{}(proba={})".format(layer_type.__name__, self.proba)
class Momentum(MinibatchGradientDescent):
    """
    Momentum algorithm.

    Parameters
    ----------
    momentum : float
        Controls how much of the previous velocity is carried over
        into the current update. Defaults to ``0.9``.

    nesterov : bool
        Compute Nesterov momentum instead of the classic momentum.
        Defaults to ``False``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Momentum((2, 3, 1))
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    momentum = ProperFractionProperty(default=0.9)
    nesterov = Property(default=False, expected_type=bool)

    def init_param_updates(self, layer, parameter):
        step = self.variables.step

        # Velocity starts from zeros and has the same shape
        # as the parameter that it updates.
        initial_velocity = asfloat(np.zeros(parameter.get_value().shape))
        previous_velocity = theano.shared(
            name="{}/previous-velocity".format(parameter.name),
            value=initial_velocity,
        )
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        def advance(current_velocity):
            return self.momentum * current_velocity - step * gradient

        velocity = advance(previous_velocity)

        if self.nesterov:
            # Nesterov variation applies the same rule once more
            # on top of the freshly computed velocity.
            velocity = advance(velocity)

        parameter_update = (parameter, parameter + velocity)
        velocity_update = (previous_velocity, velocity)
        return [parameter_update, velocity_update]
class Dropout(BaseLayer):
    """
    Dropout layer.

    During training multiplies the input by a random binary mask
    and divides the result by the probability of keeping a unit.
    Outside of the training state the input is returned unchanged.

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between 0 and 1.
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        # ``proba`` is accepted positionally and forwarded to the
        # parent class together with the rest of the options.
        init_options = dict(options, proba=proba)
        super(Dropout, self).__init__(**init_options)

    @property
    def size(self):
        # Layer size is taken from the layer this one relates to.
        related_layer = self.relate_to_layer
        return related_layer.size

    def output(self, input_value):
        if self.training_state:
            random_stream = theano_random_stream()
            keep_proba = (1.0 - self.proba)
            keep_mask = random_stream.binomial(
                n=1,
                p=keep_proba,
                size=input_value.shape,
                dtype=input_value.dtype,
            )
            return (keep_mask * input_value) / keep_proba

        return input_value

    def __repr__(self):
        layer_type = self.__class__
        return "{}(proba={})".format(layer_type.__name__, self.proba)
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-5``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        """
        Attach a zero-initialized running average of the squared
        gradient to every parameter of every layer.
        """
        super(RMSProp, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        """
        Build RMSProp updates for one parameter.

        Returns
        -------
        list of tuples
            Update for the running average of the squared gradient
            followed by the update for the parameter itself.
        """
        # NOTE: RMSProp is a first-order method. It needs only the
        # gradient and its running squared average, so no Hessian
        # (quadratic in the number of parameters) is allocated or
        # computed here.
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        mean_squred_grad = (
            self.decay * prev_mean_squred_grad +
            (1 - self.decay) * gradient**2
        )
        # Epsilon keeps the denominator away from zero.
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate for the running average of the squared gradient.
        Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value that is added under the square root in the
        denominator. Value need to be greater than ``0``.
        Defaults to ``1e-5``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.RMSProp((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_param_updates(self, layer, parameter):
        step = self.variables.step

        # Running average starts from zeros and has the same
        # shape as the parameter.
        initial_average = asfloat(np.zeros(T.shape(parameter).eval()))
        prev_mean_squred_grad = theano.shared(
            name="{}/prev-mean-squared-grad".format(parameter.name),
            value=initial_average,
        )

        gradient = T.grad(self.variables.error_func, wrt=parameter)
        old_part = self.decay * prev_mean_squred_grad
        new_part = (1 - self.decay) * gradient**2
        mean_squred_grad = (old_part + new_part)

        gradient_scale = T.sqrt(mean_squred_grad + self.epsilon)
        parameter_delta = gradient / gradient_scale

        average_update = (prev_mean_squred_grad, mean_squred_grad)
        parameter_update = (parameter, parameter - step * parameter_delta)
        return [average_update, parameter_update]
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate for the running average of the squared gradient.
        Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value that is added under the square root in the
        denominator. Value need to be greater than ``0``.
        Defaults to ``1e-5``.

    {MinibatchGradientDescent.batch_size}

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        super(RMSProp, self).init_layers()

        for layer in self.layers:
            for layer_parameter in layer.parameters:
                # Running average starts from zeros and has the
                # same shape as the parameter it belongs to.
                shape = T.shape(layer_parameter).eval()
                initial_average = asfloat(np.zeros(shape))
                layer_parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + layer_parameter.name,
                    value=initial_average,
                )

    def init_param_updates(self, layer, parameter):
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        old_part = self.decay * prev_mean_squred_grad
        new_part = (1 - self.decay) * gradient**2
        mean_squred_grad = (old_part + new_part)

        gradient_scale = T.sqrt(mean_squred_grad + self.epsilon)
        parameter_delta = gradient / gradient_scale

        average_update = (prev_mean_squred_grad, mean_squred_grad)
        parameter_update = (parameter, parameter - step * parameter_delta)
        return [average_update, parameter_update]
class Momentum(GradientDescent):
    """
    Momentum algorithm.

    Parameters
    ----------
    momentum : float
        Controls how much of the previous gradient is carried over
        into the current update. Defaults to ``0.9``.

    nesterov : bool
        Compute Nesterov momentum instead of the classic momentum.
        Defaults to ``False``.

    {GradientDescent.Parameters}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.Momentum(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    momentum = ProperFractionProperty(default=0.9)
    nesterov = Property(default=False, expected_type=bool)

    def init_train_updates(self):
        # Update rule is delegated to the TensorFlow optimizer.
        optimizer_options = dict(
            use_nesterov=self.nesterov,
            momentum=self.momentum,
            learning_rate=self.step,
        )
        momentum_optimizer = tf.train.MomentumOptimizer(**optimizer_options)

        # Optimizer is stored before the minimization operation
        # gets created from it.
        self.functions.optimizer = momentum_optimizer
        minimize_loss = momentum_optimizer.minimize(self.variables.loss)
        return [minimize_loss]
class Dropout(Identity):
    """
    Dropout layer. It randomly switches off (multiplies by zero)
    input values, where probability to be switched off per each value
    can be controlled with the ``proba`` parameter. For example,
    ``proba=0.2`` will mean that roughly 20% of the input values will
    be multiplied by 0. The remaining values are kept and, as
    documented for ``tf.nn.dropout``, scaled up by ``1 / (1 - proba)``.

    It's important to note that output from the dropout is controlled by
    the ``training`` parameter in the ``output`` method. Dropout
    will be applied only in cases when ``training=True`` is propagated
    through the network, otherwise it will act as an identity.

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between ``0`` and ``1``.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    Examples
    --------
    >>> from neupy.layers import *
    >>> network = join(
    ...     Input(10),
    ...     Relu(5) >> Dropout(0.5),
    ...     Relu(5) >> Dropout(0.5),
    ...     Sigmoid(1),
    ... )
    >>> network
    (?, 10) -> [... 6 layers ...] -> (?, 1)
    """
    proba = ProperFractionProperty()

    def __init__(self, proba, name=None):
        super(Dropout, self).__init__(name=name)
        # Assigned after parent initialization, same as any
        # other property of the layer.
        self.proba = proba

    def output(self, input_value, training=False):
        if training:
            # TensorFlow expects probability of keeping a value,
            # which is the complement of the drop probability.
            keep_proba = (1.0 - self.proba)
            return tf.nn.dropout(input_value, keep_prob=keep_proba)

        return input_value
class StepOutput(Output):
    """
    The behaviour for this layer is the same as for step function.

    Parameters
    ----------
    output_bounds : tuple
        Value must be a tuple which contains two elements, where
        the first one identifies the lower output value and the
        second one the upper output value. Defaults to ``(0, 1)``.

    critical_point : float
        Critical point sets up the step function bias. Value equal
        to this point will be mapped to the lower bound.
        Defaults to ``0``.

    {Output.size}
    """
    output_bounds = TypedListProperty(default=(0, 1))
    critical_point = ProperFractionProperty(default=0)

    def output(self, value):
        low_output, high_output = self.output_bounds
        # Values on the critical point itself belong to the lower bound.
        is_low = value <= self.critical_point
        return np.where(is_low, low_output, high_output)
class Dropout(BaseLayer):
    """
    Dropout layer.

    During training multiplies the input by a random binary mask
    and divides the result by the probability of keeping a unit.
    Outside of the training state the input is returned unchanged.

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between ``0`` and ``1``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        super(Dropout, self).__init__(proba=proba, **options)

    def output(self, input_value):
        if self.training_state:
            random_stream = theano_random_stream()
            keep_proba = (1.0 - self.proba)
            keep_mask = random_stream.binomial(
                n=1,
                p=keep_proba,
                size=input_value.shape,
                dtype=input_value.dtype,
            )
            return (keep_mask * input_value) / keep_proba

        return input_value

    def __repr__(self):
        layer_type = self.__class__
        return "{}(proba={})".format(layer_type.__name__, self.proba)
class ErrDiffStepUpdate(SingleStepConfigurable):
    """
    This algorithm makes step update based on the error difference
    between epochs.

    Parameters
    ----------
    update_for_smaller_error : float
        Multiplies this option to ``step`` if the error was less
        than in the previous epoch. Defaults to ``1.05``. Value
        can't be less than ``1``.

    update_for_bigger_error : float
        Multiplies this option to ``step`` if the error grew by more
        than the ``error_difference`` ratio compared to the previous
        epoch. Defaults to ``0.7``.

    error_difference : float
        The value indicates how much the error has to increase,
        as a ratio to the error from the previous epoch, in order to
        trigger the step reduction. Defaults to ``1.04``. Value
        can't be less than ``1``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     addons=[algorithms.ErrDiffStepUpdate]
    ... )
    """
    update_for_smaller_error = BoundedProperty(default=1.05, minval=1)
    update_for_bigger_error = ProperFractionProperty(default=0.7)
    error_difference = BoundedProperty(default=1.04, minval=1)

    def init_variables(self):
        # Both errors start as NaN. Comparisons with NaN are false,
        # so the step stays the same until real errors get loaded.
        self.variables.update(
            last_error=tf.Variable(
                np.nan,
                name='err-diff-step-update/last-error',
            ),
            previous_error=tf.Variable(
                np.nan,
                name='err-diff-step-update/previous-error',
            ),
        )
        super(ErrDiffStepUpdate, self).init_variables()

    def init_train_updates(self):
        updates = super(ErrDiffStepUpdate, self).init_train_updates()

        step = self.variables.step
        last_error = self.variables.last_error
        previous_error = self.variables.previous_error

        # - error decreased: increase the step
        # - error increased by more than the ``error_difference``
        #   ratio: decrease the step
        # - otherwise: keep the step
        step_update_condition = tf.where(
            last_error < previous_error,
            self.update_for_smaller_error * step,
            tf.where(
                last_error > self.error_difference * previous_error,
                self.update_for_bigger_error * step,
                step
            )
        )
        updates.append((step, step_update_condition))
        return updates

    def on_epoch_start_update(self, epoch):
        super(ErrDiffStepUpdate, self).on_epoch_start_update(epoch)

        previous_error = self.errors.previous()
        # Errors are loaded into the graph only when there is a
        # previous error to compare with.
        # NOTE(review): truthiness check also skips a previous error
        # that is exactly zero - confirm that this is intended.
        if previous_error:
            session = tensorflow_session()
            last_error = self.errors.last()

            self.variables.last_error.load(last_error, session)
            self.variables.previous_error.load(previous_error, session)
class Momentum(MinibatchGradientDescent):
    """
    Momentum algorithm for :network:`GradientDescent` optimization.

    Parameters
    ----------
    momentum : float
        Controls how much of the previous parameter update is
        carried over into the current one. Defaults to ``0.9``.

    nesterov : bool
        Compute Nesterov momentum instead of the classic momentum.
        Defaults to ``False``.

    {MinibatchGradientDescent.batch_size}

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Momentum(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    momentum = ProperFractionProperty(default=0.9)
    nesterov = Property(default=False, expected_type=bool)

    def init_layers(self):
        super(Momentum, self).init_layers()

        for layer in self.layers:
            for layer_parameter in layer.parameters:
                # Previous update starts from zeros and has the
                # same shape as the parameter it belongs to.
                shape = T.shape(layer_parameter).eval()
                initial_delta = asfloat(np.zeros(shape))
                layer_parameter.prev_param_delta = theano.shared(
                    name="prev_param_delta_" + layer_parameter.name,
                    value=initial_delta,
                )

    def init_param_updates(self, layer, parameter):
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)
        prev_param_delta = parameter.prev_param_delta

        def advance(current_delta):
            return self.momentum * current_delta - step * gradient

        parameter_delta = advance(prev_param_delta)

        if self.nesterov:
            # Nesterov variation applies the same rule once more
            # on top of the freshly computed update.
            parameter_delta = advance(parameter_delta)

        parameter_update = (parameter, parameter + parameter_delta)
        delta_update = (prev_param_delta, parameter_delta)
        return [parameter_update, delta_update]
class GrowingNeuralGas(BaseNetwork):
    """
    Growing Neural Gas (GNG) algorithm.

    Current algorithm has two modifications that haven't been mentioned
    in the paper, but they help to speed up training.

    - The ``n_start_nodes`` parameter provides possibility to increase
      number of nodes during initialization step. It's useful when
      algorithm takes a lot of time building up large amount of neurons.

    - The ``min_distance_for_update`` parameter allows to speed up
      training when some data samples have neurons very close to them.
      The ``min_distance_for_update`` parameter controls threshold for
      the minimum distance for which we will want to update weights.

    Parameters
    ----------
    n_inputs : int
        Number of features in each sample.

    n_start_nodes : int
        Number of nodes that algorithm generates from the data during
        the initialization step. Defaults to ``2``.

    step : float
        Step (learning rate) for the neuron winner. Defaults to ``0.2``.

    neighbour_step : float
        Step (learning rate) for the neurons that are connected via edges
        with the neuron winner. This value typically has to be smaller
        than the ``step`` value. Defaults to ``0.05``.

    max_edge_age : int
        It means that if edge won't be updated for ``max_edge_age``
        iterations then it would be removed. The larger the value the
        more updates we allow to do before removing edge.
        Defaults to ``100``.

    n_iter_before_neuron_added : int
        Each ``n_iter_before_neuron_added`` weight update algorithm adds
        new neuron. The smaller the value the more frequently algorithm
        adds new neurons to the network. Defaults to ``1000``.

    error_decay_rate : float
        This error decay rate would be applied to every neuron in the
        graph after each training iteration. It ensures that old errors
        will be reduced over time. Defaults to ``0.995``.

    after_split_error_decay_rate : float
        This decay rate reduces error for neurons with largest errors
        after algorithm added new neuron. This value is typically lower
        than ``error_decay_rate``. Defaults to ``0.5``.

    max_nodes : int
        Maximum number of nodes that would be generated during the
        training. This parameter won't stop training when maximum number
        of nodes will be exceeded. Defaults to ``1000``.

    min_distance_for_update : float
        Parameter controls for which neurons we want to apply updates.
        In case if euclidean distance between data sample and closest
        neuron will be less than the ``min_distance_for_update`` value
        then update would be skipped for this data sample. Setting value
        to zero will disable effect provided by this parameter.
        Defaults to ``0``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    {Verbose.verbose}

    Methods
    -------
    train(X_train, epochs=100)
        Network learns topological structure of the data. Learned
        structure will be stored in the ``graph`` attribute.

    {BaseSkeleton.fit}

    initialize_nodes(data)
        Network initializes nodes randomly sampling ``n_start_nodes``
        from the data. It would be applied automatically before
        the training in case if graph is empty.

        Note: Node re-initialization can reset network.

    Notes
    -----
    - Unlike other algorithms this network doesn't make predictions.
      Instead, it learns topological structure of the data in form of
      the graph. After the training, structure of the network can be
      extracted from the ``graph`` attribute.

    - In order to speed up training, it might be useful to increase
      the ``n_start_nodes`` parameter.

    - During the training it happens that nodes learn topological
      structure of one part of the data better than the other, mostly
      because of the different data sample density in different places.
      Increasing the ``min_distance_for_update`` can speed up training
      ignoring updates for the neurons that are very close to the data
      sample (below specified ``min_distance_for_update`` value).
      Training can be stopped in case if none of the neurons has been
      updated during the training epoch.

    Attributes
    ----------
    graph : NeuralGasGraph instance
        This attribute stores all neurons and connections between them
        in the form of undirected graph.

    {BaseNetwork.Attributes}

    Examples
    --------
    >>> from neupy import algorithms
    >>> from sklearn.datasets import make_blobs
    >>>
    >>> data, _ = make_blobs(
    ...     n_samples=1000,
    ...     n_features=2,
    ...     centers=2,
    ...     cluster_std=0.4,
    ... )
    >>>
    >>> neural_gas = algorithms.GrowingNeuralGas(
    ...     n_inputs=2,
    ...     shuffle_data=True,
    ...     verbose=True,
    ...     max_edge_age=10,
    ...     n_iter_before_neuron_added=50,
    ...     max_nodes=100,
    ... )
    >>> neural_gas.train(data, epochs=100)
    >>> neural_gas.graph.n_nodes
    100
    >>> len(neural_gas.graph.edges)
    175
    >>> edges = list(neural_gas.graph.edges.keys())
    >>> neuron_1, neuron_2 = edges[0]
    >>>
    >>> neuron_1.weight
    array([[-6.77166299,  2.4121606 ]])
    >>> neuron_2.weight
    array([[-6.829309  ,  2.27839633]])

    References
    ----------
    [1] A Growing Neural Gas Network Learns Topologies, Bernd Fritzke
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_start_nodes = IntProperty(minval=2, default=2)

    step = NumberProperty(default=0.2, minval=0)
    neighbour_step = NumberProperty(default=0.05, minval=0)
    max_edge_age = IntProperty(default=100, minval=1)
    max_nodes = IntProperty(default=1000, minval=1)

    n_iter_before_neuron_added = IntProperty(default=1000, minval=1)
    after_split_error_decay_rate = ProperFractionProperty(default=0.5)
    error_decay_rate = ProperFractionProperty(default=0.995)
    min_distance_for_update = NumberProperty(default=0.0, minval=0)

    def __init__(self, *args, **kwargs):
        super(GrowingNeuralGas, self).__init__(*args, **kwargs)
        # Counts weight updates across all epochs. It decides when
        # the next neuron has to be added.
        self.n_updates = 0
        self.graph = NeuralGasGraph()

    def format_input_data(self, X):
        """
        Format input data and validate its shape.

        Raises
        ------
        ValueError
            When formatted data is not 2-dimensional or number of
            features differs from ``n_inputs``.
        """
        is_feature1d = self.n_inputs == 1
        X = format_data(X, is_feature1d)

        if X.ndim != 2:
            raise ValueError("Cannot make prediction, because input "
                             "data has more than 2 dimensions")

        n_samples, n_features = X.shape

        if n_features != self.n_inputs:
            raise ValueError("Input data expected to have {} features, "
                             "but got {}".format(self.n_inputs, n_features))

        return X

    def initialize_nodes(self, data):
        """
        Replace current graph with a new one that has
        ``n_start_nodes`` nodes sampled from the data.
        """
        self.graph = NeuralGasGraph()

        for sample in sample_data_point(data, n=self.n_start_nodes):
            self.graph.add_node(NeuronNode(sample.reshape(1, -1)))

    def train(self, X_train, epochs=100):
        X_train = self.format_input_data(X_train)

        # Existing graph is reused, which allows to continue training.
        if not self.graph.nodes:
            self.initialize_nodes(X_train)

        return super(GrowingNeuralGas, self).train(
            X_train=X_train, y_train=None,
            X_test=None, y_test=None,
            epochs=epochs)

    def one_training_update(self, X_train, y_train=None):
        """
        Run one pass over the data updating graph per each sample.

        Returns
        -------
        float
            Average distance between samples and neurons closest
            to them.

        Raises
        ------
        StopTraining
            When ``min_distance_for_update`` is enabled and none of
            the samples triggered an update.
        """
        graph = self.graph
        step = self.step
        neighbour_step = self.neighbour_step

        max_nodes = self.max_nodes
        max_edge_age = self.max_edge_age

        error_decay_rate = self.error_decay_rate
        after_split_error_decay_rate = self.after_split_error_decay_rate
        n_iter_before_neuron_added = self.n_iter_before_neuron_added

        # Distances below are euclidean norms (not squared), so the
        # threshold has to be used as is.
        min_distance_for_update = self.min_distance_for_update

        n_samples = len(X_train)
        total_error = 0
        did_update = False

        for sample in X_train:
            nodes = graph.nodes
            weights = np.concatenate([node.weight for node in nodes])

            distance = np.linalg.norm(weights - sample, axis=1)
            neuron_ids = np.argsort(distance)

            closest_neuron_id, second_closest_id = neuron_ids[:2]
            closest_neuron = nodes[closest_neuron_id]
            second_closest = nodes[second_closest_id]
            total_error += distance[closest_neuron_id]

            if distance[closest_neuron_id] < min_distance_for_update:
                continue

            self.n_updates += 1
            did_update = True

            closest_neuron.error += distance[closest_neuron_id]
            closest_neuron.weight += step * (sample - closest_neuron.weight)

            graph.add_edge(closest_neuron, second_closest)

            # Iterate over a copy, because edges can be removed
            # from the graph inside of the loop.
            for to_neuron in list(graph.edges_per_node[closest_neuron]):
                edge_id = graph.find_edge_id(to_neuron, closest_neuron)
                age = graph.edges[edge_id]

                if age >= max_edge_age:
                    graph.remove_edge(to_neuron, closest_neuron)

                    # Neighbour without any edges gets removed.
                    if not graph.edges_per_node[to_neuron]:
                        graph.remove_node(to_neuron)

                else:
                    graph.edges[edge_id] += 1
                    to_neuron.weight += neighbour_step * (
                        sample - to_neuron.weight)

            time_to_add_new_neuron = (
                self.n_updates % n_iter_before_neuron_added == 0 and
                graph.n_nodes < max_nodes)

            if time_to_add_new_neuron:
                nodes = graph.nodes
                largest_error_neuron = max(nodes, key=attrgetter('error'))
                neighbour_neuron = max(
                    graph.edges_per_node[largest_error_neuron],
                    key=attrgetter('error'))

                largest_error_neuron.error *= after_split_error_decay_rate
                neighbour_neuron.error *= after_split_error_decay_rate

                # New neuron is placed halfway between the two neurons
                # and replaces the edge that connected them.
                new_weight = 0.5 * (
                    largest_error_neuron.weight + neighbour_neuron.weight
                )
                new_neuron = NeuronNode(weight=new_weight.reshape(1, -1))

                graph.remove_edge(neighbour_neuron, largest_error_neuron)
                graph.add_node(new_neuron)
                graph.add_edge(largest_error_neuron, new_neuron)
                graph.add_edge(neighbour_neuron, new_neuron)

            for node in graph.nodes:
                node.error *= error_decay_rate

        if not did_update and min_distance_for_update != 0 and n_samples > 1:
            raise StopTraining(
                "Distance between every data sample and neurons, closest "
                "to them, is less then {}".format(min_distance_for_update))

        return total_error / n_samples

    def predict(self, *args, **kwargs):
        raise NotImplementedError(
            "Growing Neural Gas algorithm doesn't make prediction. "
            "It only learns graph structure from the data "
            "(class has `graph` attribute). ")
class Adamax(MinibatchGradientDescent):
    """
    AdaMax algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate for the first moment. Value need to be
        between ``0`` and ``1``. Defaults to ``0.9``.

    beta2 : float
        Decay rate for the weighted infinity norm. Value need to be
        between ``0`` and ``1``. Defaults to ``0.999``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-8``.

    step : float
        Learning rate, defaults to ``0.001``.

    {MinibatchGradientDescent.batch_size}

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-8, minval=0)

    def init_layers(self):
        super(Adamax, self).init_layers()

        for layer in self.layers:
            for layer_parameter in layer.parameters:
                # Both state variables start from zeros and have
                # the same shape as the parameter they belong to.
                shape = T.shape(layer_parameter).eval()

                first_moment_state = theano.shared(
                    name="prev_first_moment_" + layer_parameter.name,
                    value=asfloat(np.zeros(shape)),
                )
                layer_parameter.prev_first_moment = first_moment_state

                inf_norm_state = theano.shared(
                    name="prev_weighted_inf_norm_" + layer_parameter.name,
                    value=asfloat(np.zeros(shape)),
                )
                layer_parameter.prev_weighted_inf_norm = inf_norm_state

    def init_param_updates(self, layer, parameter):
        epoch = self.variables.epoch
        prev_first_moment = parameter.prev_first_moment
        prev_weighted_inf_norm = parameter.prev_weighted_inf_norm

        step = self.variables.step
        beta1, beta2 = self.beta1, self.beta2

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient
        decayed_inf_norm = beta2 * prev_weighted_inf_norm
        weighted_inf_norm = T.maximum(decayed_inf_norm, T.abs_(gradient))

        # Bias correction for the first moment.
        # NOTE(review): correction divides by zero for ``epoch == 0``,
        # presumably epoch counter starts from 1 - confirm.
        bias_correction = (1 / (1 - beta1**epoch))
        normalized_moment = (first_moment / (weighted_inf_norm + self.epsilon))
        parameter_delta = (bias_correction * normalized_moment)

        first_moment_update = (prev_first_moment, first_moment)
        inf_norm_update = (prev_weighted_inf_norm, weighted_inf_norm)
        parameter_update = (parameter, parameter - step * parameter_delta)
        return [first_moment_update, inf_norm_update, parameter_update]
class BatchNorm(BaseLayer):
    """
    Batch-normalization layer.

    Parameters
    ----------
    axes : int, tuple with int or None
        The axis or axes along which normalization is applied.
        ``None`` means that normalization will be applied over
        all axes except the second one (axis ``1``). In case of
        4D tensor it will be equal to ``(0, 2, 3)``.
        Defaults to ``None``.

    epsilon : float
        Epsilon is a positive constant that adds to the standard
        deviation to prevent the division by zero.
        Defaults to ``1e-5``.

    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it will depend on
        the last batches seen. Value needs to be between ``0``
        and ``1``. Defaults to ``0.1``.

    gamma : array-like, Theano variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    beta : array-like, Theano variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = AxesProperty(default=None)
    alpha = ProperFractionProperty(default=0.1)
    epsilon = NumberProperty(default=1e-5, minval=0)
    gamma = ParameterProperty(default=Constant(value=1))
    beta = ParameterProperty(default=Constant(value=0))

    def initialize(self):
        """
        Resolve normalization axes and create shared variables for
        the running statistics and for the trainable parameters.

        Raises
        ------
        ValueError
            When some of the axes don't exist in the input or when
            a dimension that defines parameter shape has unknown size.
        """
        super(BatchNorm, self).initialize()

        # First dimension is added for the batch, which has unknown size.
        input_shape = as_tuple(None, self.input_shape)
        ndim = len(input_shape)

        if self.axes is None:
            # If ndim == 4 then axes = (0, 2, 3)
            # If ndim == 2 then axes = (0,)
            self.axes = tuple(axis for axis in range(ndim) if axis != 1)

        if any(axis >= ndim for axis in self.axes):
            raise ValueError("Cannot apply batch normalization on the axis "
                             "that doesn't exist.")

        # Parameters are defined over the axes that are
        # not used for the normalization.
        parameter_shape = []
        for axis in find_opposite_axes(self.axes, ndim):
            axis_size = input_shape[axis]

            if axis_size is None:
                # Report dimension of the input, not the position
                # inside of the reduced parameter shape.
                raise ValueError(
                    "Cannot apply batch normalization on the axis "
                    "with unknown size over the dimension #{} "
                    "(0-based indeces).".format(axis))

            parameter_shape.append(axis_size)

        self.running_mean = theano.shared(
            name='running_mean_{}'.format(self.layer_id),
            value=asfloat(np.zeros(parameter_shape)))

        self.running_inv_std = theano.shared(
            name='running_inv_std_{}'.format(self.layer_id),
            value=asfloat(np.ones(parameter_shape)))

        if isinstance(self.gamma, Initializer):
            self.gamma = self.gamma.sample(parameter_shape)

        if isinstance(self.beta, Initializer):
            self.beta = self.beta.sample(parameter_shape)

        self.gamma = theano.shared(
            name='gamma_{}'.format(self.layer_id),
            value=asfloat(self.gamma),
        )
        self.beta = theano.shared(
            name='beta_{}'.format(self.layer_id),
            value=asfloat(self.beta),
        )
        self.parameters = [self.gamma, self.beta]

    def output(self, input_value):
        """
        Normalize input, scale it with ``gamma`` and shift with ``beta``.

        Batch statistics are used in the training state, running
        statistics otherwise. Updates for the running statistics are
        stored in the ``updates`` attribute on every call.
        """
        epsilon = asfloat(self.epsilon)
        alpha = asfloat(self.alpha)
        gamma, beta = self.gamma, self.beta

        ndim = input_value.ndim
        axes = self.axes

        running_mean = self.running_mean
        running_inv_std = self.running_inv_std

        input_mean = input_value.mean(axes)
        input_var = input_value.var(axes)
        input_inv_std = T.inv(T.sqrt(input_var + epsilon))

        # Exponential moving averages of the batch statistics.
        self.updates = [
            (
                running_inv_std,
                asfloat(1 - alpha) * running_inv_std + alpha * input_inv_std
            ),
            (
                running_mean,
                asfloat(1 - alpha) * running_mean + alpha * input_mean
            )
        ]

        if not self.training_state:
            mean = running_mean
            inv_std = running_inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

        # Statistics and parameters are defined only over the
        # non-normalized axes, so they get reshaped to match the input.
        opposite_axes = find_opposite_axes(axes, ndim)

        beta = dimshuffle(beta, ndim, opposite_axes)
        gamma = dimshuffle(gamma, ndim, opposite_axes)
        mean = dimshuffle(mean, ndim, opposite_axes)
        inv_std = dimshuffle(inv_std, ndim, opposite_axes)

        normalized_value = (input_value - mean) * inv_std
        return gamma * normalized_value + beta
class Adam(GradientDescent):
    """
    Adam algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate of the first moment estimate. Value need to be
        between ``0`` and ``1``. Defaults to ``0.9``.

    beta2 : float
        Decay rate of the second moment estimate. Value need to be
        between ``0`` and ``1``. Defaults to ``0.999``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``0.001``.

    {GradientDescent.batch_size}

    {BaseGradientDescent.addons}

    {ConstructibleNetwork.connection}

    {ConstructibleNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    References
    ----------
    [1] Diederik P. Kingma, Jimmy Lei Ba
        Adam: a Method for Stochastic Optimization.
        https://arxiv.org/pdf/1412.6980.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Adam((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_variables(self):
        """Create the parent's variables plus the iteration counter."""
        super(Adam, self).init_variables()

        # The counter starts at one and is used as the exponent in the
        # bias-correction term.
        self.variables.iteration = tf.Variable(
            asfloat(1),
            name='iteration',
            dtype=tf.float32,
        )

    def init_train_updates(self):
        """
        Build the list of ``(variable, new_value)`` update pairs for
        every parameter and for the iteration counter.
        """
        beta1, beta2 = self.beta1, self.beta2
        iteration = self.variables.iteration
        learning_rate = self.variables.step

        # Both moment estimates start at zero, so early estimates are
        # biased towards zero. The correction below rescales them; its
        # effect fades as ``iteration`` grows.
        #
        # The two separate corrections from the paper are folded into a
        # single factor (the more efficient form suggested in the
        # original paper).
        bias_correction = (
            tf.sqrt(1. - beta2 ** iteration) / (1. - beta1 ** iteration)
        )

        train_updates = []

        for _, parameter, gradient in self.iter_params_and_grads():
            variable_prefix = parameter.op.name

            prev_first_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-first-moment".format(variable_prefix),
                dtype=tf.float32,
            )
            prev_second_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-second-moment".format(variable_prefix),
                dtype=tf.float32,
            )

            first_moment = (
                beta1 * prev_first_moment + (1. - beta1) * gradient)
            second_moment = (
                beta2 * prev_second_moment + (1. - beta2) * gradient ** 2)

            denominator = tf.sqrt(second_moment) + self.epsilon
            parameter_delta = bias_correction * first_moment / denominator

            train_updates.append((prev_first_moment, first_moment))
            train_updates.append((prev_second_moment, second_moment))
            train_updates.append(
                (parameter, parameter - learning_rate * parameter_delta))

        train_updates.append((iteration, iteration + 1))
        return train_updates
# Minimal ``Configurable`` subclass that declares a single
# ``ProperFractionProperty`` option with no explicit arguments.
class A(Configurable):
    fraction = ProperFractionProperty()
class Adadelta(GradientDescent):
    """
    Adadelta algorithm.

    Parameters
    ----------
    rho : float
        Decay rate. Value need to be between ``0``
        and ``1``. Defaults to ``0.95``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``1.0``. The original paper does not
        specify a learning rate. A step equal to ``1.0`` achieves the
        same effect, since multiplication by one does not change
        the update.

    {GradientDescent.batch_size}

    {BaseOptimizer.regularizer}

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.Adadelta(network)
    >>> optimizer.train(x_train, y_train)

    References
    ----------
    [1] Matthew D. Zeiler,
        ADADELTA: An Adaptive Learning Rate Method
        https://arxiv.org/pdf/1212.5701.pdf
    """
    step = ScalarVariableProperty(default=1.0)
    rho = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_train_updates(self):
        """
        Delegate the update rule to ``tf.train.AdadeltaOptimizer`` and
        return its minimize operation wrapped in a list.
        """
        adadelta = tf.train.AdadeltaOptimizer(
            learning_rate=self.step,
            rho=self.rho,
            epsilon=self.epsilon,
        )
        # The optimizer object is stored on ``self.functions`` before the
        # minimize operation is built.
        self.functions.optimizer = adadelta

        minimize_loss = adadelta.minimize(self.variables.loss)
        return [minimize_loss]
class HessianDiagonal(BaseOptimizer):
    """
    Hessian diagonal is an approximation of the Hessian algorithm which
    requires only the diagonal elements of the Hessian matrix, which
    makes its inversion much easier and faster.

    Parameters
    ----------
    min_eigval : float
        Set up minimum eigenvalue for Hessian diagonal matrix. After a few
        iteration elements will be extremely small and matrix inverse
        produce huge number in hessian diagonal elements. This
        parameter control diagonal elements size. Defaults to ``1e-2``.

    {BaseOptimizer.Parameters}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.HessianDiagonal(network)
    >>> optimizer.train(x_train, y_train)

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    See Also
    --------
    :network:`BaseOptimizer` : BaseOptimizer algorithm.
    :network:`Hessian` : Newton's method.
    """
    min_eigval = ProperFractionProperty(default=1e-2)

    def init_train_updates(self):
        """
        Build parameter updates that scale the gradient elementwise by
        the clipped inverse of the Hessian diagonal.
        """
        learning_rate = self.step
        inverse_bound = 1 / self.min_eigval

        trainable = [
            variable
            for variable in self.network.variables.values()
            if variable.trainable
        ]
        param_vector = make_single_vector(trainable)

        gradients = tf.gradients(self.variables.loss, trainable)
        full_gradient = make_single_vector(gradients)

        flat_second_derivatives = []
        for parameter, gradient in zip(trainable, gradients):
            (second_derivative,) = tf.gradients(gradient, parameter)
            flat_second_derivatives.append(flatten(second_derivative))

        hessian_diag = tf.concat(flat_second_derivatives, axis=0)

        # A diagonal matrix is inverted with an elementwise reciprocal,
        # and clipping that inverse is easier than clipping the
        # Hessian itself.
        inv_hessian_diag = tf.clip_by_value(
            1 / hessian_diag,
            -inverse_bound,
            inverse_bound,
        )

        scaled_gradient = learning_rate * full_gradient
        new_param_vector = param_vector - scaled_gradient * inv_hessian_diag
        return setup_parameter_updates(trainable, new_param_vector)
class Adam(MinibatchGradientDescent):
    """
    Adam algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate of the first moment estimate. Value need to be
        between ``0`` and ``1``. Defaults to ``0.9``.

    beta2 : float
        Decay rate of the second moment estimate. Value need to be
        between ``0`` and ``1``. Defaults to ``0.999``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``0.001``.

    {MinibatchGradientDescent.batch_size}

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_layers(self):
        """
        Initialize layers and attach zero-filled first and second moment
        accumulators to every layer parameter.
        """
        super(Adam, self).init_layers()

        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_first_moment = theano.shared(
                    name="prev_first_moment_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )
                parameter.prev_second_moment = theano.shared(
                    name="prev_second_moment_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        """
        Build the Adam updates for a single parameter.

        Returns a list of ``(shared_variable, new_value)`` pairs for the
        two moment accumulators and for the parameter itself.
        """
        epoch = self.variables.epoch
        prev_first_moment = parameter.prev_first_moment
        prev_second_moment = parameter.prev_second_moment

        step = asfloat(self.variables.step)
        beta1 = asfloat(self.beta1)
        beta2 = asfloat(self.beta2)
        epsilon = asfloat(self.epsilon)

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        first_moment = (
            beta1 * prev_first_moment +
            asfloat(1. - beta1) * gradient
        )
        second_moment = (
            beta2 * prev_second_moment +
            asfloat(1. - beta2) * gradient ** 2
        )

        # The epoch counter is used as the exponent of the bias
        # correction terms.
        first_moment_bias_corrected = first_moment / (1. - beta1 ** epoch)
        second_moment_bias_corrected = second_moment / (1. - beta2 ** epoch)

        # Adam divides the corrected first moment by the square root of
        # the corrected second moment. The previous code multiplied by
        # this term, so the step grew with the gradient magnitude instead
        # of being normalized by it.
        parameter_delta = first_moment_bias_corrected / (
            T.sqrt(second_moment_bias_corrected) + epsilon
        )

        return [
            (prev_first_moment, first_moment),
            (prev_second_moment, second_moment),
            (parameter, parameter - step * parameter_delta),
        ]
class Adamax(GradientDescent):
    """
    AdaMax algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.9``.

    beta2 : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.999``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``0.002``.

    {GradientDescent.batch_size}

    {BaseOptimizer.regularizer}

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    {Verbose.verbose}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> mnet = algorithms.Adamax(network)
    >>> mnet.train(x_train, y_train)

    References
    ----------
    [1] Diederik P. Kingma, Jimmy Lei Ba
        Adam: a Method for Stochastic Optimization.
        https://arxiv.org/pdf/1412.6980.pdf
    """
    step = ScalarVariableProperty(default=0.002)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_functions(self):
        """Create the iteration counter, then run the parent's setup."""
        # The counter is created before the parent call; presumably the
        # parent builds the train updates that read it (TODO confirm).
        self.variables.iteration = tf.Variable(
            asfloat(1),
            name='iteration',
            dtype=tf.float32,
        )
        super(Adamax, self).init_functions()

    def init_train_updates(self):
        """
        Build the list of ``(variable, new_value)`` update pairs for
        every trainable variable and for the iteration counter.
        """
        beta1, beta2 = self.beta1, self.beta2
        iteration = self.variables.iteration

        trainable_variables = [
            variable
            for (_, _), variable in self.network.variables.items()
            if variable.trainable
        ]
        gradients = tf.gradients(self.variables.loss, trainable_variables)

        # Step size with the first-moment bias correction folded in.
        scale = self.step / (1. - beta1 ** iteration)

        train_updates = []

        for parameter, gradient in zip(trainable_variables, gradients):
            variable_prefix = parameter.op.name

            prev_first_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-first-moment".format(variable_prefix),
                dtype=tf.float32,
            )
            prev_weighted_inf_norm = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-weighted-inf-norm".format(variable_prefix),
                dtype=tf.float32,
            )

            first_moment = beta1 * prev_first_moment + (1. - beta1) * gradient
            weighted_inf_norm = tf.maximum(
                beta2 * prev_weighted_inf_norm,
                tf.abs(gradient),
            )

            normalized_moment = (
                first_moment / (weighted_inf_norm + self.epsilon))
            parameter_delta = scale * normalized_moment

            train_updates.append((prev_first_moment, first_moment))
            train_updates.append((prev_weighted_inf_norm, weighted_inf_norm))
            train_updates.append((parameter, parameter - parameter_delta))

        train_updates.append((iteration, iteration + 1))
        return train_updates
class QuasiNewton(NoStepSelection, GradientDescent):
    """
    Quasi-Newton algorithm optimization.

    Parameters
    ----------
    {GradientDescent.Parameters}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs',
    ...     verbose=False
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    # Rule used to update the inverse Hessian approximation. One of
    # 'bfgs', 'dfp', 'psb' or 'sr1'; defaults to 'bfgs'.
    update_function = ChoiceProperty(default='bfgs', choices={
        'bfgs': bfgs,
        'dfp': dfp,
        'psb': psb,
        'sr1': sr1,
    })
    # Factor applied to the identity matrix that seeds the inverse
    # Hessian approximation (see ``init_variables``).
    h0_scale = NumberProperty(default=1, minval=0)
    # NOTE(review): not read anywhere in this class; presumably consumed
    # elsewhere -- confirm before documenting its meaning.
    gradient_tol = ProperFractionProperty(default=1e-5)

    def init_variables(self):
        """
        Create the shared variables that persist between updates: the
        inverse Hessian approximation, the previous parameter vector and
        the previous full gradient.
        """
        super(QuasiNewton, self).init_variables()
        # Total number of scalar values across all parameters.
        n_params = sum(p.get_value().size for p in iter_parameters(self))
        self.variables.update(
            inv_hessian=theano.shared(
                name='inv_hessian',
                value=asfloat(self.h0_scale * np.eye(int(n_params))),
            ),
            prev_params=theano.shared(
                name='prev_params',
                value=asfloat(np.zeros(n_params)),
            ),
            prev_full_gradient=theano.shared(
                name='prev_full_gradient',
                value=asfloat(np.zeros(n_params)),
            ),
        )

    def init_train_updates(self):
        """
        Build the updates for one quasi-Newton step: refresh the inverse
        Hessian approximation, pick a step along the resulting direction
        with a line search, and store the state for the next update.
        """
        network_input = self.variables.network_input
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        # On the first epoch the stored inverse Hessian is used as is,
        # afterwards it is refreshed from the parameter and gradient
        # differences.
        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1),
            inv_hessian,
            self.update_function(inv_hessian,
                                 param_vector - prev_params,
                                 full_gradient - prev_full_gradient))
        param_delta = -new_inv_hessian.dot(full_gradient)

        def prediction(step):
            # TODO: I need to update this ugly solution later
            # Rebuilds the network output for parameters moved by
            # ``step`` along ``param_delta``. Each layer attribute is
            # rebound (via ``setattr``) to a reshaped slice of the
            # updated parameter vector.
            updated_params = param_vector + step * param_delta

            layer_input = network_input
            start_pos = 0
            for layer in self.layers:
                for param in layer.parameters:
                    end_pos = start_pos + param.size
                    # Assumes names have the form "<name>_<id>" with
                    # exactly one underscore; anything else fails to
                    # unpack.
                    parameter_name, parameter_id = param.name.split('_')
                    setattr(
                        layer, parameter_name,
                        T.reshape(updated_params[start_pos:end_pos],
                                  param.shape))
                    start_pos = end_pos
                layer_input = layer.output(layer_input)
            return layer_input

        def phi(step):
            # Error as a function of the step length.
            return self.error(network_output, prediction(step))

        def derphi(step):
            # Derivative of the error with respect to the step length.
            error_func = self.error(network_output, prediction(step))
            return T.grad(error_func, wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
class ART1(BaseNetwork):
    """
    Adaptive Resonance Theory (ART1) Network for binary
    data clustering.

    Notes
    -----
    * Weights are not random, so the result will be
      always reproducible.

    Parameters
    ----------
    rho : float
        Control reset action in training process. Value must be
        between ``0`` and ``1``, defaults to ``0.5``.

    n_clusters : int
        Number of clusters, defaults to ``2``. Min value is also ``2``.

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    Methods
    -------
    train(input_data):
        Network will train until it clusters all samples.

    {BaseSkeleton.predict}

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([
    ...     [0, 1, 0],
    ...     [1, 0, 0],
    ...     [1, 1, 0],
    ... ])
    >>>
    >>> artnet = algorithms.ART1(
    ...     step=2,
    ...     rho=0.7,
    ...     n_clusters=2,
    ...     verbose=False
    ... )
    >>> artnet.predict(data)
    array([ 0.,  1.,  1.])
    """
    rho = ProperFractionProperty(default=0.5)
    n_clusters = IntProperty(default=2, minval=2)

    def train(self, input_data):
        """
        Cluster every row of ``input_data`` and update the weights.

        Returns an array with one cluster index per input row. Raises
        ``ValueError`` when the input is not 2 dimensional, is not
        binary, or has a different number of columns than the
        existing weights.
        """
        input_data = format_data(input_data)

        if input_data.ndim != 2:
            raise ValueError("Input value must be 2 dimensional, got "
                             "{0}".format(input_data.ndim))

        data_size = input_data.shape[1]
        n_clusters = self.n_clusters
        step = self.step
        rho = self.rho

        # NOTE(review): assuming numpy semantics for ``unique``/``sort``,
        # this requires both 0 and 1 to be present, so an all-zero or
        # all-one matrix is rejected as well -- confirm that is intended.
        if list(sort(unique(input_data))) != [0, 1]:
            raise ValueError("ART1 Network works only with binary matrix, "
                             "all matix must contains only 0 and 1")

        # Weights are created on the first call only and reused (and
        # modified in place) on later calls.
        if not hasattr(self, 'weight_21'):
            self.weight_21 = ones((data_size, n_clusters))

        if not hasattr(self, 'weight_12'):
            self.weight_12 = step / (step + n_clusters - 1) * self.weight_21.T

        weight_21 = self.weight_21
        weight_12 = self.weight_12

        if data_size != weight_21.shape[0]:
            raise ValueError(
                "Data dimension is invalid. Get {} columns data set. "
                "Must be - {} columns".format(data_size, weight_21.shape[0]))

        classes = zeros(input_data.shape[0])

        # Train network
        for i, p in enumerate(input_data):
            disabled_neurons = []
            reseted_values = []
            reset = True

            while reset:
                output1 = p
                input2 = dot(weight_12, output1.T)

                output2 = zeros(input2.size)
                # Clusters that already failed the check for this sample
                # can no longer win.
                input2[disabled_neurons] = -inf
                winner_index = input2.argmax()
                output2[winner_index] = 1

                expectation = dot(weight_21, output2)
                output1 = logical_and(p, expectation).astype(int)

                # Share of active input bits that the winner's
                # expectation also has set.
                # NOTE(review): ``dot(p.T, p)`` is zero for an all-zero
                # row, which makes this a division by zero.
                reset_value = dot(output1.T, output1) / dot(p.T, p)
                reset = reset_value < rho

                if reset:
                    disabled_neurons.append(winner_index)
                    reseted_values.append((reset_value, winner_index))

                if len(disabled_neurons) >= n_clusters:
                    # Got this case only if we test all possible clusters
                    reset = False
                    winner_index = None

                if not reset:
                    if winner_index is not None:
                        # The winner passed the check, so its weights
                        # are updated from the matched pattern.
                        weight_12[winner_index, :] = (step * output1) / (
                            step + dot(output1.T, output1) - 1)
                        weight_21[:, winner_index] = output1
                    else:
                        # Get result with the best `rho`
                        winner_index = max(reseted_values)[1]

                    classes[i] = winner_index

        return classes

    def predict(self, input_data):
        """
        Return cluster indices for ``input_data``.

        This delegates to ``train``, so the weights may be updated as a
        side effect.
        """
        return self.train(input_data)
class BatchNorm(BaseLayer):
    """
    Batch-normalization layer.

    Parameters
    ----------
    axes : int, tuple with int or None
        The axis or axes along which normalization is applied.
        ``None`` means that normalization will be applied over
        all axes except the last one. In case of 4D tensor it will
        be equal to ``(0, 1, 2)``. Defaults to ``None``.

    epsilon : float
        Epsilon is a positive constant that adds to the standard
        deviation to prevent the division by zero.
        Defaults to ``1e-5``.

    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it will depend on
        the last batches seen. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.1``.

    gamma : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    beta : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_mean : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_inv_std : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = AxesProperty(default=None)
    epsilon = NumberProperty(default=1e-5, minval=0)
    alpha = ProperFractionProperty(default=0.1)
    beta = ParameterProperty(default=init.Constant(value=0))
    gamma = ParameterProperty(default=init.Constant(value=1))

    running_mean = ParameterProperty(default=init.Constant(value=0))
    running_inv_std = ParameterProperty(default=init.Constant(value=1))

    def initialize(self):
        """
        Resolve the normalization axes and register the layer parameters.

        Raises ``ValueError`` when an axis is outside of the input
        dimensions or when a parameter dimension has unknown size.
        """
        super(BatchNorm, self).initialize()

        # ``None`` is prepended to the input shape, so index 0 has
        # unknown size.
        input_shape = as_tuple(None, self.input_shape)
        ndim = len(input_shape)

        if self.axes is None:
            # If ndim == 4 then axes = (0, 1, 2)
            # If ndim == 2 then axes = (0,)
            self.axes = tuple(range(ndim - 1))

        if any(axis >= ndim for axis in self.axes):
            raise ValueError("Cannot apply batch normalization on the axis "
                             "that doesn't exist.")

        opposite_axes = find_opposite_axes(self.axes, ndim)

        # Parameters keep the input size along ``opposite_axes`` and
        # have size one along every other axis.
        parameter_shape = [
            input_shape[axis] if axis in opposite_axes else 1
            for axis in range(ndim)
        ]

        if any(parameter is None for parameter in parameter_shape):
            unknown_dim_index = parameter_shape.index(None)
            raise ValueError("Cannot apply batch normalization on the axis "
                             "with unknown size over the dimension #{} "
                             "(0-based indeces).".format(unknown_dim_index))

        # Running statistics are registered as non-trainable.
        self.add_parameter(value=self.running_mean, shape=parameter_shape,
                           name='running_mean', trainable=False)
        self.add_parameter(value=self.running_inv_std, shape=parameter_shape,
                           name='running_inv_std', trainable=False)

        self.add_parameter(value=self.gamma, name='gamma',
                           shape=parameter_shape, trainable=True)
        self.add_parameter(value=self.beta, name='beta',
                           shape=parameter_shape, trainable=True)

    def output(self, input_value):
        """
        Normalize ``input_value``, then scale by ``gamma`` and shift
        by ``beta``.

        Outside of the training state the stored running statistics are
        used; otherwise the statistics are computed from the input.
        """
        alpha = asfloat(self.alpha)
        running_mean = self.running_mean
        running_inv_std = self.running_inv_std

        if not self.training_state:
            mean, inv_std = running_mean, running_inv_std
        else:
            mean = tf.reduce_mean(
                input_value, self.axes,
                keepdims=True, name="mean",
            )
            # The mean is wrapped in ``stop_gradient`` inside the
            # variance computation.
            variance = tf.reduce_mean(
                tf.squared_difference(input_value, tf.stop_gradient(mean)),
                self.axes,
                keepdims=True,
                name="variance",
            )
            inv_std = tf.rsqrt(variance + asfloat(self.epsilon))

            # Exponential moving averages of the batch statistics.
            self.updates = [
                (running_inv_std,
                 asfloat(1 - alpha) * running_inv_std + alpha * inv_std),
                (running_mean,
                 asfloat(1 - alpha) * running_mean + alpha * mean)
            ]

        normalized_value = (input_value - mean) * inv_std
        return self.gamma * normalized_value + self.beta
class RPROP(StepSelectionBuiltIn, BaseGradientDescent):
    """
    Resilient backpropagation (RPROP) is an optimization
    algorithm for supervised learning.

    RPROP algorithm takes into account only direction of the gradient
    and completely ignores its magnitude. Every weight value has a unique
    step size associated with it (by default all of them are equal
    to ``step``).

    The rule is following, when gradient direction changes (sign of the
    gradient) we decrease step size for specific weight multiplying it
    by ``decrease_factor`` and if sign stays the same then we increase
    step size for this specific weight multiplying it by
    ``increase_factor``.

    The step size is always bounded by ``minstep`` and ``maxstep``.

    Notes
    -----
    Algorithm doesn't work with mini-batches.

    Parameters
    ----------
    minstep : float
        Minimum possible value for step. Defaults to ``0.001``.

    maxstep : float
        Maximum possible value for step. Defaults to ``10``.

    increase_factor : float
        Increase factor for step in case when gradient doesn't change
        sign compare to previous epoch. Defaults to ``1.2``.

    decrease_factor : float
        Decrease factor for step in case when gradient changes sign
        compare to previous epoch. Defaults to ``0.5``.

    {BaseGradientDescent.Parameters}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> rpropnet = algorithms.RPROP((2, 3, 1))
    >>> rpropnet.train(x_train, y_train)

    See Also
    --------
    :network:`IRPROPPlus` : iRPROP+ algorithm.
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    # Lower and upper bounds for the per-weight step sizes.
    minstep = BoundedProperty(default=0.001, minval=0)
    maxstep = BoundedProperty(default=10, minval=0)

    # Multipliers applied to a step size when the gradient sign stays
    # the same / flips.
    increase_factor = BoundedProperty(minval=1, default=1.2)
    decrease_factor = ProperFractionProperty(default=0.5)

    def update_prev_delta(self, prev_delta):
        """Return the previous delta unchanged."""
        return prev_delta

    def init_train_updates(self):
        """
        Build the list of ``(variable, new_value)`` update pairs for the
        parameters, the per-weight steps, the stored gradient signs and
        the stored deltas.
        """
        train_updates = []

        for _, parameter, gradient in self.iter_params_and_grads():
            with tf.variable_scope(parameter.op.name):
                # Every weight starts with the same step size. The stored
                # gradient signs start at zero, so on the first iteration
                # the sign product is zero and the steps stay unchanged.
                steps = tf.Variable(
                    tf.ones_like(parameter) * self.step,
                    name="steps",
                    dtype=tf.float32,
                )
                prev_delta = tf.Variable(
                    tf.zeros(parameter.shape),
                    name="prev-delta",
                    dtype=tf.float32,
                )
                # Only signs are stored, which keeps the product
                # numerically stable when gradients are small.
                prev_gradient_sign = tf.Variable(
                    tf.zeros(parameter.shape),
                    name="prev-grad-sign",
                    dtype=tf.float32,
                )

            updated_prev_delta = self.update_prev_delta(prev_delta)
            gradient_sign = tf.sign(gradient)

            sign_product = gradient_sign * prev_gradient_sign
            sign_flipped = tf.equal(sign_product, -1)
            sign_kept = tf.equal(sign_product, 1)

            decreased_or_same = tf.where(
                sign_flipped, steps * self.decrease_factor, steps)
            unbounded_steps = tf.where(
                sign_kept, steps * self.increase_factor, decreased_or_same)
            updated_steps = tf.clip_by_value(
                unbounded_steps, self.minstep, self.maxstep)

            # Where the sign flipped the delta is the negated previous
            # delta, so subtracting it from the parameter reverts the
            # previous weight update.
            parameter_delta = tf.where(
                sign_flipped,
                -updated_prev_delta,
                updated_steps * gradient_sign,
            )

            # Store zero for flipped signs, so that the next sign product
            # cannot be negative and the same roll back is not repeated.
            stored_gradient_sign = tf.where(
                sign_flipped,
                tf.zeros_like(gradient_sign),
                gradient_sign,
            )

            train_updates.append((parameter, parameter - parameter_delta))
            train_updates.append((steps, updated_steps))
            train_updates.append((prev_gradient_sign, stored_gradient_sign))
            train_updates.append((prev_delta, parameter_delta))

        return train_updates
class LeakStepAdaptation(SingleStepConfigurable):
    """
    Leak Learning Rate Adaptation algorithm is a step adaptation
    procedure in backpropagation algorithm.

    Parameters
    ----------
    leak_size : float
        Defaults to ``0.01``. This variable identified proportion,
        so it's always between 0 and 1. Typically this value is small.

    alpha : float
        The ``alpha`` is control total step update ratio.
        Defaults to ``0.001``. Typically this value is small.

    beta : float
        This similar to ``alpha``, but it control ratio only for
        update matrix norms. Defaults to ``20``.
        Typically this value is bigger than ``1``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     addons=[algorithms.LeakStepAdaptation]
    ... )

    References
    ----------
    [1] Noboru M. "Adaptive on-line learning in changing
        environments", 1997
    [2] LeCun, "Efficient BackProp", 1998
    """
    leak_size = ProperFractionProperty(default=0.01)
    alpha = BoundedProperty(default=0.001, minval=0)
    beta = BoundedProperty(default=20, minval=0)

    def init_variables(self):
        """Add a zero-filled leaky gradient average to the variables."""
        super(LeakStepAdaptation, self).init_variables()

        # One entry per parameter value counted over the connection.
        total_parameters = count_parameters(self.connection)
        self.variables.leak_average = tf.Variable(
            tf.zeros(total_parameters),
            name="leak-step-adapt/leak-average",
            dtype=tf.float32,
        )

    def init_train_updates(self):
        """
        Extend the parent's updates with the new leaky gradient average
        and the adapted step.
        """
        train_updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha = asfloat(self.alpha)
        beta = asfloat(self.beta)
        leak_size = asfloat(self.leak_size)

        step = self.variables.step
        leak_average = self.variables.leak_average

        parameters = parameter_values(self.connection)
        gradients = tf.gradients(self.variables.error_func, parameters)
        flat_gradients = [flatten(grad) for grad in gradients]
        full_gradient = tf.concat(flat_gradients, axis=0)

        # Blend the stored average with the current gradient.
        new_leak_average = (
            (1 - leak_size) * leak_average + leak_size * full_gradient)

        # The step changes in proportion to the gap between the scaled
        # norm of the average and the current step.
        averaged_norm = tf.norm(new_leak_average)
        new_step = step + alpha * step * (beta * averaged_norm - step)

        train_updates.extend([
            (leak_average, new_leak_average),
            (step, new_step),
        ])
        return train_updates
class DropBlock(Identity):
    """
    DropBlock, a form of structured dropout, where units in a contiguous
    region of a feature map are dropped together.

    Parameters
    ----------
    keep_proba : float
        Fraction of the input units to keep. Value needs to be
        between ``0`` and ``1``.

    block_size : int or tuple
        Size of the block to be dropped. Blocks that have squared shape can
        be specified with a single integer value. For example, `block_size=5`
        the same as `block_size=(5, 5)`.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    See Also
    --------
    :layer:`Dropout` : Dropout layer.

    References
    ----------
    [1] Golnaz Ghiasi, Tsung-Yi Lin, Quoc V. Le. DropBlock: A regularization
        method for convolutional networks, 2018.

    Examples
    --------
    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((28, 28, 1)),
    ...
    ...     Convolution((3, 3, 16)) >> Relu(),
    ...     DropBlock(keep_proba=0.1, block_size=5),
    ...
    ...     Convolution((3, 3, 32)) >> Relu(),
    ...     DropBlock(keep_proba=0.1, block_size=5),
    ... )
    """
    keep_proba = ProperFractionProperty()
    block_size = TypedListProperty(n_elements=2)

    def __init__(self, keep_proba, block_size, name=None):
        super(DropBlock, self).__init__(name=name)

        # A single integer stands for a square block.
        is_square = isinstance(block_size, int)
        block_pair = (block_size, block_size) if is_square else block_size

        self.keep_proba = keep_proba
        self.block_size = block_pair

    def get_output_shape(self, input_shape):
        """
        Return the input shape unchanged, raising
        ``LayerConnectionError`` when a known shape is not
        4 dimensional.
        """
        input_shape = tf.TensorShape(input_shape)
        has_wrong_rank = input_shape and input_shape.ndims != 4

        if has_wrong_rank:
            raise LayerConnectionError(
                "DropBlock layer expects input with 4 dimensions, got {} "
                "with shape {}".format(len(input_shape), input_shape))

        return input_shape

    def output(self, input, training=False):
        """
        Zero out random blocks of ``input`` and rescale the rest.

        The input is returned untouched when ``training`` is false.
        """
        if not training:
            return input

        input = tf.convert_to_tensor(input, tf.float32)
        input_shape = tf.shape(input)

        block_height, block_width = self.block_size
        height, width = input_shape[1], input_shape[2]

        # Number of positions where a whole block fits.
        valid_height = height - block_height + 1
        valid_width = width - block_width + 1

        input_area = asfloat(width * height)
        block_area = asfloat(block_width * block_height)
        area = asfloat(valid_width * valid_height)

        drop_proba = 1. - self.keep_proba
        mask = bernoulli_sample(
            mean=drop_proba * input_area / (block_area * area),
            shape=[input_shape[0], valid_height, valid_width, input_shape[3]],
        )

        # Pad the sampled mask back to the input's height and width.
        br_height = (block_height - 1) // 2
        tl_height = (block_height - 1) - br_height

        br_width = (block_width - 1) // 2
        tl_width = (block_width - 1) - br_width

        paddings = [
            [0, 0],
            [tl_height, br_height],
            [tl_width, br_width],
            [0, 0],
        ]
        mask = tf.pad(mask, paddings)

        # Max pooling with the block-sized window spreads every sampled
        # point over a whole block.
        mask = tf.nn.max_pool(
            mask,
            [1, block_height, block_width, 1],
            strides=[1, 1, 1, 1],
            padding='SAME',
        )
        # Invert the mask: one marks a kept unit, zero a dropped one.
        mask = tf.cast(1 - mask, tf.float32)

        # Rescale by (mask size) / (number of kept units).
        feature_normalizer = asfloat(tf.size(mask)) / tf.reduce_sum(mask)
        return tf.multiply(input, mask) * feature_normalizer
class LeakStepAdaptation(SingleStepConfigurable):
    """
    Leak Learning Rate Adaptation algorithm for step adaptation procedure
    in backpropagation algorithm. The shared ``step`` value is adjusted
    after every update using a leaky average of the full gradient.

    Parameters
    ----------
    leak_size : float
        Defaults to ``0.01``. This variable identified proportion,
        so it's always between 0 and 1. Usually this value is small.

    alpha : float
        The ``alpha`` is control total step update ratio (It's similar to
        step role in weight update procedure). Defaults to ``0.001``.
        Typically this value is small.

    beta : float
        This similar to ``alpha``, but it control ratio only for update
        matrix norms. Defaults to ``20``.
        Typically this value is > 1.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     addons=[algorithms.LeakStepAdaptation]
    ... )

    References
    ----------
    .. [1] Noboru M. "Adaptive on-line learning in changing
        environments", 1997
    .. [2] LeCun, "Efficient BackProp", 1998
    """
    leak_size = ProperFractionProperty(default=0.01)
    alpha = BoundedProperty(default=0.001, minval=0)
    beta = BoundedProperty(default=20, minval=0)

    def init_variables(self):
        """Add a zero-filled leaky gradient average to the variables."""
        super(LeakStepAdaptation, self).init_variables()

        total_parameters = count_parameters(self)
        initial_average = asfloat(np.zeros(total_parameters))
        self.variables.leak_average = theano.shared(
            value=initial_average,
            name='leak_average',
        )

    def init_train_updates(self):
        """
        Extend the parent's updates with the new leaky gradient average
        and the adapted step.
        """
        train_updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha = self.alpha
        beta = self.beta
        leak_size = self.leak_size

        step = self.variables.step
        leak_average = self.variables.leak_average

        parameters = list(iter_parameters(self))
        gradients = T.grad(self.variables.error_func, wrt=parameters)
        flat_gradients = [grad.flatten() for grad in gradients]
        full_gradient = T.concatenate(flat_gradients)

        # Blend the stored average with the current gradient.
        new_leak_average = (
            (1 - leak_size) * leak_average + leak_size * full_gradient)

        # The step changes in proportion to the gap between the scaled
        # L2 norm of the average and the current step.
        averaged_norm = new_leak_average.norm(L=2)
        new_step = step + alpha * step * (beta * averaged_norm - step)

        train_updates.extend([
            (leak_average, new_leak_average),
            (step, new_step),
        ])
        return train_updates
class HessianDiagonal(NoMultipleStepSelection, GradientDescent):
    """
    Hessian diagonal is an approximation of the Hessian algorithm
    which requires computation of only the diagonal elements of the
    Hessian matrix, which makes its inversion much easier and faster.

    Parameters
    ----------
    min_eigval : float
        Minimum absolute value allowed for the elements of the
        Hessian diagonal. After a few iterations the elements can
        become extremely small and their reciprocals would produce
        huge numbers. Every element whose absolute value is smaller
        than this limit is replaced with the limit itself (negative
        elements get the negated limit). Defaults to ``1e-2``.
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> hdnet = algorithms.HessianDiagonal(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> hdnet.train(x_train, y_train)

    Diabetes dataset example

    >>> import numpy as np
    >>> from sklearn.cross_validation import train_test_split
    >>> from sklearn import datasets, preprocessing
    >>> from neupy import algorithms, layers, environment
    >>> from neupy.estimators import rmsle
    >>>
    >>> environment.reproducible()
    >>>
    >>> dataset = datasets.load_diabetes()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> input_scaler = preprocessing.StandardScaler()
    >>> target_scaler = preprocessing.StandardScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     input_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target),
    ...     train_size=0.8
    ... )
    >>>
    >>> nw = algorithms.HessianDiagonal(
    ...     connection=[
    ...         layers.Sigmoid(10),
    ...         layers.Sigmoid(20),
    ...         layers.Output(1)
    ...     ],
    ...     step=1.5,
    ...     shuffle_data=False,
    ...     verbose=False,
    ...     min_eigval=1e-10
    ... )
    >>> nw.train(x_train, y_train, epochs=10)
    >>> y_predict = nw.predict(x_test)
    >>>
    >>> error = rmsle(target_scaler.inverse_transform(y_test),
    ...               target_scaler.inverse_transform(y_predict).round())
    >>> error
    0.50315919814691346

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    :network:`Hessian` : Newton's method.
    """
    min_eigval = ProperFractionProperty(default=1e-2)

    def init_train_updates(self):
        """
        Build the parameter updates: every parameter value moves
        against its gradient scaled by ``step`` and divided by the
        matching (clipped) Hessian diagonal element.
        """
        variables = self.variables
        step = variables.step
        smallest_allowed = self.min_eigval

        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(variables.error_func, wrt=parameters)
        full_gradient = T.concatenate(
            [gradient.flatten() for gradient in gradients]
        )

        # NOTE(review): differentiating ``gradient.sum()`` gives, for
        # each element, the sum of the corresponding row of that
        # parameter's Hessian block. It equals the exact diagonal only
        # when off-diagonal terms vanish - confirm that this
        # approximation is intended.
        hessian_diag = T.concatenate([
            T.grad(gradient.sum(), wrt=parameter).flatten()
            for parameter, gradient in zip(parameters, gradients)
        ])

        # Elements that are too close to zero are pushed away from it
        # (keeping their sign) so the division below stays bounded.
        signed_floor = T.switch(
            hessian_diag < 0,
            -smallest_allowed,
            smallest_allowed,
        )
        hessian_diag = T.switch(
            T.abs_(hessian_diag) < smallest_allowed,
            signed_floor,
            hessian_diag,
        )

        # Elementwise division by the diagonal is equivalent to
        # multiplying the gradient by the inverse of the diagonal
        # matrix (the reciprocal of every diagonal element). It is
        # less explicit, but it works faster.
        updated_parameters = (
            param_vector - step * full_gradient / hessian_diag
        )
        return setup_parameter_updates(parameters, updated_parameters)