Example #1
class Dropout(BaseLayer):
    """ Dropout layer

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between 0 and 1.
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        options['proba'] = proba
        super(Dropout, self).__init__(**options)

    @property
    def size(self):
        return self.relate_to_layer.size

    def output(self, input_value):
        # Use NumPy seed to make Theano code easily reproducible
        max_possible_seed = 4e9
        seed = np.random.randint(max_possible_seed)
        theano_random = T.shared_randomstreams.RandomStreams(seed)

        proba = (1.0 - self.proba)
        mask = theano_random.binomial(n=1,
                                      p=proba,
                                      size=input_value.shape,
                                      dtype=input_value.dtype)
        return (mask * input_value) / proba

    def __repr__(self):
        return "{name}(proba={proba})".format(name=self.__class__.__name__,
                                              proba=self.proba)
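
The ``output`` method above implements what is usually called inverted dropout: the surviving activations are divided by the keep probability so that the expected value of the output matches the input. A minimal NumPy sketch of the same idea (the helper name is illustrative, not part of neupy):

import numpy as np

def inverted_dropout(x, drop_proba, rng):
    """Zero out a fraction of the values and rescale the survivors."""
    keep_proba = 1.0 - drop_proba
    mask = rng.binomial(n=1, p=keep_proba, size=x.shape)
    return mask * x / keep_proba

rng = np.random.default_rng(0)
x = np.ones((1000, 10))
out = inverted_dropout(x, drop_proba=0.3, rng=rng)
print(out.mean())  # close to 1.0, the expected value is preserved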
Example #2
class Dropout(BaseLayer):
    """
    Dropout layer

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between ``0`` and ``1``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        super(Dropout, self).__init__(proba=proba, **options)

    def output(self, input_value):
        if not self.training_state:
            return input_value
        return tf.nn.dropout(input_value, keep_prob=(1.0 - self.proba))

    def __repr__(self):
        classname = self.__class__.__name__
        return "{}(proba={})".format(classname, self.proba)
Example #3
class Momentum(MinibatchGradientDescent):
    """
    Momentum algorithm.

    Parameters
    ----------
    momentum : float
        Controls the previous gradient ratio. Defaults to ``0.9``.

    nesterov : bool
        Instead of classic momentum computes Nesterov momentum.
        Defaults to ``False``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Momentum((2, 3, 1))
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    momentum = ProperFractionProperty(default=0.9)
    nesterov = Property(default=False, expected_type=bool)

    def init_param_updates(self, layer, parameter):
        step = self.variables.step

        parameter_shape = parameter.get_value().shape
        previous_velocity = theano.shared(
            name="{}/previous-velocity".format(parameter.name),
            value=asfloat(np.zeros(parameter_shape)),
        )

        gradient = T.grad(self.variables.error_func, wrt=parameter)
        velocity = self.momentum * previous_velocity - step * gradient

        if self.nesterov:
            velocity = self.momentum * velocity - step * gradient

        return [
            (parameter, parameter + velocity),
            (previous_velocity, velocity),
        ]
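
The ``init_param_updates`` method above encodes the velocity update ``velocity = momentum * previous_velocity - step * gradient``, and the ``nesterov`` flag applies the same correction once more. A small NumPy sketch of that rule (illustrative helper, not neupy code):

import numpy as np

def momentum_step(param, velocity, gradient, step=0.1, momentum=0.9,
                  nesterov=False):
    """One parameter update mirroring the snippet above (illustrative)."""
    velocity = momentum * velocity - step * gradient
    if nesterov:
        # the snippet applies the momentum/gradient correction a second time
        velocity = momentum * velocity - step * gradient
    return param + velocity, velocity

param, velocity = np.array([1.0, -2.0]), np.zeros(2)
grad = np.array([0.5, -0.5])
param, velocity = momentum_step(param, velocity, grad)
print(param, velocity)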
Example #4
class Dropout(BaseLayer):
    """ Dropout layer

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between 0 and 1.
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        options['proba'] = proba
        super(Dropout, self).__init__(**options)

    @property
    def size(self):
        return self.relate_to_layer.size

    def output(self, input_value):
        if not self.training_state:
            return input_value

        theano_random = theano_random_stream()
        proba = (1.0 - self.proba)
        mask = theano_random.binomial(n=1, p=proba,
                                      size=input_value.shape,
                                      dtype=input_value.dtype)
        return (mask * input_value) / proba

    def __repr__(self):
        classname = self.__class__.__name__
        return "{}(proba={})".format(classname, self.proba)
Example #5
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.95``.
    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-5``.
    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        super(RMSProp, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        mean_squred_grad = (self.decay * prev_mean_squred_grad +
                            (1 - self.decay) * gradient**2)
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
Example #6
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-5``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.RMSProp((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_param_updates(self, layer, parameter):
        step = self.variables.step

        parameter_shape = T.shape(parameter).eval()
        prev_mean_squred_grad = theano.shared(
            name="{}/prev-mean-squared-grad".format(parameter.name),
            value=asfloat(np.zeros(parameter_shape)),
        )

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        mean_squred_grad = (self.decay * prev_mean_squred_grad +
                            (1 - self.decay) * gradient**2)
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
Example #7
class RMSProp(MinibatchGradientDescent):
    """ RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.95``.
    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-5``.
    {MinibatchGradientDescent.batch_size}
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}
    {BaseNetwork.plot_errors}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        super(RMSProp, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        mean_squred_grad = (self.decay * prev_mean_squred_grad +
                            (1 - self.decay) * gradient**2)
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
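
All three RMSProp snippets above share the same update rule: an exponential moving average of the squared gradients scales the gradient step. A standalone NumPy sketch of one such update (names are illustrative):

import numpy as np

def rmsprop_step(param, mean_sq_grad, gradient, step=0.01,
                 decay=0.95, epsilon=1e-5):
    """One RMSProp update mirroring init_param_updates above."""
    mean_sq_grad = decay * mean_sq_grad + (1 - decay) * gradient ** 2
    param = param - step * gradient / np.sqrt(mean_sq_grad + epsilon)
    return param, mean_sq_grad

param = np.array([1.0, -1.0])
state = np.zeros_like(param)
for _ in range(3):
    grad = 2 * param  # gradient of sum(param ** 2)
    param, state = rmsprop_step(param, state, grad)
print(param)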
Example #8
class Momentum(GradientDescent):
    """
    Momentum algorithm.

    Parameters
    ----------
    momentum : float
        Controls the previous gradient ratio. Defaults to ``0.9``.

    nesterov : bool
        Instead of classic momentum computes Nesterov momentum.
        Defaults to ``False``.

    {GradientDescent.Parameters}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.Momentum(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    momentum = ProperFractionProperty(default=0.9)
    nesterov = Property(default=False, expected_type=bool)

    def init_train_updates(self):
        optimizer = tf.train.MomentumOptimizer(
            use_nesterov=self.nesterov,
            momentum=self.momentum,
            learning_rate=self.step,
        )
        self.functions.optimizer = optimizer
        return [optimizer.minimize(self.variables.loss)]
Example #9
class Dropout(Identity):
    """
    Dropout layer. It randomly switches off (multiplies by zero)
    input values, where the probability of being switched off for each
    value can be controlled with the ``proba`` parameter. For example,
    ``proba=0.2`` means that 20% of the input values will be
    multiplied by 0 and the remaining 80% will be left unchanged.

    It's important to note that the output of the dropout layer is
    controlled by the ``training`` parameter in the ``output`` method.
    Dropout will be applied only when ``training=True`` is propagated
    through the network, otherwise the layer acts as an identity.

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between ``0`` and ``1``.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    Examples
    --------
    >>> from neupy.layers import *
    >>> network = join(
    ...     Input(10),
    ...     Relu(5) >> Dropout(0.5),
    ...     Relu(5) >> Dropout(0.5),
    ...     Sigmoid(1),
    ... )
    >>> network
    (?, 10) -> [... 6 layers ...] -> (?, 1)
    """
    proba = ProperFractionProperty()

    def __init__(self, proba, name=None):
        super(Dropout, self).__init__(name=name)
        self.proba = proba

    def output(self, input_value, training=False):
        if not training:
            return input_value
        return tf.nn.dropout(input_value, keep_prob=(1.0 - self.proba))
Example #10
File: output.py Project: disc5/neupy
class StepOutput(Output):
    """ The behaviour for this layer is the same as for step function.

    Parameters
    ----------
    output_bounds : tuple
        Value must be a tuple that contains two elements, where the first
        one defines the lower output value and the second one the upper
        output value. Defaults to ``(0, 1)``.
    critical_point : float
        Critical point sets up the step function bias. Input values less
        than or equal to this point produce the lower bound. Defaults to ``0``.
    {Output.size}
    """
    output_bounds = TypedListProperty(default=(0, 1))
    critical_point = ProperFractionProperty(default=0)

    def output(self, value):
        lower_bound, upper_bound = self.output_bounds
        return np.where(value <= self.critical_point, lower_bound, upper_bound)
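
The ``output`` method above is a plain threshold; the same rule can be checked standalone with NumPy (assuming the default ``output_bounds=(0, 1)`` and ``critical_point=0``):

import numpy as np

# Same thresholding rule as StepOutput.output, written standalone.
value = np.array([-1.5, -0.2, 0.0, 0.3, 2.0])
lower_bound, upper_bound = 0, 1   # output_bounds
critical_point = 0                # critical_point
print(np.where(value <= critical_point, lower_bound, upper_bound))
# [0 0 0 1 1]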
Example #11
class Dropout(BaseLayer):
    """
    Dropout layer

    Parameters
    ----------
    proba : float
        Fraction of the input units to drop. Value needs to be
        between ``0`` and ``1``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    proba = ProperFractionProperty(required=True)

    def __init__(self, proba, **options):
        super(Dropout, self).__init__(proba=proba, **options)

    def output(self, input_value):
        if not self.training_state:
            return input_value

        theano_random = theano_random_stream()
        proba = (1.0 - self.proba)
        mask = theano_random.binomial(n=1,
                                      p=proba,
                                      size=input_value.shape,
                                      dtype=input_value.dtype)
        return (mask * input_value) / proba

    def __repr__(self):
        classname = self.__class__.__name__
        return "{}(proba={})".format(classname, self.proba)
Example #12
File: errdiff.py Project: degerli/neupy
class ErrDiffStepUpdate(SingleStepConfigurable):
    """
    This algorithm makes step updates based on the error difference
    between epochs.

    Parameters
    ----------
    update_for_smaller_error : float
        The ``step`` is multiplied by this option if the error
        is smaller than in the previous epoch. Defaults to ``1.05``.
        Value can't be less than ``1``.

    update_for_bigger_error : float
        The ``step`` is multiplied by this option if the error
        is bigger than in the previous epoch. Defaults to ``0.7``.

    error_difference : float
        The value indicates by how much the error has to increase
        compared to the previous epoch in order to produce a step
        reduction. Defaults to ``1.04``.
        Value can't be less than ``1``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     addons=[algorithms.ErrDiffStepUpdate]
    ... )
    """
    update_for_smaller_error = BoundedProperty(default=1.05, minval=1)
    update_for_bigger_error = ProperFractionProperty(default=0.7)
    error_difference = BoundedProperty(default=1.04, minval=1)

    def init_variables(self):
        self.variables.update(
            last_error=tf.Variable(
                np.nan,
                name='err-diff-step-update/last-error',
            ),
            previous_error=tf.Variable(
                np.nan,
                name='err-diff-step-update/previous-error',
            ),
        )
        super(ErrDiffStepUpdate, self).init_variables()

    def init_train_updates(self):
        updates = super(ErrDiffStepUpdate, self).init_train_updates()

        step = self.variables.step
        last_error = self.variables.last_error
        previous_error = self.variables.previous_error

        step_update_condition = tf.where(
            last_error < previous_error,
            self.update_for_smaller_error * step,
            tf.where(
                last_error > self.update_for_bigger_error * previous_error,
                self.update_for_bigger_error * step,
                step
            )
        )
        updates.append((step, step_update_condition))
        return updates

    def on_epoch_start_update(self, epoch):
        super(ErrDiffStepUpdate, self).on_epoch_start_update(epoch)

        previous_error = self.errors.previous()
        if previous_error:
            session = tensorflow_session()
            last_error = self.errors.last()

            self.variables.last_error.load(last_error, session)
            self.variables.previous_error.load(previous_error, session)
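
The ``tf.where`` expression in ``init_train_updates`` encodes a simple piecewise rule for the step. A plain-Python sketch of the same branching, mirroring the snippet (including the fact that the comparison threshold uses ``update_for_bigger_error``):

def adapt_step(step, last_error, previous_error,
               update_for_smaller_error=1.05,
               update_for_bigger_error=0.7):
    """Plain-Python version of the tf.where expression above."""
    if last_error < previous_error:
        return update_for_smaller_error * step   # error decreased, grow step
    if last_error > update_for_bigger_error * previous_error:
        return update_for_bigger_error * step    # error grew, shrink step
    return step                                  # otherwise keep the step

print(adapt_step(0.1, last_error=0.5, previous_error=0.8))  # ~0.105
print(adapt_step(0.1, last_error=0.9, previous_error=0.8))  # ~0.07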
Example #13
File: momentum.py Project: disc5/neupy
class Momentum(MinibatchGradientDescent):
    """ Momentum algorithm for :network:`GradientDescent` optimization.

    Parameters
    ----------
    momentum : float
        Controls the previous gradient ratio. Defaults to ``0.9``.
    nesterov : bool
        Instead of classic momentum computes Nesterov momentum.
        Defaults to ``False``.
    {MinibatchGradientDescent.batch_size}
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}
    {BaseNetwork.plot_errors}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Momentum(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    momentum = ProperFractionProperty(default=0.9)
    nesterov = Property(default=False, expected_type=bool)

    def init_layers(self):
        super(Momentum, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_param_delta = theano.shared(
                    name="prev_param_delta_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        prev_param_delta = parameter.prev_param_delta
        parameter_delta = self.momentum * prev_param_delta - step * gradient

        if self.nesterov:
            parameter_delta = self.momentum * parameter_delta - step * gradient

        return [
            (parameter, parameter + parameter_delta),
            (prev_param_delta, parameter_delta),
        ]
Example #14
class GrowingNeuralGas(BaseNetwork):
    """
    Growing Neural Gas (GNG) algorithm.

    The current algorithm has two modifications that haven't been mentioned
    in the paper, but they help to speed up training.

    - The ``n_start_nodes`` parameter provides the possibility to increase
      the number of nodes during the initialization step. It's useful when
      the algorithm takes a lot of time building up a large number of neurons.

    - The ``min_distance_for_update`` parameter allows speeding up training
      when some data samples have neurons very close to them. The
      ``min_distance_for_update`` parameter controls the minimum distance
      threshold below which weight updates are skipped for a data sample.

    Parameters
    ----------
    n_inputs : int
        Number of features in each sample.

    n_start_nodes : int
        Number of nodes that algorithm generates from the data during
        the initialization step. Defaults to ``2``.

    step : float
        Step (learning rate) for the neuron winner. Defaults to ``0.2``.

    neighbour_step : float
        Step (learning rate) for the neurons that connected via edges
        with neuron winner. This value typically has to be smaller than
        ``step`` value. Defaults to ``0.05``.

    max_edge_age : int
        If an edge isn't updated for ``max_edge_age`` iterations,
        it will be removed. The larger the value, the more updates we
        allow before removing an edge. Defaults to ``100``.

    n_iter_before_neuron_added : int
        The algorithm adds a new neuron after every
        ``n_iter_before_neuron_added`` weight updates. The smaller the
        value, the more frequently the algorithm adds new neurons to the
        network. Defaults to ``1000``.

    error_decay_rate : float
        This error decay rate would be applied to every neuron in the
        graph after each training iteration. It ensures that old errors
        will be reduced over time. Defaults to ``0.995``.

    after_split_error_decay_rate : float
        This decay rate reduces the error for the neurons with the largest
        errors after the algorithm adds a new neuron. This value is
        typically lower than ``error_decay_rate``. Defaults to ``0.5``.

    max_nodes : int
        Maximum number of nodes that can be generated during the training.
        Training won't be stopped when the maximum number of nodes
        is reached. Defaults to ``1000``.

    min_distance_for_update : float
        Parameter controls for which data samples updates are applied.
        If the euclidean distance between a data sample and its closest
        neuron is less than the ``min_distance_for_update`` value, the
        update is skipped for this data sample. Setting the value to zero
        disables the effect of this parameter. Defaults to ``0``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    {Verbose.verbose}

    Methods
    -------
    train(X_train, epochs=100)
        Network learns topological structure of the data. Learned
        structure will be stored in the ``graph`` attribute.

    {BaseSkeleton.fit}

    initialize_nodes(data)
        Network initializes nodes by randomly sampling ``n_start_nodes``
        samples from the data. It's applied automatically before
        the training in case the graph is empty.

        Note: Node re-initialization will reset the network.

    Notes
    -----
    - Unlike other algorithms this network doesn't make predictions.
      Instead, it learns the topological structure of the data in the
      form of a graph. After the training, the structure of the network
      can be extracted from the ``graph`` attribute.

    - In order to speed up training, it might be useful to increase
      the ``n_start_nodes`` parameter.

    - During the training it can happen that nodes learn the topological
      structure of one part of the data better than another, mostly
      because of different data sample densities in different places.
      Increasing the ``min_distance_for_update`` can speed up training by
      ignoring updates for the neurons that are very close to a data sample
      (below the specified ``min_distance_for_update`` value). Training will
      be stopped in case none of the neurons has been updated during
      the training epoch.

    Attributes
    ----------
    graph : NeuralGasGraph instance
        This attribute stores all neurons and connections between them
        in the form of undirected graph.

    {BaseNetwork.Attributes}

    Examples
    --------
    >>> from neupy import algorithms
    >>> from sklearn.datasets import make_blobs
    >>>
    >>> data, _ = make_blobs(
    ...     n_samples=1000,
    ...     n_features=2,
    ...     centers=2,
    ...     cluster_std=0.4,
    ... )
    >>>
    >>> neural_gas = algorithms.GrowingNeuralGas(
    ...     n_inputs=2,
    ...     shuffle_data=True,
    ...     verbose=True,
    ...     max_edge_age=10,
    ...     n_iter_before_neuron_added=50,
    ...     max_nodes=100,
    ... )
    >>> neural_gas.train(data, epochs=40)
    >>> neural_gas.graph.n_nodes
    100
    >>> len(neural_gas.graph.edges)
    175
    >>> edges = list(neural_gas.graph.edges.keys())
    >>> neuron_1, neuron_2 = edges[0]
    >>>
    >>> neuron_1.weight
    array([[-6.77166299,  2.4121606 ]])
    >>> neuron_2.weight
    array([[-6.829309  ,  2.27839633]])

    References
    ----------
    [1] A Growing Neural Gas Network Learns Topologies, Bernd Fritzke
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_start_nodes = IntProperty(minval=2, default=2)

    step = NumberProperty(default=0.2, minval=0)
    neighbour_step = NumberProperty(default=0.05, minval=0)
    max_edge_age = IntProperty(default=100, minval=1)
    max_nodes = IntProperty(default=1000, minval=1)

    n_iter_before_neuron_added = IntProperty(default=1000, minval=1)
    after_split_error_decay_rate = ProperFractionProperty(default=0.5)
    error_decay_rate = ProperFractionProperty(default=0.995)
    min_distance_for_update = NumberProperty(default=0.0, minval=0)

    def __init__(self, *args, **kwargs):
        super(GrowingNeuralGas, self).__init__(*args, **kwargs)
        self.n_updates = 0
        self.graph = NeuralGasGraph()

    def format_input_data(self, X):
        is_feature1d = self.n_inputs == 1
        X = format_data(X, is_feature1d)

        if X.ndim != 2:
            raise ValueError("Cannot make prediction, because input "
                             "data has more than 2 dimensions")

        n_samples, n_features = X.shape

        if n_features != self.n_inputs:
            raise ValueError("Input data expected to have {} features, "
                             "but got {}".format(self.n_inputs, n_features))

        return X

    def initialize_nodes(self, data):
        self.graph = NeuralGasGraph()

        for sample in sample_data_point(data, n=self.n_start_nodes):
            self.graph.add_node(NeuronNode(sample.reshape(1, -1)))

    def train(self, X_train, epochs=100):
        X_train = self.format_input_data(X_train)

        if not self.graph.nodes:
            self.initialize_nodes(X_train)

        return super(GrowingNeuralGas, self).train(
            X_train=X_train, y_train=None,
            X_test=None, y_test=None,
            epochs=epochs)

    def one_training_update(self, X_train, y_train=None):
        graph = self.graph
        step = self.step
        neighbour_step = self.neighbour_step

        max_nodes = self.max_nodes
        max_edge_age = self.max_edge_age

        error_decay_rate = self.error_decay_rate
        after_split_error_decay_rate = self.after_split_error_decay_rate
        n_iter_before_neuron_added = self.n_iter_before_neuron_added

        # We square this value, because we deal with
        # squared distances during the training.
        min_distance_for_update = np.square(self.min_distance_for_update)

        n_samples = len(X_train)
        total_error = 0
        did_update = False

        for sample in X_train:
            nodes = graph.nodes
            weights = np.concatenate([node.weight for node in nodes])

            distance = np.linalg.norm(weights - sample, axis=1)
            neuron_ids = np.argsort(distance)

            closest_neuron_id, second_closest_id = neuron_ids[:2]
            closest_neuron = nodes[closest_neuron_id]
            second_closest = nodes[second_closest_id]
            total_error += distance[closest_neuron_id]

            if distance[closest_neuron_id] < min_distance_for_update:
                continue

            self.n_updates += 1
            did_update = True

            closest_neuron.error += distance[closest_neuron_id]
            closest_neuron.weight += step * (sample - closest_neuron.weight)

            graph.add_edge(closest_neuron, second_closest)

            for to_neuron in list(graph.edges_per_node[closest_neuron]):
                edge_id = graph.find_edge_id(to_neuron, closest_neuron)
                age = graph.edges[edge_id]

                if age >= max_edge_age:
                    graph.remove_edge(to_neuron, closest_neuron)

                    if not graph.edges_per_node[to_neuron]:
                        graph.remove_node(to_neuron)

                else:
                    graph.edges[edge_id] += 1
                    to_neuron.weight += neighbour_step * (
                        sample - to_neuron.weight)

            time_to_add_new_neuron = (
                self.n_updates % n_iter_before_neuron_added == 0 and
                graph.n_nodes < max_nodes)

            if time_to_add_new_neuron:
                nodes = graph.nodes
                largest_error_neuron = max(nodes, key=attrgetter('error'))
                neighbour_neuron = max(
                    graph.edges_per_node[largest_error_neuron],
                    key=attrgetter('error'))

                largest_error_neuron.error *= after_split_error_decay_rate
                neighbour_neuron.error *= after_split_error_decay_rate

                new_weight = 0.5 * (
                    largest_error_neuron.weight + neighbour_neuron.weight
                )
                new_neuron = NeuronNode(weight=new_weight.reshape(1, -1))

                graph.remove_edge(neighbour_neuron, largest_error_neuron)
                graph.add_node(new_neuron)
                graph.add_edge(largest_error_neuron, new_neuron)
                graph.add_edge(neighbour_neuron, new_neuron)

            for node in graph.nodes:
                node.error *= error_decay_rate

        if not did_update and min_distance_for_update != 0 and n_samples > 1:
            raise StopTraining(
                "Distance between every data sample and neurons, closest "
                "to them, is less then {}".format(min_distance_for_update))

        return total_error / n_samples

    def predict(self, *args, **kwargs):
        raise NotImplementedError(
            "Growing Neural Gas algorithm doesn't make prediction. "
            "It only learns graph structure from the data "
            "(class has `graph` attribute). ")
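
The heart of ``one_training_update`` is the winner selection: find the two neurons closest to the sample and skip the update when the winner is already closer than ``min_distance_for_update``. A standalone NumPy sketch of that step (values are illustrative):

import numpy as np

# Illustrative winner selection, mirroring the loop body above.
weights = np.array([[0.0, 0.0], [1.0, 1.0], [3.0, 3.0]])  # one row per neuron
sample = np.array([0.9, 1.2])
min_distance_for_update = 0.05

distance = np.linalg.norm(weights - sample, axis=1)
closest_id, second_closest_id = np.argsort(distance)[:2]
print(closest_id, second_closest_id)                    # 1 0
print(distance[closest_id] < min_distance_for_update)   # False -> update happens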
Example #15
File: adamax.py Project: disc5/neupy
class Adamax(MinibatchGradientDescent):
    """ AdaMax algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.9``.
    beta2 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.999``.
    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-8``.
    step : float
        Learning rate, defaults to ``0.001``.
    {MinibatchGradientDescent.batch_size}
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}
    {BaseNetwork.plot_errors}
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-8, minval=0)

    def init_layers(self):
        super(Adamax, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_first_moment = theano.shared(
                    name="prev_first_moment_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )
                parameter.prev_weighted_inf_norm = theano.shared(
                    name="prev_weighted_inf_norm_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        epoch = self.variables.epoch
        prev_first_moment = parameter.prev_first_moment
        prev_weighted_inf_norm = parameter.prev_weighted_inf_norm

        step = self.variables.step
        beta1 = self.beta1
        beta2 = self.beta2

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient
        weighted_inf_norm = T.maximum(beta2 * prev_weighted_inf_norm,
                                      T.abs_(gradient))

        parameter_delta = ((1 / (1 - beta1**epoch)) *
                           (first_moment / (weighted_inf_norm + self.epsilon)))

        return [
            (prev_first_moment, first_moment),
            (prev_weighted_inf_norm, weighted_inf_norm),
            (parameter, parameter - step * parameter_delta),
        ]
Example #16
class BatchNorm(BaseLayer):
    """
    Batch-normalization layer.

    Parameters
    ----------
    axes : int, tuple of int or None
        The axis or axes along which normalization is applied.
        ``None`` means that normalization will be applied over
        all axes except the first one. In case of a 4D tensor it will
        be equal to ``(0, 2, 3)``. Defaults to ``None``.
    epsilon : float
        Epsilon is a positive constant that is added to the standard
        deviation to prevent division by zero.
        Defaults to ``1e-5``.
    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it will depend on
        the last batches seen. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.1``.
    gamma : array-like, Theano variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.
    beta : array-like, Theano variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = AxesProperty(default=None)
    alpha = ProperFractionProperty(default=0.1)
    epsilon = NumberProperty(default=1e-5, minval=0)
    gamma = ParameterProperty(default=Constant(value=1))
    beta = ParameterProperty(default=Constant(value=0))

    def initialize(self):
        super(BatchNorm, self).initialize()

        input_shape = as_tuple(None, self.input_shape)
        ndim = len(input_shape)

        if self.axes is None:
            # If ndim == 4 then axes = (0, 2, 3)
            # If ndim == 2 then axes = (0,)
            self.axes = tuple(axis for axis in range(ndim) if axis != 1)

        if any(axis >= ndim for axis in self.axes):
            raise ValueError("Cannot apply batch normalization on the axis "
                             "that doesn't exist.")

        opposite_axes = find_opposite_axes(self.axes, ndim)
        parameter_shape = [input_shape[axis] for axis in opposite_axes]

        if any(parameter is None for parameter in parameter_shape):
            unknown_dim_index = parameter_shape.index(None)
            raise ValueError("Cannot apply batch normalization on the axis "
                             "with unknown size over the dimension #{} "
                             "(0-based indeces).".format(unknown_dim_index))

        self.running_mean = theano.shared(
            name='running_mean_{}'.format(self.layer_id),
            value=asfloat(np.zeros(parameter_shape)))
        self.running_inv_std = theano.shared(
            name='running_inv_std_{}'.format(self.layer_id),
            value=asfloat(np.ones(parameter_shape)))

        if isinstance(self.gamma, Initializer):
            self.gamma = self.gamma.sample(parameter_shape)

        if isinstance(self.beta, Initializer):
            self.beta = self.beta.sample(parameter_shape)

        self.gamma = theano.shared(
            name='gamma_{}'.format(self.layer_id),
            value=asfloat(self.gamma),
        )
        self.beta = theano.shared(
            name='beta_{}'.format(self.layer_id),
            value=asfloat(self.beta),
        )
        self.parameters = [self.gamma, self.beta]

    def output(self, input_value):
        epsilon = asfloat(self.epsilon)
        alpha = asfloat(self.alpha)
        gamma, beta = self.gamma, self.beta

        ndim = input_value.ndim
        axes = self.axes

        running_mean = self.running_mean
        running_inv_std = self.running_inv_std

        input_mean = input_value.mean(axes)
        input_var = input_value.var(axes)
        input_inv_std = T.inv(T.sqrt(input_var + epsilon))

        self.updates = [
            (running_inv_std,
             asfloat(1 - alpha) * running_inv_std + alpha * input_inv_std),
            (running_mean,
             asfloat(1 - alpha) * running_mean + alpha * input_mean)
        ]

        if not self.training_state:
            mean = running_mean
            inv_std = running_inv_std

        else:
            mean = input_mean
            inv_std = input_inv_std

        opposite_axes = find_opposite_axes(axes, ndim)

        beta = dimshuffle(beta, ndim, opposite_axes)
        gamma = dimshuffle(gamma, ndim, opposite_axes)
        mean = dimshuffle(mean, ndim, opposite_axes)
        inv_std = dimshuffle(inv_std, ndim, opposite_axes)

        normalized_value = (input_value - mean) * inv_std
        return gamma * normalized_value + beta
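
In training mode the ``output`` method above boils down to normalizing each feature by the batch statistics and then applying ``gamma`` and ``beta``. A NumPy sketch of that computation for a 2D input (illustrative helper, not neupy code):

import numpy as np

def batch_norm_train(x, gamma, beta, epsilon=1e-5):
    """Training-mode batch normalization for a 2D input (batch, features)."""
    mean = x.mean(axis=0)
    inv_std = 1.0 / np.sqrt(x.var(axis=0) + epsilon)
    return gamma * (x - mean) * inv_std + beta

x = np.random.randn(64, 3) * 5 + 10
out = batch_norm_train(x, gamma=np.ones(3), beta=np.zeros(3))
print(out.mean(axis=0).round(6), out.std(axis=0).round(3))  # ~0 and ~1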
Example #17
File: adam.py Project: degerli/neupy
class Adam(GradientDescent):
    """
    Adam algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.9``.

    beta2 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.999``.

    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``0.001``.

    {GradientDescent.batch_size}

    {BaseGradientDescent.addons}

    {ConstructibleNetwork.connection}

    {ConstructibleNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    References
    ----------
    [1] Diederik P. Kingma, Jimmy Lei Ba
        Adam: a Method for Stochastic Optimization.
        https://arxiv.org/pdf/1412.6980.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Adam((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_variables(self):
        super(Adam, self).init_variables()

        self.variables.iteration = tf.Variable(
            asfloat(1),
            name='iteration',
            dtype=tf.float32,
        )

    def init_train_updates(self):
        updates = []

        iteration = self.variables.iteration
        step = self.variables.step

        # Since beta1 and beta2 are typically close to 1 and initial
        # values for first and second moments are close to zero the
        # initial estimates for these moments will be biased towards zero.
        # In order to solve this problem we need to correct this bias
        # by rescaling moments with large values during first updates
        # and vanishing this scaling factor more and more after every
        # update.
        #
        # Note that bias correction factor has been changed in order
        # to improve computational speed (suggestion from the original
        # paper).
        bias_correction = (
            tf.sqrt(1. - self.beta2 ** iteration) /
            (1. - self.beta1 ** iteration)
        )

        for layer, parameter, gradient in self.iter_params_and_grads():
            prev_first_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-first-moment".format(parameter.op.name),
                dtype=tf.float32,
            )
            prev_second_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-second-moment".format(parameter.op.name),
                dtype=tf.float32,
            )

            first_moment = (
                self.beta1 * prev_first_moment +
                (1. - self.beta1) * gradient
            )
            second_moment = (
                self.beta2 * prev_second_moment +
                (1. - self.beta2) * gradient ** 2
            )

            parameter_delta = bias_correction * first_moment / (
                tf.sqrt(second_moment) + self.epsilon)

            updates.extend([
                (prev_first_moment, first_moment),
                (prev_second_moment, second_moment),
                (parameter, parameter - step * parameter_delta),
            ])

        updates.append((iteration, iteration + 1))
        return updates
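
The folded bias-correction factor from the comment above can be checked with a plain NumPy version of the full Adam step (illustrative helper, defaults mirror the properties above):

import numpy as np

def adam_step(param, m, v, grad, t, step=0.001,
              beta1=0.9, beta2=0.999, epsilon=1e-7):
    """One Adam update with the folded bias-correction factor used above."""
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad ** 2
    bias_correction = np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    param = param - step * bias_correction * m / (np.sqrt(v) + epsilon)
    return param, m, v

param, m, v = np.array([1.0]), np.zeros(1), np.zeros(1)
for t in range(1, 4):
    param, m, v = adam_step(param, m, v, grad=2 * param, t=t)
print(param)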
Example #18
class A(Configurable):
    fraction = ProperFractionProperty()
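
``ProperFractionProperty`` is used throughout these snippets for values that must stay between ``0`` and ``1``. A hedged usage sketch of the class above; the assumption that an out-of-range assignment raises ``ValueError`` is not shown in the snippet itself:

# Hypothetical usage of the class above; assumes ProperFractionProperty
# validates that assigned values stay between 0 and 1.
a = A()
a.fraction = 0.25      # accepted, a proper fraction
try:
    a.fraction = 1.5   # assumed to be rejected by the property descriptor
except ValueError as error:
    print(error)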
Example #19
File: adadelta.py Project: wjianxz/neupy
class Adadelta(GradientDescent):
    """
    Adadelta algorithm.

    Parameters
    ----------
    rho : float
        Decay rate. Value needs to be between ``0``
        and ``1``. Defaults to ``0.95``.

    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``1.0``. The original paper doesn't
        specify a learning rate. A step value equal to ``1.0``
        allows achieving the same effect, since multiplication by one
        won't have any effect on the update.

    {GradientDescent.batch_size}

    {BaseOptimizer.regularizer}

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.Adadelta(network)
    >>> optimizer.train(x_train, y_train)

    References
    ----------
    [1] Matthew D. Zeiler,
        ADADELTA: An Adaptive Learning Rate Method
        https://arxiv.org/pdf/1212.5701.pdf
    """
    step = ScalarVariableProperty(default=1.0)
    rho = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_train_updates(self):
        optimizer = tf.train.AdadeltaOptimizer(
            rho=self.rho,
            epsilon=self.epsilon,
            learning_rate=self.step,
        )
        self.functions.optimizer = optimizer
        return [optimizer.minimize(self.variables.loss)]
Example #20
File: hessdiag.py Project: wjianxz/neupy
class HessianDiagonal(BaseOptimizer):
    """
    Hessian diagonal is an approximation of the Hessian algorithm that
    requires computation of only the diagonal elements of the hessian
    matrix, which makes its inversion much easier and faster.

    Parameters
    ----------
    min_eigval : float
        Sets up the minimum eigenvalue for the Hessian diagonal matrix.
        After a few iterations the elements become extremely small and the
        matrix inverse produces huge numbers in the hessian diagonal
        elements. This parameter controls the size of the diagonal
        elements. Defaults to ``1e-2``.

    {BaseOptimizer.Parameters}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.HessianDiagonal(network)
    >>> optimizer.train(x_train, y_train)

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    See Also
    --------
    :network:`BaseOptimizer` : BaseOptimizer algorithm.
    :network:`Hessian` : Newton's method.
    """
    min_eigval = ProperFractionProperty(default=1e-2)

    def init_train_updates(self):
        step = self.step
        inv_min_eigval = 1 / self.min_eigval
        variables = self.network.variables
        parameters = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(parameters)

        gradients = tf.gradients(self.variables.loss, parameters)
        full_gradient = make_single_vector(gradients)

        second_derivatives = []
        for parameter, gradient in zip(parameters, gradients):
            second_derivative, = tf.gradients(gradient, parameter)
            second_derivatives.append(flatten(second_derivative))

        hessian_diag = tf.concat(second_derivatives, axis=0)

        # It's easier to clip the inverse hessian rather than the hessian.
        inv_hessian_diag = tf.clip_by_value(
            # The inverse of a diagonal matrix is easy to compute with
            # an elementwise inverse operation.
            1 / hessian_diag,
            -inv_min_eigval,
            inv_min_eigval,
        )
        updates = setup_parameter_updates(
            parameters, param_vector - step * full_gradient * inv_hessian_diag)
        return updates
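
The update above divides the gradient elementwise by the clipped Hessian diagonal. A NumPy sketch of the same rule on a quadratic loss whose Hessian diagonal is known exactly (names are illustrative):

import numpy as np

def hessian_diagonal_step(param, grad, hess_diag, step=1.0, min_eigval=1e-2):
    """Elementwise Newton-like step with a clipped inverse Hessian diagonal."""
    inv_hess_diag = np.clip(1.0 / hess_diag, -1.0 / min_eigval, 1.0 / min_eigval)
    return param - step * grad * inv_hess_diag

# loss = 2 * x0 ** 2 + 0.5 * x1 ** 2  ->  the Hessian diagonal is [4, 1]
param = np.array([3.0, -2.0])
grad = np.array([4 * param[0], 1 * param[1]])
print(hessian_diagonal_step(param, grad, hess_diag=np.array([4.0, 1.0])))
# [0. 0.]  one step lands on the minimum for a diagonal quadratic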
Example #21
class Adam(MinibatchGradientDescent):
    """ Adam algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.9``.
    beta2 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.999``.
    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-7``.
    step : float
        Learning rate, defaults to ``0.001``.
    {MinibatchGradientDescent.batch_size}
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_layers(self):
        super(Adam, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_first_moment = theano.shared(
                    name="prev_first_moment_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )
                parameter.prev_second_moment = theano.shared(
                    name="prev_second_moment_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        epoch = self.variables.epoch
        prev_first_moment = parameter.prev_first_moment
        prev_second_moment = parameter.prev_second_moment

        step = asfloat(self.variables.step)
        beta1 = asfloat(self.beta1)
        beta2 = asfloat(self.beta2)
        epsilon = asfloat(self.epsilon)

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        first_moment = (
            beta1 * prev_first_moment +
            asfloat(1. - beta1) * gradient)
        second_moment = (
            beta2 * prev_second_moment +
            asfloat(1. - beta2) * gradient ** 2
        )

        first_moment_bias_corrected = first_moment / (1. - beta1 ** epoch)
        second_moment_bias_corrected = second_moment / (1. - beta2 ** epoch)

        parameter_delta = first_moment_bias_corrected / (
            T.sqrt(second_moment_bias_corrected) + epsilon
        )

        return [
            (prev_first_moment, first_moment),
            (prev_second_moment, second_moment),
            (parameter, parameter - step * parameter_delta),
        ]
Example #22
class Adamax(GradientDescent):
    """
    AdaMax algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.9``.

    beta2 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.999``.

    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``0.002``.

    {GradientDescent.batch_size}

    {BaseOptimizer.regularizer}

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    {Verbose.verbose}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> mnet = algorithms.Adamax(network)
    >>> mnet.train(x_train, y_train)

    References
    ----------
    [1] Diederik P. Kingma, Jimmy Lei Ba
        Adam: a Method for Stochastic Optimization.
        https://arxiv.org/pdf/1412.6980.pdf
    """
    step = ScalarVariableProperty(default=0.002)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_functions(self):
        self.variables.iteration = tf.Variable(
            asfloat(1),
            name='iteration',
            dtype=tf.float32,
        )
        super(Adamax, self).init_functions()

    def init_train_updates(self):
        iteration = self.variables.iteration
        beta1 = self.beta1
        beta2 = self.beta2

        updates = []
        variables = []

        for (_, _), variable in self.network.variables.items():
            if variable.trainable:
                variables.append(variable)

        gradients = tf.gradients(self.variables.loss, variables)
        scale = self.step / (1. - beta1 ** iteration)

        for parameter, gradient in zip(variables, gradients):
            prev_first_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-first-moment".format(parameter.op.name),
                dtype=tf.float32,
            )
            prev_weighted_inf_norm = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-weighted-inf-norm".format(parameter.op.name),
                dtype=tf.float32,
            )

            first_moment = beta1 * prev_first_moment + (1. - beta1) * gradient
            weighted_inf_norm = tf.maximum(
                beta2 * prev_weighted_inf_norm,
                tf.abs(gradient),
            )

            parameter_delta = (
                scale * (first_moment / (weighted_inf_norm + self.epsilon)))

            updates.extend([
                (prev_first_moment, first_moment),
                (prev_weighted_inf_norm, weighted_inf_norm),
                (parameter, parameter - parameter_delta),
            ])

        updates.append((iteration, iteration + 1))
        return updates
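
AdaMax replaces Adam's second moment with an exponentially weighted infinity norm, which is what the ``tf.maximum`` call above computes. A NumPy sketch of one such update (illustrative helper, defaults mirror the properties above):

import numpy as np

def adamax_step(param, m, u, grad, t, step=0.002,
                beta1=0.9, beta2=0.999, epsilon=1e-7):
    """One AdaMax update mirroring the loop in init_train_updates above."""
    m = beta1 * m + (1 - beta1) * grad
    u = np.maximum(beta2 * u, np.abs(grad))     # weighted infinity norm
    scale = step / (1 - beta1 ** t)             # bias correction for m
    return param - scale * m / (u + epsilon), m, u

param, m, u = np.array([1.0]), np.zeros(1), np.zeros(1)
for t in range(1, 4):
    param, m, u = adamax_step(param, m, u, grad=2 * param, t=t)
print(param)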
Example #23
class QuasiNewton(NoStepSelection, GradientDescent):
    """
    Quasi-Newton optimization algorithm.

    Parameters
    ----------
    {GradientDescent.Parameters}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs',
    ...     verbose=False
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    update_function = ChoiceProperty(default='bfgs',
                                     choices={
                                         'bfgs': bfgs,
                                         'dfp': dfp,
                                         'psb': psb,
                                         'sr1': sr1,
                                     })
    h0_scale = NumberProperty(default=1, minval=0)
    gradient_tol = ProperFractionProperty(default=1e-5)

    def init_variables(self):
        super(QuasiNewton, self).init_variables()
        n_params = sum(p.get_value().size for p in iter_parameters(self))
        self.variables.update(
            inv_hessian=theano.shared(
                name='inv_hessian',
                value=asfloat(self.h0_scale * np.eye(int(n_params))),
            ),
            prev_params=theano.shared(
                name='prev_params',
                value=asfloat(np.zeros(n_params)),
            ),
            prev_full_gradient=theano.shared(
                name='prev_full_gradient',
                value=asfloat(np.zeros(n_params)),
            ),
        )

    def init_train_updates(self):
        network_input = self.variables.network_input
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1), inv_hessian,
            self.update_function(inv_hessian, param_vector - prev_params,
                                 full_gradient - prev_full_gradient))
        param_delta = -new_inv_hessian.dot(full_gradient)

        def prediction(step):
            # TODO: I need to update this ugly solution later
            updated_params = param_vector + step * param_delta

            layer_input = network_input
            start_pos = 0
            for layer in self.layers:
                for param in layer.parameters:
                    end_pos = start_pos + param.size
                    parameter_name, parameter_id = param.name.split('_')
                    setattr(
                        layer, parameter_name,
                        T.reshape(updated_params[start_pos:end_pos],
                                  param.shape))
                    start_pos = end_pos
                layer_input = layer.output(layer_input)
            return layer_input

        def phi(step):
            return self.error(network_output, prediction(step))

        def derphi(step):
            error_func = self.error(network_output, prediction(step))
            return T.grad(error_func, wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
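The quasi-Newton step above scales the gradient by an approximate inverse Hessian that is refined every epoch through ``update_function``. As a minimal, hedged sketch of what the ``bfgs`` choice computes (the function and variable names below are illustrative, not part of NeuPy's API):

import numpy as np

def bfgs_inverse_hessian_update(inv_hessian, param_delta, gradient_delta):
    """Hypothetical NumPy sketch of the BFGS inverse-Hessian update."""
    n = inv_hessian.shape[0]
    # rho is the reciprocal of the curvature term y^T s
    rho = 1.0 / np.dot(gradient_delta, param_delta)
    identity = np.eye(n)

    left = identity - rho * np.outer(param_delta, gradient_delta)
    right = identity - rho * np.outer(gradient_delta, param_delta)
    return left @ inv_hessian @ right + rho * np.outer(param_delta, param_delta)

# Usage: start from a scaled identity (the role of ``h0_scale``) and refine
# it each epoch with the parameter and gradient differences.
inv_hessian = np.eye(3)
s = np.array([0.10, -0.20, 0.05])   # param_vector - prev_params
y = np.array([0.30, -0.10, 0.20])   # full_gradient - prev_full_gradient
inv_hessian = bfgs_inverse_hessian_update(inv_hessian, s, y)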
Example #24
class ART1(BaseNetwork):
    """ Adaptive Resonance Theory (ART1) Network for binary
    data clustering.

    Notes
    -----
    * Weights are not random, so the result will always be reproducible.

    Parameters
    ----------
    rho : float
        Controls the reset action in the training process. Value must be
        between ``0`` and ``1``, defaults to ``0.5``.
    n_clusters : int
        Number of clusters, defaults to ``2``. Minimum value is ``2``.
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}

    Methods
    -------
    train(input_data):
        The network will train until it clusters all samples.
    {BaseSkeleton.predict}
    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([
    ...     [0, 1, 0],
    ...     [1, 0, 0],
    ...     [1, 1, 0],
    ... ])
    >>>
    >>> artnet = algorithms.ART1(
    ...     step=2,
    ...     rho=0.7,
    ...     n_clusters=2,
    ...     verbose=False
    ... )
    >>> artnet.predict(data)
    array([ 0.,  1.,  1.])
    """
    rho = ProperFractionProperty(default=0.5)
    n_clusters = IntProperty(default=2, minval=2)

    def train(self, input_data):
        input_data = format_data(input_data)

        if input_data.ndim != 2:
            raise ValueError("Input value must be 2 dimensional, got "
                             "{0}".format(input_data.ndim))

        data_size = input_data.shape[1]
        n_clusters = self.n_clusters
        step = self.step
        rho = self.rho

        if list(sort(unique(input_data))) != [0, 1]:
            raise ValueError("ART1 Network works only with binary matrix, "
                             "all matix must contains only 0 and 1")

        if not hasattr(self, 'weight_21'):
            self.weight_21 = ones((data_size, n_clusters))

        if not hasattr(self, 'weight_12'):
            self.weight_12 = step / (step + n_clusters - 1) * self.weight_21.T

        weight_21 = self.weight_21
        weight_12 = self.weight_12

        if data_size != weight_21.shape[0]:
            raise ValueError(
                "Data dimension is invalid. Get {} columns data set. "
                "Must be - {} columns".format(data_size, weight_21.shape[0]))

        classes = zeros(input_data.shape[0])

        # Train network
        for i, p in enumerate(input_data):
            disabled_neurons = []
            reseted_values = []
            reset = True

            while reset:
                output1 = p
                input2 = dot(weight_12, output1.T)

                output2 = zeros(input2.size)
                input2[disabled_neurons] = -inf
                winner_index = input2.argmax()
                output2[winner_index] = 1

                expectation = dot(weight_21, output2)
                output1 = logical_and(p, expectation).astype(int)

                reset_value = dot(output1.T, output1) / dot(p.T, p)
                reset = reset_value < rho

                if reset:
                    disabled_neurons.append(winner_index)
                    reseted_values.append((reset_value, winner_index))

                if len(disabled_neurons) >= n_clusters:
                    # This happens only after testing all possible clusters
                    reset = False
                    winner_index = None

                if not reset:
                    if winner_index is not None:
                        weight_12[winner_index, :] = (step * output1) / (
                            step + dot(output1.T, output1) - 1)
                        weight_21[:, winner_index] = output1
                    else:
                        # Pick the cluster with the highest reset value
                        winner_index = max(reseted_values)[1]

                    classes[i] = winner_index

        return classes

    def predict(self, input_data):
        return self.train(input_data)
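The heart of ART1's training loop is the vigilance test: the winner's expectation is compared with the input pattern and, when the overlap ratio falls below ``rho``, the winner is disabled and the search continues. A minimal NumPy sketch of that test, with an illustrative helper name that is not part of the library:

import numpy as np

def vigilance_passed(pattern, expectation, rho):
    """Hypothetical sketch of the ART1 vigilance test for one binary sample."""
    overlap = np.logical_and(pattern, expectation).astype(int)
    # Share of the input's active bits that the expectation also has active
    reset_value = overlap.sum() / pattern.sum()
    return reset_value >= rho

pattern = np.array([1, 1, 0, 1])
expectation = np.array([1, 0, 0, 1])          # weight_21 column of the winner
print(vigilance_passed(pattern, expectation, rho=0.5))   # True  (2/3 >= 0.5)
print(vigilance_passed(pattern, expectation, rho=0.8))   # False (2/3 <  0.8)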
Example #25
class BatchNorm(BaseLayer):
    """
    Batch-normalization layer.

    Parameters
    ----------
    axes : int, tuple with int or None
        The axis or axes along which normalization is applied.
        ``None`` means that normalization will be applied over
        all axes except the first one. In case of 4D tensor it will
        be equal to ``(0, 1, 2)``. Defaults to ``None``.

    epsilon : float
        Epsilon is a small positive constant added to the standard
        deviation to prevent division by zero.
        Defaults to ``1e-5``.

    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it will depend on
        the last batches seen. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.1``.

    gamma : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    beta : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_mean : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_inv_std : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = AxesProperty(default=None)
    epsilon = NumberProperty(default=1e-5, minval=0)
    alpha = ProperFractionProperty(default=0.1)
    beta = ParameterProperty(default=init.Constant(value=0))
    gamma = ParameterProperty(default=init.Constant(value=1))

    running_mean = ParameterProperty(default=init.Constant(value=0))
    running_inv_std = ParameterProperty(default=init.Constant(value=1))

    def initialize(self):
        super(BatchNorm, self).initialize()

        input_shape = as_tuple(None, self.input_shape)
        ndim = len(input_shape)

        if self.axes is None:
            # If ndim == 4 then axes = (0, 1, 2)
            # If ndim == 2 then axes = (0,)
            self.axes = tuple(range(ndim - 1))

        if any(axis >= ndim for axis in self.axes):
            raise ValueError("Cannot apply batch normalization on the axis "
                             "that doesn't exist.")

        opposite_axes = find_opposite_axes(self.axes, ndim)
        parameter_shape = [
            input_shape[axis] if axis in opposite_axes else 1
            for axis in range(ndim)
        ]

        if any(parameter is None for parameter in parameter_shape):
            unknown_dim_index = parameter_shape.index(None)
            raise ValueError("Cannot apply batch normalization on the axis "
                             "with unknown size over the dimension #{} "
                             "(0-based indeces).".format(unknown_dim_index))

        self.add_parameter(value=self.running_mean,
                           shape=parameter_shape,
                           name='running_mean',
                           trainable=False)
        self.add_parameter(value=self.running_inv_std,
                           shape=parameter_shape,
                           name='running_inv_std',
                           trainable=False)

        self.add_parameter(value=self.gamma,
                           name='gamma',
                           shape=parameter_shape,
                           trainable=True)
        self.add_parameter(value=self.beta,
                           name='beta',
                           shape=parameter_shape,
                           trainable=True)

    def output(self, input_value):
        alpha = asfloat(self.alpha)
        running_mean = self.running_mean
        running_inv_std = self.running_inv_std

        if not self.training_state:
            mean, inv_std = running_mean, running_inv_std
        else:
            mean = tf.reduce_mean(
                input_value,
                self.axes,
                keepdims=True,
                name="mean",
            )
            variance = tf.reduce_mean(
                tf.squared_difference(input_value, tf.stop_gradient(mean)),
                self.axes,
                keepdims=True,
                name="variance",
            )
            inv_std = tf.rsqrt(variance + asfloat(self.epsilon))

            self.updates = [
                (running_inv_std,
                 asfloat(1 - alpha) * running_inv_std + alpha * inv_std),
                (running_mean,
                 asfloat(1 - alpha) * running_mean + alpha * mean)
            ]

        normalized_value = (input_value - mean) * inv_std
        return self.gamma * normalized_value + self.beta
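During training the layer normalizes with batch statistics and keeps exponential moving averages of the mean and inverse standard deviation, while at prediction time it reuses the running values. A hedged NumPy sketch of that forward pass (the function and argument names are illustrative, not NeuPy's API):

import numpy as np

def batch_norm_forward(x, gamma, beta, running_mean, running_inv_std,
                       alpha=0.1, epsilon=1e-5, training=True):
    """Hypothetical sketch of batch normalization with running statistics."""
    if training:
        mean = x.mean(axis=0, keepdims=True)
        inv_std = 1.0 / np.sqrt(x.var(axis=0, keepdims=True) + epsilon)
        # Exponential moving averages, mirroring the ``alpha`` parameter
        running_mean = (1 - alpha) * running_mean + alpha * mean
        running_inv_std = (1 - alpha) * running_inv_std + alpha * inv_std
    else:
        mean, inv_std = running_mean, running_inv_std

    normalized = (x - mean) * inv_std
    return gamma * normalized + beta, running_mean, running_inv_std

x = np.random.randn(8, 4)
output, new_mean, new_inv_std = batch_norm_forward(
    x, gamma=np.ones((1, 4)), beta=np.zeros((1, 4)),
    running_mean=np.zeros((1, 4)), running_inv_std=np.ones((1, 4)))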
Example #26
File: rprop.py  Project: degerli/neupy
class RPROP(StepSelectionBuiltIn, BaseGradientDescent):
    """
    Resilient backpropagation (RPROP) is an optimization
    algorithm for supervised learning.

    The RPROP algorithm takes into account only the direction of the gradient
    and completely ignores its magnitude. Every weight value has a unique
    step size associated with it (by default all of them are equal to
    ``step``).

    The rule is as follows: when the gradient direction changes (the sign of
    the gradient flips), we decrease the step size for that specific weight
    by multiplying it by ``decrease_factor``; if the sign stays the same, we
    increase the step size for that weight by multiplying it by
    ``increase_factor``.

    The step size is always bounded by ``minstep`` and ``maxstep``.

    Notes
    -----
    The algorithm doesn't work with mini-batches.

    Parameters
    ----------
    minstep : float
        Minimum possible value for step. Defaults to ``0.001``.

    maxstep : float
        Maximum possible value for step. Defaults to ``10``.

    increase_factor : float
        Increase factor for the step in case the gradient doesn't change
        sign compared to the previous epoch.

    decrease_factor : float
        Decrease factor for the step in case the gradient changes sign
        compared to the previous epoch.

    {BaseGradientDescent.Parameters}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> rpropnet = algorithms.RPROP((2, 3, 1))
    >>> rpropnet.train(x_train, y_train)

    See Also
    --------
    :network:`IRPROPPlus` : iRPROP+ algorithm.
    :network:`GradientDescent` : GradientDescent algorithm.
    """

    # These properties define the upper and lower bounds for the steps.
    minstep = BoundedProperty(default=0.001, minval=0)
    maxstep = BoundedProperty(default=10, minval=0)

    # These properties increase/decrease the step by multiplying it
    # by some coefficient.
    increase_factor = BoundedProperty(minval=1, default=1.2)
    decrease_factor = ProperFractionProperty(default=0.5)

    def update_prev_delta(self, prev_delta):
        return prev_delta

    def init_train_updates(self):
        updates = []

        for layer, parameter, gradient in self.iter_params_and_grads():
            with tf.variable_scope(parameter.op.name):
                steps = tf.Variable(
                    # Steps will be decreased after the first iteration,
                    # because all previous gradients are equal to zero.
                    # In order to make sure that network will use the same
                    # step per every weight we re-scale step and after the
                    # first iteration it will be multiplied by
                    # ``decrease_factor`` and scaled back to the default
                    # step value.
                    tf.ones_like(parameter) * self.step,
                    name="steps",
                    dtype=tf.float32,
                )
                prev_delta = tf.Variable(
                    tf.zeros(parameter.shape),
                    name="prev-delta",
                    dtype=tf.float32,
                )
                # We collect only signs since it ensures numerical stability
                # after multiplication when we deal with small numbers.
                prev_gradient_sign = tf.Variable(
                    tf.zeros(parameter.shape),
                    name="prev-grad-sign",
                    dtype=tf.float32,
                )

            updated_prev_delta = self.update_prev_delta(prev_delta)
            gradient_sign = tf.sign(gradient)

            grad_sign_product = gradient_sign * prev_gradient_sign
            gradient_changed_sign = tf.equal(grad_sign_product, -1)

            updated_steps = tf.clip_by_value(
                tf.where(
                    tf.equal(grad_sign_product, 1),
                    steps * self.increase_factor,
                    tf.where(
                        gradient_changed_sign,
                        steps * self.decrease_factor,
                        steps,
                    )
                ),
                self.minstep,
                self.maxstep,
            )
            parameter_delta = tf.where(
                gradient_changed_sign,
                # If we subtract the previous negative weight update, it
                # means that we revert the weight update that was applied
                # in the previous iteration.
                -updated_prev_delta,
                updated_steps * gradient_sign,
            )
            # Making sure that during the next iteration sign, after
            # we multiplied by the new gradient, won't be negative.
            # Otherwise, the same roll back using previous delta
            # won't make much sense.
            clipped_gradient_sign = tf.where(
                gradient_changed_sign,
                tf.zeros_like(gradient_sign),
                gradient_sign,
            )

            updates.extend([
                (parameter, parameter - parameter_delta),
                (steps, updated_steps),
                (prev_gradient_sign, clipped_gradient_sign),
                (prev_delta, parameter_delta),
            ])

        return updates
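Stripped of the TensorFlow graph machinery, the per-weight RPROP rule reduces to a sign comparison followed by a clipped step-size update. A simplified, hedged NumPy sketch of one such update (it omits the roll-back of the previous delta handled above; the helper name is illustrative):

import numpy as np

def rprop_step(steps, gradient, prev_gradient_sign, increase_factor=1.2,
               decrease_factor=0.5, minstep=0.001, maxstep=10.0):
    """Hypothetical sketch of one per-weight RPROP step-size update."""
    gradient_sign = np.sign(gradient)
    sign_product = gradient_sign * prev_gradient_sign

    # Grow the step when the sign is stable, shrink it when the sign flips
    new_steps = np.where(sign_product > 0, steps * increase_factor,
                np.where(sign_product < 0, steps * decrease_factor, steps))
    new_steps = np.clip(new_steps, minstep, maxstep)

    # The weight update uses only the sign of the gradient, not its magnitude
    delta = new_steps * gradient_sign
    return new_steps, delta, gradient_sign

steps = np.full(3, 0.1)
prev_sign = np.array([1.0, -1.0, 0.0])           # stable, flipped, fresh
gradient = np.array([0.5, 0.2, -0.1])
new_steps, delta, sign = rprop_step(steps, gradient, prev_sign)
print(new_steps)                                 # 0.12, 0.05 and 0.1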
Example #27
class LeakStepAdaptation(SingleStepConfigurable):
    """
    Leak Learning Rate Adaptation algorithm is a step
    adaptation procedure for the backpropagation algorithm.

    Parameters
    ----------
    leak_size : float
        Defaults to ``0.01``. This variable identifies a
        proportion, so it's always between ``0`` and ``1``.
        Typically this value is small.

    alpha : float
        The ``alpha`` controls the total step update ratio.
        Defaults to ``0.001``. Typically this value is small.

    beta : float
        This is similar to ``alpha``, but it controls the ratio
        only for the norm of the update matrix. Defaults to ``20``.
        Typically this value is bigger than ``1``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     addons=[algorithms.LeakStepAdaptation]
    ... )

    References
    ----------
    .. [1] Noboru M. "Adaptive on-line learning in changing
           environments", 1997

    .. [2] LeCun, "Efficient BackProp", 1998
    """
    leak_size = ProperFractionProperty(default=0.01)
    alpha = BoundedProperty(default=0.001, minval=0)
    beta = BoundedProperty(default=20, minval=0)

    def init_variables(self):
        super(LeakStepAdaptation, self).init_variables()

        n_parameters = count_parameters(self.connection)
        self.variables.leak_average = tf.Variable(
            tf.zeros(n_parameters),
            name="leak-step-adapt/leak-average",
            dtype=tf.float32,
        )

    def init_train_updates(self):
        updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha = asfloat(self.alpha)
        beta = asfloat(self.beta)
        leak_size = asfloat(self.leak_size)

        step = self.variables.step
        leak_average = self.variables.leak_average

        parameters = parameter_values(self.connection)
        gradients = tf.gradients(self.variables.error_func, parameters)
        full_gradient = tf.concat([flatten(grad) for grad in gradients],
                                  axis=0)

        leak_average_update = ((1 - leak_size) * leak_average +
                               leak_size * full_gradient)
        new_step = step + alpha * step * (beta * tf.norm(leak_average_update) -
                                          step)

        updates.extend([
            (leak_average, leak_average_update),
            (step, new_step),
        ])

        return updates
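The addon keeps a leaky (exponential) average of the flattened gradient and nudges the global step toward ``beta`` times the norm of that average. A hedged NumPy sketch of one adaptation step, mirroring the two update expressions above (names are illustrative):

import numpy as np

def leak_step_update(step, leak_average, full_gradient,
                     leak_size=0.01, alpha=0.001, beta=20.0):
    """Hypothetical sketch of a single Leak Learning Rate Adaptation update."""
    # Leaky exponential average of the concatenated gradient vector
    leak_average = (1 - leak_size) * leak_average + leak_size * full_gradient
    # Pull the step toward ``beta`` times the norm of the averaged gradient
    step = step + alpha * step * (beta * np.linalg.norm(leak_average) - step)
    return step, leak_average

step, leak_average = 0.1, np.zeros(5)
gradient = np.array([0.2, -0.1, 0.05, 0.0, 0.3])
step, leak_average = leak_step_update(step, leak_average, gradient)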
Example #28
class DropBlock(Identity):
    """
    DropBlock, a form of structured dropout, where units in a contiguous
    region of a feature map are dropped together.

    Parameters
    ----------
    keep_proba : float
        Fraction of the input units to keep. Value needs to be
        between ``0`` and ``1``.

    block_size : int or tuple
        Size of the block to be dropped. Blocks that have a square shape can
        be specified with a single integer value. For example,
        ``block_size=5`` is the same as ``block_size=(5, 5)``.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    See Also
    --------
    :layer:`Dropout` : Dropout layer.

    References
    ----------
    .. [1] Golnaz Ghiasi, Tsung-Yi Lin, Quoc V. Le. DropBlock: A
           regularization method for convolutional networks, 2018.

    Examples
    --------
    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((28, 28, 1)),
    ...
    ...     Convolution((3, 3, 16)) >> Relu(),
    ...     DropBlock(keep_proba=0.1, block_size=5),
    ...
    ...     Convolution((3, 3, 32)) >> Relu(),
    ...     DropBlock(keep_proba=0.1, block_size=5),
    ... )
    """
    keep_proba = ProperFractionProperty()
    block_size = TypedListProperty(n_elements=2)

    def __init__(self, keep_proba, block_size, name=None):
        super(DropBlock, self).__init__(name=name)

        if isinstance(block_size, int):
            block_size = (block_size, block_size)

        self.keep_proba = keep_proba
        self.block_size = block_size

    def get_output_shape(self, input_shape):
        input_shape = tf.TensorShape(input_shape)

        if input_shape and input_shape.ndims != 4:
            raise LayerConnectionError(
                "DropBlock layer expects input with 4 dimensions, got {} "
                "with shape {}".format(len(input_shape), input_shape))

        return input_shape

    def output(self, input, training=False):
        if not training:
            return input

        input = tf.convert_to_tensor(input, tf.float32)
        input_shape = tf.shape(input)

        block_height, block_width = self.block_size
        height, width = input_shape[1], input_shape[2]

        input_area = asfloat(width * height)
        block_area = asfloat(block_width * block_height)
        area = asfloat((width - block_width + 1) * (height - block_height + 1))

        mask = bernoulli_sample(
            mean=(1. - self.keep_proba) * input_area / (block_area * area),
            shape=[
                input_shape[0],
                height - block_height + 1,
                width - block_width + 1,
                input_shape[3],
            ],
        )

        br_height = (block_height - 1) // 2
        tl_height = (block_height - 1) - br_height

        br_width = (block_width - 1) // 2
        tl_width = (block_width - 1) - br_width

        mask = tf.pad(mask, [
            [0, 0],
            [tl_height, br_height],
            [tl_width, br_width],
            [0, 0],
        ])
        mask = tf.nn.max_pool(
            mask,
            [1, block_height, block_width, 1],
            strides=[1, 1, 1, 1],
            padding='SAME',
        )
        mask = tf.cast(1 - mask, tf.float32)

        feature_normalizer = asfloat(tf.size(mask)) / tf.reduce_sum(mask)
        return tf.multiply(input, mask) * feature_normalizer
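The mean passed to ``bernoulli_sample`` above is the per-position probability of seeding a block: it is chosen so that, once every seed is expanded to ``block_size``, roughly ``1 - keep_proba`` of the feature-map units end up dropped. A hedged sketch of that rate computation (the helper name is illustrative):

def dropblock_seed_rate(keep_proba, block_size, feature_map_size):
    """Hypothetical sketch of the DropBlock seeding probability (gamma)."""
    block_height, block_width = block_size
    height, width = feature_map_size

    input_area = height * width
    block_area = block_height * block_width
    # Positions where a block can be seeded without crossing the border
    valid_area = (height - block_height + 1) * (width - block_width + 1)

    return (1.0 - keep_proba) * input_area / (block_area * valid_area)

gamma = dropblock_seed_rate(keep_proba=0.9, block_size=(5, 5),
                            feature_map_size=(24, 24))
print(round(gamma, 5))    # ~0.00576, so about 10% of the units get dropped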
Example #29
File: leak_step.py  Project: disc5/neupy
class LeakStepAdaptation(SingleStepConfigurable):
    """ Leak Learning Rate Adaptation algorithm for step adaptation procedure
    in backpropagation algortihm. By default every layer has the same value
    as ``step`` parameter in network, but after first training epoch they
    must be different.

    Parameters
    ----------
    leak_size : float
        Defaults to ``0.01``. This variable identifies a proportion, so it's
        always between ``0`` and ``1``. Usually this value is small.
    alpha : float
        The ``alpha`` controls the total step update ratio (it's similar to
        the role of ``step`` in the weight update procedure). Defaults to
        ``0.001``. Typically this value is small.
    beta : float
        This is similar to ``alpha``, but it controls the ratio only for the
        norm of the update matrix. Defaults to ``20``.
        Typically this value is greater than ``1``.

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     addons=[algorithms.LeakStepAdaptation]
    ... )
    >>>

    References
    ----------
    .. [1] Noboru M. "Adaptive on-line learning in changing
           environments", 1997

    .. [2] LeCun, "Efficient BackProp", 1998
    """
    leak_size = ProperFractionProperty(default=0.01)
    alpha = BoundedProperty(default=0.001, minval=0)
    beta = BoundedProperty(default=20, minval=0)

    def init_variables(self):
        super(LeakStepAdaptation, self).init_variables()
        n_parameters = count_parameters(self)
        self.variables.leak_average = theano.shared(
            value=asfloat(np.zeros(n_parameters)),
            name='leak_average',
        )

    def init_train_updates(self):
        updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha = self.alpha
        beta = self.beta
        leak_size = self.leak_size

        step = self.variables.step
        leak_average = self.variables.leak_average

        parameters = list(iter_parameters(self))
        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        leak_average_update = ((1 - leak_size) * leak_average +
                               leak_size * full_gradient)
        new_step = step + alpha * step * (
            beta * leak_average_update.norm(L=2) - step)

        updates.extend([
            (leak_average, leak_average_update),
            (step, new_step),
        ])

        return updates
Example #30
class HessianDiagonal(NoMultipleStepSelection, GradientDescent):
    """ Hissian diagonal is a Hessian algorithm approximation which require
    only computation of hessian matrix diagonal elements and makes it
    invertion much easier and faster.

    Parameters
    ----------
    min_eigval : float
        Sets the minimum eigenvalue for the Hessian diagonal matrix. After a
        few iterations the elements become extremely small and the matrix
        inverse produces huge numbers in the Hessian diagonal elements. This
        parameter controls the size of the diagonal elements.
        Defaults to ``1e-2``.
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}

    Examples
    --------
    Simple example

    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> hdnet = algorithms.HessianDiagonal(
    ...     (2, 3, 1),
    ...     verbose=False
    ... )
    >>> hdnet.train(x_train, y_train)

    Diabetes dataset example

    >>> import numpy as np
    >>> from sklearn.cross_validation import train_test_split
    >>> from sklearn import datasets, preprocessing
    >>> from neupy import algorithms, layers, environment
    >>> from neupy.estimators import rmsle
    >>>
    >>> environment.reproducible()
    >>>
    >>> dataset = datasets.load_diabetes()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> input_scaler = preprocessing.StandardScaler()
    >>> target_scaler = preprocessing.StandardScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     input_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target),
    ...     train_size=0.8
    ... )
    >>>
    >>> nw = algorithms.HessianDiagonal(
    ...     connection=[
    ...         layers.Sigmoid(10),
    ...         layers.Sigmoid(20),
    ...         layers.Output(1)
    ...     ],
    ...     step=1.5,
    ...     shuffle_data=False,
    ...     verbose=False,
    ...     min_eigval=1e-10
    ... )
    >>> nw.train(x_train, y_train, epochs=10)
    >>> y_predict = nw.predict(x_test)
    >>>
    >>> error = rmsle(target_scaler.inverse_transform(y_test),
    ...               target_scaler.inverse_transform(y_predict).round())
    >>> error
    0.50315919814691346

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    :network:`Hessian` : Newton's method.
    """
    min_eigval = ProperFractionProperty(default=1e-2)

    def init_train_updates(self):
        step = self.variables.step
        min_eigval = self.min_eigval
        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        second_derivatives = []
        for parameter, gradient in zip(parameters, gradients):
            second_derivative = T.grad(gradient.sum(), wrt=parameter)
            second_derivatives.append(second_derivative.flatten())

        hessian_diag = T.concatenate(second_derivatives)
        hessian_diag = T.switch(
            T.abs_(hessian_diag) < min_eigval,
            T.switch(
                hessian_diag < 0,
                -min_eigval,
                min_eigval,
            ), hessian_diag)

        # Dividing the gradient by the Hessian diagonal elementwise is the
        # same as taking the diagonal Hessian inverse (which is the
        # reciprocal of each diagonal element) and multiplying it by the
        # gradient. This operation is less clear, but works faster.
        updated_parameters = (param_vector -
                              step * full_gradient / hessian_diag)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates
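Written out in NumPy, the update clips the Hessian diagonal away from zero (preserving its sign) and then divides the gradient by it elementwise, which is equivalent to multiplying by the inverse of the diagonal Hessian approximation. A hedged sketch with illustrative names:

import numpy as np

def hessian_diagonal_update(params, gradient, hessian_diag,
                            step=1.0, min_eigval=1e-2):
    """Hypothetical sketch of the Hessian-diagonal parameter update."""
    # Keep diagonal elements away from zero while preserving their sign,
    # so the elementwise division stays numerically stable.
    clipped = np.where(np.abs(hessian_diag) < min_eigval,
                       np.where(hessian_diag < 0, -min_eigval, min_eigval),
                       hessian_diag)
    # Dividing by the diagonal equals multiplying by the inverse of the
    # diagonal Hessian approximation.
    return params - step * gradient / clipped

params = np.array([0.5, -0.3, 0.1])
gradient = np.array([0.2, 0.05, -0.1])
hessian_diag = np.array([1.0, 1e-4, -1e-3])   # two nearly-zero elements
print(hessian_diagonal_update(params, gradient, hessian_diag))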