def test_elementwise_in_connections(self):
    input_layer = layers.Input(2)
    hidden_layer_1 = layers.Relu(1, weight=init.Constant(1), bias=init.Constant(0))
    hidden_layer_2 = layers.Relu(1, weight=init.Constant(2), bias=init.Constant(0))
    elem_layer = layers.Elementwise(merge_function=tf.add)

    connection = layers.join(input_layer, hidden_layer_1, elem_layer)
    connection = layers.join(input_layer, hidden_layer_2, elem_layer)
    connection.initialize()

    self.assertEqual(elem_layer.output_shape, (1,))

    test_input = asfloat(np.array([
        [0, 1],
        [-1, -1],
    ]))
    actual_output = self.eval(connection.output(test_input))
    expected_output = np.array([
        [3],
        [0],
    ])
    np.testing.assert_array_almost_equal(expected_output, actual_output)
def test_constant_initializer(self):
    const = init.Constant(value=0)
    np.testing.assert_array_almost_equal(
        const.sample(shape=(2, 3)), np.zeros((2, 3)))

    const = init.Constant(value=1.5)
    np.testing.assert_array_almost_equal(
        const.sample(shape=(2, 3)), np.ones((2, 3)) * 1.5)
def test_compilation_multiple_inputs(self):
    input_matrix = asfloat(np.ones((7, 10)))
    expected_output = np.ones((7, 5))

    network = layers.join(
        [[
            layers.Input(10),
        ], [
            layers.Input(10),
        ]],
        layers.Elementwise(),
        layers.Linear(5, weight=init.Constant(0.1), bias=None))

    # Generated input variables
    predict = network.compile()
    actual_output = predict(input_matrix * 0.7, input_matrix * 0.3)
    np.testing.assert_array_almost_equal(actual_output, expected_output)

    # Pre-defined input variables
    input_variable_1 = T.matrix('x1')
    input_variable_2 = T.matrix('x2')

    predict = network.compile(input_variable_1, input_variable_2)
    actual_output = predict(input_matrix * 0.7, input_matrix * 0.3)
    np.testing.assert_array_almost_equal(actual_output, expected_output)
class PRelu(ActivationLayer):
    """
    The layer with the parametrized ReLu activation function.

    Parameters
    ----------
    alpha_axes : int or tuple
        Axes along which the alpha parameter is not shared, meaning that
        each index along these axes gets its own trainable alpha value.
        A single integer value works the same as a tuple with one value.
        Defaults to ``1``.

    alpha : array-like, Theano shared variable, scalar or Initializer
        Alpha parameter per each non-shared axis for the ReLu.
        A scalar value means that each element in the tensor will be
        equal to the specified value.
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0.25)``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    References
    ----------
    .. [1] https://arxiv.org/pdf/1502.01852v1.pdf
    """
    alpha_axes = AxesProperty(default=1)
    alpha = ParameterProperty(default=init.Constant(value=0.25))

    def __init__(self, *args, **options):
        super(PRelu, self).__init__(*args, **options)

        if 0 in self.alpha_axes:
            raise ValueError("Cannot specify alpha for 0-axis")

    def validate(self, input_shape):
        if max(self.alpha_axes) > len(input_shape):
            max_axis_index = len(input_shape) - 1
            raise ValueError("Cannot specify alpha for the axis #{}. "
                             "Maximum available axis is #{} "
                             "(0-based indices)."
                             "".format(max(self.alpha_axes), max_axis_index))

    def initialize(self):
        super(PRelu, self).initialize()
        alpha_shape = [self.output_shape[axis - 1] for axis in self.alpha_axes]
        self.add_parameter(value=self.alpha, name='alpha',
                           shape=alpha_shape, trainable=True)

    def activation_function(self, input_value):
        alpha = dimshuffle(self.alpha, input_value.ndim, self.alpha_axes)
        return T.nnet.relu(input_value, alpha)
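# Usage sketch for the layer above (not taken from the test suite), assuming
# it is exposed as layers.PRelu. With the default alpha_axes=1 the layer
# keeps one trainable alpha per output unit, so PRelu(8) below holds 8 alpha
# values, each starting from the Constant(0.25) default. The shape follows
# from initialize(): alpha_shape collects output_shape[axis - 1] for every
# axis listed in alpha_axes.
from neupy import layers, init

network = layers.join(
    layers.Input(10),
    layers.PRelu(8, alpha=init.Constant(0.25)),
)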
def test_compilation_multiple_outputs(self):
    input_matrix = asfloat(np.ones((7, 10)))
    expected_output_1 = np.ones((7, 5))
    expected_output_2 = np.ones((7, 2))

    network = layers.join(
        layers.Input(10),
        [[layers.Linear(5, weight=init.Constant(0.1), bias=None)],
         [layers.Linear(2, weight=init.Constant(0.1), bias=None)]])

    predict = network.compile()
    actual_output_1, actual_output_2 = predict(input_matrix)

    np.testing.assert_array_almost_equal(actual_output_1, expected_output_1)
    np.testing.assert_array_almost_equal(actual_output_2, expected_output_2)
def __init__(self, n_units=None, alpha=0, weight=init.HeNormal(gain=2),
             bias=init.Constant(value=0), name=None):
    self.alpha = alpha
    super(Relu, self).__init__(
        n_units=n_units, weight=weight, bias=bias, name=name)
def create_variable(value, name, shape, trainable=True):
    """
    Creates NN parameter as a Tensorflow variable.

    Parameters
    ----------
    value : array-like, Tensorflow variable, scalar or Initializer
        Default value for the parameter.

    name : str
        Shared variable name.

    shape : tuple
        Parameter's shape.

    trainable : bool
        Whether the parameter can be trained with backpropagation.

    Returns
    -------
    Tensorflow variable.
    """
    from neupy import init

    if shape is not None:
        shape = shape_to_tuple(shape)

    if isinstance(value, (tf.Variable, tf.Tensor, np.ndarray, np.matrix)):
        variable_shape = shape_to_tuple(value.shape)

        if as_tuple(variable_shape) != as_tuple(shape):
            raise ValueError(
                "Cannot create variable with name `{}`. Provided variable "
                "with shape {} is incompatible with expected shape {}"
                "".format(name, variable_shape, shape))

    if isinstance(value, (tf.Variable, tf.Tensor)):
        return value

    if isinstance(value, (int, float)):
        value = init.Constant(value)

    if isinstance(value, init.Initializer):
        value = value.sample(shape)

    return tf.Variable(
        asfloat(value),
        name=name,
        dtype=tf.float32,
        trainable=trainable,
    )
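# Sketch of the dispatch implemented by create_variable above; the variable
# names are illustrative and the calls assume the TensorFlow 1.x style API
# used elsewhere in this code.
import numpy as np
from neupy import init

# Scalars are wrapped into init.Constant and sampled into an array of the
# requested shape before tf.Variable is created.
bias = create_variable(0.0, name='bias', shape=(5,))

# Initializer instances are sampled directly.
weight = create_variable(init.Constant(0.1), name='weight', shape=(10, 5))

# Arrays are validated against the expected shape first; a mismatch raises
# ValueError before any variable gets created.
weight_from_array = create_variable(np.ones((10, 5)), name='weight2', shape=(10, 5))

# Existing variables and tensors are returned as-is.
assert create_variable(weight, name='ignored', shape=(10, 5)) is weight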
def test_gru_modify_only_one_weight_parameter(self):
    gru_layer = layers.GRU(
        2, weights=dict(weight_in_to_updategate=init.Constant(0)))

    layers.join(
        layers.Input((5, 3)),
        gru_layer,
    )

    for key, value in gru_layer.weights.items():
        if key == 'weight_in_to_updategate':
            self.assertIsInstance(value, init.Constant)
        else:
            self.assertIsInstance(value, init.XavierUniform)
def test_linear_layer_without_bias(self):
    input_layer = layers.Input(10)
    output_layer = layers.Linear(2, weight=init.Constant(0.1), bias=None)
    connection = input_layer > output_layer

    self.assertEqual(output_layer.bias_shape, None)

    input_value = asfloat(np.ones((1, 10)))
    actual_output = self.eval(connection.output(input_value))
    expected_output = np.ones((1, 2))

    np.testing.assert_array_almost_equal(expected_output, actual_output)

    with self.assertRaises(TypeError):
        layers.Linear(2, weight=None)
def test_oja_minimization(self):
    ojanet = algorithms.Oja(
        minimized_data_size=1,
        step=0.01,
        weight=init.Constant(0.1),
        verbose=False)

    ojanet.train(self.data, epsilon=1e-5, epochs=100)
    minimized_data = ojanet.predict(self.data)
    np.testing.assert_array_almost_equal(
        minimized_data, self.result, decimal=2)

    reconstructed = ojanet.reconstruct(minimized_data)
    np.testing.assert_array_almost_equal(
        reconstructed, self.data, decimal=3)
def test_simple_connection_compilation(self):
    input_matrix = asfloat(np.ones((7, 10)))
    expected_output = np.ones((7, 5))

    network = layers.join(
        layers.Input(10),
        layers.Linear(5, weight=init.Constant(0.1), bias=None))

    # Generated input variables
    predict = network.compile()
    actual_output = predict(input_matrix)
    np.testing.assert_array_almost_equal(actual_output, expected_output)

    # Pre-defined input variables
    input_variable = T.matrix('x')
    predict = network.compile(input_variable)
    actual_output = predict(input_matrix)
    np.testing.assert_array_almost_equal(actual_output, expected_output)
def test_layer_definitions(self):
    Conv = layers.Convolution.define(
        padding='SAME',
        weight=init.Constant(1),
        bias=None,
    )
    network = layers.join(
        layers.Input((28, 28, 1)),
        Conv((3, 3, 16)),
        Conv((3, 3, 32)),
    )
    network.create_variables()

    self.assertShapesEqual(network.output_shape, (None, 28, 28, 32))

    weight_1 = self.eval(network.layers[1].weight)
    self.assertEqual(weight_1.sum(), 1 * 3 * 3 * 16)
    self.assertIsNone(network.layers[1].bias)

    weight_2 = self.eval(network.layers[2].weight)
    self.assertEqual(weight_2.sum(), 16 * 3 * 3 * 32)
    self.assertIsNone(network.layers[2].bias)
class ParameterBasedLayer(BaseLayer):
    """
    Layer that creates weight and bias parameters.

    Parameters
    ----------
    size : int
        Layer's output size.

    weight : array-like, Theano variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`XavierNormal() <neupy.init.XavierNormal>`.

    bias : 1D array-like, Theano variable, scalar, Initializer or None
        Defines layer's bias. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.
        The ``None`` value excludes bias from the calculations and
        doesn't add it to the list of parameters.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.XavierNormal())
    bias = ParameterProperty(default=init.Constant(value=0), allow_none=True)

    def __init__(self, size, **options):
        super(ParameterBasedLayer, self).__init__(size=size, **options)

    @property
    def weight_shape(self):
        return as_tuple(self.input_shape, self.output_shape)

    @property
    def bias_shape(self):
        if self.bias is not None:
            return as_tuple(self.output_shape)

    def initialize(self):
        super(ParameterBasedLayer, self).initialize()
        self.add_parameter(value=self.weight, name='weight',
                           shape=self.weight_shape, trainable=True)

        if self.bias is not None:
            self.add_parameter(value=self.bias, name='bias',
                               shape=self.bias_shape, trainable=True)

    def __repr__(self):
        classname = self.__class__.__name__
        return '{name}({size})'.format(name=classname, size=self.size)
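# Shape sketch for the properties above, using the dense Linear layer that
# appears in the tests in this file; the asserted values follow directly
# from weight_shape/bias_shape rather than from a particular test case.
from neupy import layers, init

input_layer = layers.Input(10)
hidden_layer = layers.Linear(5, weight=init.XavierNormal(), bias=init.Constant(0))
connection = layers.join(input_layer, hidden_layer)
connection.initialize()

# weight_shape = as_tuple(input_shape, output_shape) -> (10, 5)
# bias_shape   = as_tuple(output_shape)              -> (5,)
assert hidden_layer.weight_shape == (10, 5)
assert hidden_layer.bias_shape == (5,)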
class GRU(BaseRNNLayer): """ Gated Recurrent Unit (GRU) Layer. Parameters ---------- {BaseRNNLayer.size} weights : dict or Initializer Weight parameters for different gates. Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`. - In case if application requires the same initialization method for all weights, then it's possible to specify initialization method that would be automaticaly applied to all weight parameters in the GRU layer. .. code-block:: python layers.GRU(2, weights=init.Normal(0.1)) - In case if application requires different initialization values for different weights then it's possible to specify an exact weight by name. .. code-block:: python dict( weight_in_to_updategate=init.XavierUniform(), weight_hid_to_updategate=init.XavierUniform(), weight_in_to_resetgate=init.XavierUniform(), weight_hid_to_resetgate=init.XavierUniform(), weight_in_to_hidden_update=init.XavierUniform(), weight_hid_to_hidden_update=init.XavierUniform(), ) If application requires modification to only one (or multiple) parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(weight_in_to_updategate=init.Normal(0.1)) Other parameters like ``weight_in_to_resetgate`` will be equal to their default values. biases : dict or Initializer Bias parameters for different gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. - In case if application requires the same initialization method for all biases, then it's possible to specify initialization method that would be automaticaly applied to all bias parameters in the GRU layer. .. code-block:: python layers.GRU(2, biases=init.Constant(1)) - In case if application requires different initialization values for different weights then it's possible to specify an exact weight by name. .. code-block:: python dict( bias_updategate=init.Constant(0), bias_resetgate=init.Constant(0), bias_hidden_update=init.Constant(0), ) If application requires modification to only one (or multiple) parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(bias_resetgate=init.Constant(1)) Other parameters like ``bias_updategate`` will be equal to their default values. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import theano.tensor as T dict( resetgate=T.nnet.sigmoid, updategate=T.nnet.sigmoid, hidden_update=T.tanh, ) If application requires modification to only one parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(resetgate=T.tanh) Other parameters like ``updategate`` or ``hidden_update`` will be equal to their default values. learn_init : bool If ``True``, make ``hid_init`` trainable variable. Defaults to ``False``. hid_init : array-like, Theano variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. {BaseRNNLayer.only_return_final} backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False``. precompute_input : bool if ``True``, precompute ``input_to_hid`` before iterating through the sequence. This can result in a speed up at the expense of an increase in memory usage. Defaults to ``True``. 
unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.GRU(20), layers.Sigmoid(1), ] ) """ weights = MultiParameterProperty( default=dict( weight_in_to_updategate=init.XavierUniform(), weight_hid_to_updategate=init.XavierUniform(), weight_in_to_resetgate=init.XavierUniform(), weight_hid_to_resetgate=init.XavierUniform(), weight_in_to_hidden_update=init.XavierUniform(), weight_hid_to_hidden_update=init.XavierUniform(), )) biases = MultiParameterProperty( default=dict( bias_updategate=init.Constant(0), bias_resetgate=init.Constant(0), bias_hidden_update=init.Constant(0), )) activation_functions = MultiCallableProperty( default=dict( resetgate=T.nnet.sigmoid, updategate=T.nnet.sigmoid, hidden_update=T.tanh, )) learn_init = Property(default=False, expected_type=bool) hid_init = ParameterProperty(default=init.Constant(0)) backwards = Property(default=False, expected_type=bool) unroll_scan = Property(default=False, expected_type=bool) precompute_input = Property(default=True, expected_type=bool) n_gradient_steps = IntProperty(default=-1) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(GRU, self).initialize() n_inputs = np.prod(self.input_shape[1:]) weights = self.weights biases = self.biases # Update gate parameters self.weight_in_to_updategate = self.add_parameter( value=weights.weight_in_to_updategate, name='weight_in_to_updategate', shape=(n_inputs, self.size)) self.weight_hid_to_updategate = self.add_parameter( value=weights.weight_hid_to_updategate, name='weight_hid_to_updategate', shape=(self.size, self.size)) self.bias_updategate = self.add_parameter( value=biases.bias_updategate, name='bias_updategate', shape=(self.size,)) # Reset gate parameters self.weight_in_to_resetgate = self.add_parameter( value=weights.weight_in_to_resetgate, name='weight_in_to_resetgate', shape=(n_inputs, self.size)) self.weight_hid_to_resetgate = self.add_parameter( value=weights.weight_hid_to_resetgate, name='weight_hid_to_resetgate', shape=(self.size, self.size)) self.bias_resetgate = self.add_parameter( value=biases.bias_resetgate, name='bias_forgetgate', shape=(self.size,)) # Hidden update gate parameters self.weight_in_to_hidden_update = self.add_parameter( value=weights.weight_in_to_hidden_update, name='weight_in_to_hidden_update', shape=(n_inputs, self.size)) self.weight_hid_to_hidden_update = self.add_parameter( value=weights.weight_hid_to_hidden_update, name='weight_hid_to_hidden_update', shape=(self.size, self.size)) self.bias_hidden_update = self.add_parameter( value=biases.bias_hidden_update, name='bias_hidden_update', shape=(self.size,)) self.add_parameter(value=self.hid_init, shape=(1, self.size), name="hid_init", trainable=self.learn_init) def output(self, input_value): # Treat all dimensions after the second as flattened # 
feature dimensions if input_value.ndim > 3: input_value = T.flatten(input_value, 3) # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = input_value.dimshuffle(1, 0, 2) seq_len, n_batch, _ = input_value.shape # Stack input weight matrices into a (num_inputs, 3 * num_units) # matrix, which speeds up computation weight_in_stacked = T.concatenate([ self.weight_in_to_updategate, self.weight_in_to_resetgate, self.weight_in_to_hidden_update], axis=1) # Same for hidden weight matrices weight_hid_stacked = T.concatenate([ self.weight_hid_to_updategate, self.weight_hid_to_resetgate, self.weight_hid_to_hidden_update], axis=1) # Stack biases into a (3 * num_units) vector bias_stacked = T.concatenate([ self.bias_updategate, self.bias_resetgate, self.bias_hidden_update], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # weight_in_stacked is (n_features, 3 * num_units). # Input: (n_time_steps, n_batch, 3 * num_units). input_value = T.dot(input_value, weight_in_stacked) + bias_stacked # When theano.scan calls step, input_n will be # (n_batch, 3 * num_units). We define a slicing function # that extract the input to each GRU gate def slice_w(x, n): s = x[:, n * self.size:(n + 1) * self.size] if self.size == 1: s = T.addbroadcast(s, 1) # Theano cannot infer this by itself return s # Create single recurrent computation step function # input_n is the n'th vector of the input def one_gru_step(input_n, hid_previous, *args): # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, # and W_{hc} h_{t - 1} hid_input = T.dot(hid_previous, weight_hid_stacked) if self.gradient_clipping: input_n = theano.gradient.grad_clip( input_n, -self.gradient_clipping, self.gradient_clipping) hid_input = theano.gradient.grad_clip( hid_input, -self.gradient_clipping, self.gradient_clipping) if not self.precompute_input: # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, # and W_{xc}x_t + b_c input_n = T.dot(input_n, weight_in_stacked) + bias_stacked # Reset and update gates resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) resetgate = self.activation_functions.resetgate(resetgate) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) updategate = self.activation_functions.updategate(updategate) # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) hidden_update_in = slice_w(input_n, 2) hidden_update_hid = slice_w(hid_input, 2) hidden_update = hidden_update_in + resetgate * hidden_update_hid if self.gradient_clipping: hidden_update = theano.gradient.grad_clip( hidden_update, -self.gradient_clipping, self.gradient_clipping) hidden_update = self.activation_functions.hidden_update( hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t hid = (1 - updategate) * hid_previous + updategate * hidden_update return hid hid_init = T.dot(T.ones((n_batch, 1)), self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_sequences = [weight_hid_stacked] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_sequences += [weight_in_stacked, bias_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer n_time_steps = self.input_shape[0] # Explicitly unroll the recurrence instead of using scan hid_out, = unroll_scan( fn=one_gru_step, sequences=[input_value], outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_sequences, 
n_steps=n_time_steps) else: # Scan op iterates over first dimension of input and # repeatedly applies the step function hid_out, _ = theano.scan( fn=one_gru_step, sequences=[input_value], outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_sequences, truncate_gradient=self.n_gradient_steps, strict=True) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
class LSTM(BaseRNNLayer): """ Long Short Term Memory (LSTM) Layer. Parameters ---------- {BaseRNNLayer.size} weights : dict or Initializer Weight parameters for different gates. Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`. - In case if application requires the same initialization method for all weights, then it's possible to specify initialization method that would be automaticaly applied to all weight parameters in the LSTM layer. .. code-block:: python layers.LSTM(2, weights=init.Normal(0.1)) - In case if application requires different initialization values for different weights then it's possible to specify an exact weight by name. .. code-block:: python dict( weight_in_to_ingate=init.XavierUniform(), weight_hid_to_ingate=init.XavierUniform(), weight_cell_to_ingate=init.XavierUniform(), weight_in_to_forgetgate=init.XavierUniform(), weight_hid_to_forgetgate=init.XavierUniform(), weight_cell_to_forgetgate=init.XavierUniform(), weight_in_to_outgate=init.XavierUniform(), weight_hid_to_outgate=init.XavierUniform(), weight_cell_to_outgate=init.XavierUniform(), weight_in_to_cell=init.XavierUniform(), weight_hid_to_cell=init.XavierUniform(), ) If application requires modification to only one (or multiple) parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(weight_in_to_ingate=init.Normal(0.1)) Other parameters like ``weight_cell_to_outgate`` will be equal to their default values. biases : dict or Initializer Bias parameters for different gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. - In case if application requires the same initialization method for all biases, then it's possible to specify initialization method that would be automaticaly applied to all bias parameters in the LSTM layer. .. code-block:: python layers.LSTM(2, biases=init.Constant(1)) - In case if application requires different initialization values for different weights then it's possible to specify an exact weight by name. .. code-block:: python dict( bias_ingate=init.Constant(0), bias_forgetgate=init.Constant(0), bias_cell=init.Constant(0), bias_outgate=init.Constant(0), ) If application requires modification to only one (or multiple) parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(bias_ingate=init.Constant(1)) Other parameters like ``bias_cell`` will be equal to their default values. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import theano.tensor as T dict( ingate=T.nnet.sigmoid, forgetgate=T.nnet.sigmoid, outgate=T.nnet.sigmoid, cell=T.tanh, ) If application requires modification to only one parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(ingate=T.tanh) Other parameters like ``forgetgate`` or ``outgate`` will be equal to their default values. learn_init : bool If ``True``, make ``cell_init`` and ``hid_init`` trainable variables. Defaults to ``False``. cell_init : array-like, Theano variable, scalar or Initializer Initializer for initial cell state (:math:`c_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. hid_init : array-like, Theano variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. 
backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False`` {BaseRNNLayer.only_return_final} precompute_input : bool if ``True``, precompute ``input_to_hid`` before iterating through the sequence. This can result in a speed up at the expense of an increase in memory usage. Defaults to ``True``. peepholes : bool If ``True``, the LSTM uses peephole connections. When ``False``, cell parameters are ignored. Defaults to ``False``. unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. gradient_clipping : flaot or int If nonzero, the gradient messages are clipped to the given value during the backward pass. Defaults to ``0``. n_gradient_steps : int Number of timesteps to include in the backpropagated gradient. If ``-1``, backpropagate through the entire sequence. Defaults to ``-1``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.LSTM(20), layers.Sigmoid(1), ] ) """ weights = MultiParameterProperty( default=dict( weight_in_to_ingate=init.XavierUniform(), weight_hid_to_ingate=init.XavierUniform(), weight_cell_to_ingate=init.XavierUniform(), weight_in_to_forgetgate=init.XavierUniform(), weight_hid_to_forgetgate=init.XavierUniform(), weight_cell_to_forgetgate=init.XavierUniform(), weight_in_to_outgate=init.XavierUniform(), weight_hid_to_outgate=init.XavierUniform(), weight_cell_to_outgate=init.XavierUniform(), weight_in_to_cell=init.XavierUniform(), weight_hid_to_cell=init.XavierUniform(), )) biases = MultiParameterProperty( default=dict( bias_ingate=init.Constant(0), bias_forgetgate=init.Constant(0), bias_cell=init.Constant(0), bias_outgate=init.Constant(0), )) activation_functions = MultiCallableProperty( default=dict( ingate=T.nnet.sigmoid, forgetgate=T.nnet.sigmoid, outgate=T.nnet.sigmoid, cell=T.tanh, )) learn_init = Property(default=False, expected_type=bool) cell_init = ParameterProperty(default=init.Constant(0)) hid_init = ParameterProperty(default=init.Constant(0)) unroll_scan = Property(default=False, expected_type=bool) backwards = Property(default=False, expected_type=bool) precompute_input = Property(default=True, expected_type=bool) peepholes = Property(default=False, expected_type=bool) n_gradient_steps = IntProperty(default=-1) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(LSTM, self).initialize() n_inputs = np.prod(self.input_shape[1:]) weights = self.weights biases = self.biases # Input gate parameters self.weight_in_to_ingate = self.add_parameter( value=weights.weight_in_to_ingate, name='weight_in_to_ingate', shape=(n_inputs, self.size)) self.weight_hid_to_ingate = self.add_parameter( value=weights.weight_hid_to_ingate, name='weight_hid_to_ingate', shape=(self.size, self.size)) self.bias_ingate = self.add_parameter( 
value=biases.bias_ingate, name='bias_ingate', shape=(self.size,)) # Forget gate parameters self.weight_in_to_forgetgate = self.add_parameter( value=weights.weight_in_to_forgetgate, name='weight_in_to_forgetgate', shape=(n_inputs, self.size)) self.weight_hid_to_forgetgate = self.add_parameter( value=weights.weight_hid_to_forgetgate, name='weight_hid_to_forgetgate', shape=(self.size, self.size)) self.bias_forgetgate = self.add_parameter( value=biases.bias_forgetgate, name='bias_forgetgate', shape=(self.size,)) # Cell parameters self.weight_in_to_cell = self.add_parameter( value=weights.weight_in_to_cell, name='weight_in_to_cell', shape=(n_inputs, self.size)) self.weight_hid_to_cell = self.add_parameter( value=weights.weight_hid_to_cell, name='weight_hid_to_cell', shape=(self.size, self.size)) self.bias_cell = self.add_parameter( value=biases.bias_cell, name='bias_cell', shape=(self.size,)) # If peephole (cell to gate) connections were enabled, initialize # peephole connections. These are elementwise products with the cell # state, so they are represented as vectors. if self.peepholes: self.weight_cell_to_ingate = self.add_parameter( value=weights.weight_cell_to_ingate, name='weight_cell_to_ingate', shape=(self.size,)) self.weight_cell_to_forgetgate = self.add_parameter( value=weights.weight_cell_to_forgetgate, name='weight_cell_to_forgetgate', shape=(self.size,)) self.weight_cell_to_outgate = self.add_parameter( value=weights.weight_cell_to_outgate, name='weight_cell_to_outgate', shape=(self.size,)) # Output gate parameters self.weight_in_to_outgate = self.add_parameter( value=weights.weight_in_to_outgate, name='weight_in_to_outgate', shape=(n_inputs, self.size)) self.weight_hid_to_outgate = self.add_parameter( value=weights.weight_hid_to_outgate, name='weight_hid_to_outgate', shape=(self.size, self.size)) self.bias_outgate = self.add_parameter( value=biases.bias_outgate, name='bias_outgate', shape=(self.size,)) # Initialization parameters self.add_parameter(value=self.cell_init, shape=(1, self.size), name="cell_init", trainable=self.learn_init) self.add_parameter(value=self.hid_init, shape=(1, self.size), name="hid_init", trainable=self.learn_init) def output(self, input_value): # Treat all dimensions after the second as flattened # feature dimensions if input_value.ndim > 3: input_value = T.flatten(input_value, 3) # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = input_value.dimshuffle(1, 0, 2) seq_len, n_batch, _ = input_value.shape # Stack input weight matrices into a (num_inputs, 4 * num_units) # matrix, which speeds up computation weight_in_stacked = T.concatenate([ self.weight_in_to_ingate, self.weight_in_to_forgetgate, self.weight_in_to_cell, self.weight_in_to_outgate], axis=1) # Same for hidden weight matrices weight_hid_stacked = T.concatenate([ self.weight_hid_to_ingate, self.weight_hid_to_forgetgate, self.weight_hid_to_cell, self.weight_hid_to_outgate], axis=1) # Stack biases into a (4 * num_units) vector bias_stacked = T.concatenate([ self.bias_ingate, self.bias_forgetgate, self.bias_cell, self.bias_outgate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # weight_in_stacked is (n_features, 4 * num_units). # Input: (n_time_steps, n_batch, 4 * num_units). input_value = T.dot(input_value, weight_in_stacked) + bias_stacked # When theano.scan calls step, input_n will be # (n_batch, 4 * num_units). 
We define a slicing function # that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.size:(n + 1) * self.size] def one_lstm_step(input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.dot(input_n, weight_in_stacked) + bias_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, weight_hid_stacked) # Clip gradients if self.gradient_clipping: gates = theano.gradient.grad_clip( gates, -self.gradient_clipping, self.gradient_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.weight_cell_to_ingate forgetgate += cell_previous * self.weight_cell_to_forgetgate # Apply nonlinearities ingate = self.activation_functions.ingate(ingate) forgetgate = self.activation_functions.forgetgate(forgetgate) cell_input = self.activation_functions.cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.weight_cell_to_outgate outgate = self.activation_functions.outgate(outgate) # Compute new hidden unit activation hid = outgate * T.tanh(cell) return [cell, hid] ones = T.ones((n_batch, 1)) cell_init = T.dot(ones, self.cell_init) hid_init = T.dot(ones, self.hid_init) non_sequences = [weight_hid_stacked] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_sequences += [weight_in_stacked, bias_stacked] # The "peephole" weight matrices are only used # when self.peepholes=True if self.peepholes: non_sequences += [self.weight_cell_to_ingate, self.weight_cell_to_forgetgate, self.weight_cell_to_outgate] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer n_time_steps = self.input_shape[0] # Explicitly unroll the recurrence instead of using scan _, hid_out = unroll_scan( fn=one_lstm_step, sequences=[input_value], outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_sequences, n_steps=n_time_steps) else: (_, hid_out), _ = theano.scan( fn=one_lstm_step, sequences=input_value, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.n_gradient_steps, non_sequences=non_sequences, strict=True) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
class GRU(BaseRNNLayer): """ Gated Recurrent Unit (GRU) Layer. Parameters ---------- {BaseRNNLayer.size} input_weights : Initializer, ndarray Weight parameters for input connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. hidden_weights : Initializer, ndarray Weight parameters for hidden connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. bias : Initializer, ndarray Bias parameters for all gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import tensorflow as tf dict( resetgate=tf.nn.sigmoid, updategate=tf.nn.sigmoid, hidden_update=tf.tanh, ) If application requires modification to only one parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(resetgate=tf.tanh) Other parameters like ``updategate`` or ``hidden_update`` will be equal to their default values. learn_init : bool If ``True``, make ``hidden_init`` trainable variable. Defaults to ``False``. hidden_init : array-like, Tensorfow variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. {BaseRNNLayer.only_return_final} backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False``. unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. 
code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.GRU(20), layers.Sigmoid(1), ] ) """ input_weights = ParameterProperty(default=init.HeNormal()) hidden_weights = ParameterProperty(default=init.HeNormal()) biases = ParameterProperty(default=init.Constant(0)) activation_functions = MultiCallableProperty(default=dict( resetgate=tf.nn.sigmoid, updategate=tf.nn.sigmoid, hidden_update=tf.tanh, )) learn_init = Property(default=False, expected_type=bool) hidden_init = ParameterProperty(default=init.Constant(0)) backwards = Property(default=False, expected_type=bool) unroll_scan = Property(default=False, expected_type=bool) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(GRU, self).initialize() n_inputs = np.prod(self.input_shape[1:]) self.input_weights = self.add_parameter( value=self.input_weights, name='input_weights', shape=(n_inputs, 3 * self.size), ) self.hidden_weights = self.add_parameter( value=self.hidden_weights, name='hidden_weights', shape=(self.size, 3 * self.size), ) self.biases = self.add_parameter( value=self.biases, name='biases', shape=(3 * self.size, ), ) self.add_parameter(value=self.hidden_init, shape=(1, self.size), name="hidden_init", trainable=self.learn_init) def output(self, input_value): # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = tf.transpose(input_value, [1, 0, 2]) input_shape = tf.shape(input_value) n_batch = input_shape[1] # Create single recurrent computation step function # input_n is the n'th vector of the input def one_gru_step(states, input_n): with tf.name_scope('gru-cell'): hid_previous, = states input_n = tf.matmul(input_n, self.input_weights) + self.biases # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, # and W_{hc} h_{t - 1} hid_input = tf.matmul(hid_previous, self.hidden_weights) if self.gradient_clipping != 0: input_n = clip_gradient(input_n, self.gradient_clipping) hid_input = clip_gradient(hid_input, self.gradient_clipping) hid_resetgate, hid_updategate, hid_hidden = tf.split(hid_input, 3, axis=1) in_resetgate, in_updategate, in_hidden = tf.split(input_n, 3, axis=1) # Reset and update gates resetgate = self.activation_functions.resetgate(hid_resetgate + in_resetgate) updategate = self.activation_functions.updategate( hid_updategate + in_updategate) # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) hidden_update = in_hidden + resetgate * hid_hidden if self.gradient_clipping != 0: hidden_update = clip_gradient(hidden_update, self.gradient_clipping) hidden_update = self.activation_functions.hidden_update( hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t return [ hid_previous - updategate * (hid_previous - hidden_update) ] hidden_init = tf.tile(self.hidden_init, (n_batch, 1)) sequence = input_value if self.backwards: sequence = tf.reverse(sequence, axis=[0]) if self.unroll_scan: # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan(fn=one_gru_step, sequence=sequence, outputs_info=[hidden_init]) else: hid_out, = tf.scan( fn=one_gru_step, elems=input_value, initializer=[hidden_init], name='gru-scan', ) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # if scan is backward reverse the output if self.backwards: hid_out = 
tf.reverse(hid_out, axis=[0]) # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = tf.transpose(hid_out, [1, 0, 2]) return hid_out
class LSTM(BaseRNNLayer): """ Long Short Term Memory (LSTM) Layer. Parameters ---------- {BaseRNNLayer.size} input_weights : Initializer, ndarray Weight parameters for input connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. hidden_weights : Initializer, ndarray Weight parameters for hidden connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. cell_weights : Initializer, ndarray Weight parameters for cell connection. Require only when ``peepholes=True`` otherwise it will be ignored. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. bias : Initializer, ndarray Bias parameters for all gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import tensorflow as tf dict( ingate=tf.nn.sigmoid, forgetgate=tf.nn.sigmoid, outgate=tf.nn.sigmoid, cell=tf.tanh, ) If application requires modification to only one parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(ingate=tf.tanh) Other parameters like ``forgetgate`` or ``outgate`` will be equal to their default values. learn_init : bool If ``True``, make ``cell_init`` and ``hidden_init`` trainable variables. Defaults to ``False``. cell_init : array-like, Tensorfow variable, scalar or Initializer Initializer for initial cell state (:math:`c_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. hidden_init : array-like, Tensorfow variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False`` {BaseRNNLayer.only_return_final} peepholes : bool If ``True``, the LSTM uses peephole connections. When ``False``, cell parameters are ignored. Defaults to ``False``. unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. gradient_clipping : float or int If nonzero, the gradient messages are clipped to the given value during the backward pass. Defaults to ``0``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. 
code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.LSTM(20), layers.Sigmoid(1), ] ) """ input_weights = ParameterProperty(default=init.HeNormal()) hidden_weights = ParameterProperty(default=init.HeNormal()) cell_weights = ParameterProperty(default=init.HeNormal()) biases = ParameterProperty(default=init.Constant(0)) activation_functions = MultiCallableProperty(default=dict( ingate=tf.nn.sigmoid, forgetgate=tf.nn.sigmoid, outgate=tf.nn.sigmoid, cell=tf.tanh, )) learn_init = Property(default=False, expected_type=bool) cell_init = ParameterProperty(default=init.Constant(0)) hidden_init = ParameterProperty(default=init.Constant(0)) unroll_scan = Property(default=False, expected_type=bool) backwards = Property(default=False, expected_type=bool) peepholes = Property(default=False, expected_type=bool) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(LSTM, self).initialize() n_inputs = np.prod(self.input_shape[1:]) # If peephole (cell to gate) connections were enabled, initialize # peephole connections. These are elementwise products with the cell # state, so they are represented as vectors. if self.peepholes: self.weight_cell_to_ingate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_ingate', shape=(self.size, )) self.weight_cell_to_forgetgate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_forgetgate', shape=(self.size, )) self.weight_cell_to_outgate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_outgate', shape=(self.size, )) self.input_weights = self.add_parameter( value=self.input_weights, name='input_weights', shape=(n_inputs, 4 * self.size), ) self.hidden_weights = self.add_parameter( value=self.hidden_weights, name='hidden_weights', shape=(self.size, 4 * self.size), ) self.biases = self.add_parameter( value=self.biases, name='biases', shape=(4 * self.size, ), ) # Initialization parameters self.add_parameter( value=self.cell_init, shape=(1, self.size), name="cell_init", trainable=self.learn_init, ) self.add_parameter( value=self.hidden_init, shape=(1, self.size), name="hidden_init", trainable=self.learn_init, ) def output(self, input_value): # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = tf.transpose(input_value, [1, 0, 2]) input_shape = tf.shape(input_value) n_batch = input_shape[1] def one_lstm_step(states, input_n): with tf.name_scope('lstm-cell'): cell_previous, hid_previous = states input_n = tf.matmul(input_n, self.input_weights) + self.biases # Calculate gates pre-activations and slice gates = input_n + tf.matmul(hid_previous, self.hidden_weights) # Clip gradients if self.gradient_clipping != 0: gates = clip_gradient(gates, self.gradient_clipping) # Extract the pre-activation gate values ingate, forgetgate, cell_input, outgate = tf.split(gates, 4, axis=1) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.weight_cell_to_ingate forgetgate += (cell_previous * self.weight_cell_to_forgetgate) # Apply nonlinearities ingate = self.activation_functions.ingate(ingate) forgetgate = self.activation_functions.forgetgate(forgetgate) cell_input = self.activation_functions.cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.weight_cell_to_outgate 
outgate = self.activation_functions.outgate(outgate) # Compute new hidden unit activation hid = outgate * tf.tanh(cell) return [cell, hid] cell_init = tf.tile(self.cell_init, (n_batch, 1)) hidden_init = tf.tile(self.hidden_init, (n_batch, 1)) sequence = input_value if self.backwards: sequence = tf.reverse(sequence, axis=[0]) if self.unroll_scan: # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan( fn=one_lstm_step, sequence=sequence, outputs_info=[cell_init, hidden_init], ) else: _, hid_out = tf.scan( fn=one_lstm_step, elems=input_value, initializer=[cell_init, hidden_init], name='lstm-scan', ) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # if scan is backward reverse the output if self.backwards: hid_out = tf.reverse(hid_out, axis=[0]) # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = tf.transpose(hid_out, [1, 0, 2]) return hid_out
class BaseStepAssociative(BaseAssociative):
    """
    Base class for associative algorithms that have two layers,
    where the first one uses a step function as activation.

    Parameters
    ----------
    {BaseAssociative.n_inputs}

    {BaseAssociative.n_outputs}

    n_unconditioned : int
        Number of unconditioned units in the neural network. These units
        won't be updated during the training procedure. Unconditioned
        units should correspond to the first features in the dataset.

    weight : array-like
        Neural network weights. A manually defined value should have
        shape ``(n_inputs, n_outputs)``. Defaults to ``None``, which
        means that all unconditional weights will be equal to ``1``
        and all other weights equal to ``0``.

    bias : array-like, Initializer
        Neural network bias units.
        Defaults to :class:`Constant(-0.5) <neupy.init.Constant>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    {BaseAssociative.Methods}
    """
    n_inputs = IntProperty(minval=2, required=True)
    n_unconditioned = IntProperty(minval=1, required=True)

    weight = ArrayProperty()
    bias = ParameterProperty(default=init.Constant(-0.5))

    def init_weights(self):
        if self.n_inputs <= self.n_unconditioned:
            raise ValueError(
                "Number of unconditioned features should be less than the "
                "total number of features. `n_inputs`={} and "
                "`n_unconditioned`={}"
                "".format(self.n_inputs, self.n_unconditioned))

        valid_weight_shape = (self.n_inputs, self.n_outputs)
        valid_bias_shape = (self.n_outputs,)

        if self.weight is None:
            self.weight = np.zeros(valid_weight_shape)
            self.weight[:self.n_unconditioned, :] = 1

        if isinstance(self.bias, init.Initializer):
            self.bias = self.bias.sample(valid_bias_shape, return_array=True)

        super(BaseStepAssociative, self).init_weights()

        if self.bias.shape != valid_bias_shape:
            raise ValueError(
                "Bias vector has invalid shape. Got {}, expected {}"
                "".format(self.bias.shape, valid_bias_shape))

        self.bias = self.bias.astype(float)

    def predict(self, X):
        X = format_data(X, is_feature1d=False)
        raw_output = X.dot(self.weight) + self.bias
        return np.where(raw_output > 0, 1, 0)

    def train(self, X_train, *args, **kwargs):
        X_train = format_data(X_train, is_feature1d=False)
        return super(BaseStepAssociative, self).train(
            X_train, *args, **kwargs)

    def one_training_update(self, X_train, y_train):
        weight = self.weight
        n_unconditioned = self.n_unconditioned
        predict = self.predict
        weight_delta = self.weight_delta
        error = 0

        for x_row in X_train:
            x_row = np.expand_dims(x_row, axis=0)
            layer_output = predict(x_row)

            delta = weight_delta(x_row, layer_output)
            weight[n_unconditioned:, :] += delta

            # This error can tell us whether the network has converged to
            # some set of weights. A low error means that the weights
            # haven't been updated much during the training epoch.
            error += np.linalg.norm(delta)

        return error
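# The prediction rule above is plain NumPy and can be checked in isolation:
# the output is a step function over X.dot(weight) + bias. The weights and
# bias below are made-up illustrative values, not library defaults.
import numpy as np

weight = np.array([
    [1.0],   # unconditioned feature, kept fixed during training
    [0.2],   # conditioned feature, adjusted by weight_delta updates
])
bias = np.array([-0.5])

X = np.array([
    [1, 0],   # only the unconditioned stimulus is active
    [0, 1],   # only the conditioned stimulus is active
])

raw_output = X.dot(weight) + bias             # [[0.5], [-0.3]]
prediction = np.where(raw_output > 0, 1, 0)   # [[1], [0]]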
def test_constant_initialize_repr(self):
    const_initializer = init.Constant(value=3)
    self.assertEqual("Constant(3)", str(const_initializer))
def __set__(self, instance, value):
    if isinstance(value, number_type):
        value = init.Constant(value)
    super(ParameterProperty, self).__set__(instance, value)
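# Effect of the __set__ override above: any layer option declared as a
# ParameterProperty accepts a plain number and silently wraps it into
# init.Constant. A minimal sketch, assuming Linear exposes its bias through
# ParameterProperty as in ParameterBasedLayer.
from neupy import layers, init

layer = layers.Linear(5, bias=0.5)
assert isinstance(layer.bias, init.Constant)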
class BaseStepAssociative(BaseAssociative): """ Base class for associative algorithms which have 2 layers and first one is has step function as activation. Parameters ---------- {BaseAssociative.n_inputs} {BaseAssociative.n_outputs} n_unconditioned : int Number of unconditioned units in neraul networks. All these units wouldn't update during the training procedure. Unconditioned should be the first feature in the dataset. weight : array-like Neural network weights. Value defined manualy should have shape ``(n_inputs, n_outputs)``. Defaults to ``None`` which means that all unconditional weights will be equal to ``1``. Other weights equal to ``0``. bias : array-like, Initializer Neural network bias units. Defaults to :class:`Constant(-0.5) <neupy.init.Constant>`. {BaseNetwork.step} {BaseNetwork.show_epoch} {BaseNetwork.shuffle_data} {BaseNetwork.epoch_end_signal} {BaseNetwork.train_end_signal} {Verbose.verbose} Methods ------- {BaseSkeleton.predict} {BaseAssociative.train} {BaseSkeleton.fit} """ n_inputs = IntProperty(minval=2, required=True) n_unconditioned = IntProperty(minval=1, required=True) weight = ArrayProperty() bias = ParameterProperty(default=init.Constant(-0.5)) def init_layers(self): if self.n_inputs <= self.n_unconditioned: raise ValueError( "Number of uncondition features should be less than total " "number of features. `n_inputs`={} and " "`n_unconditioned`={}".format(self.n_inputs, self.n_unconditioned)) valid_weight_shape = (self.n_inputs, self.n_outputs) valid_bias_shape = (self.n_outputs, ) if self.weight is None: self.weight = np.zeros(valid_weight_shape) self.weight[:self.n_unconditioned, :] = 1 if isinstance(self.bias, init.Initializer): self.bias = self.bias.sample(valid_bias_shape) super(BaseStepAssociative, self).init_layers() if self.bias.shape != valid_bias_shape: raise ValueError("Bias vector has invalid shape. Got {}, " "expected {}".format(self.bias.shape, valid_bias_shape)) self.bias = self.bias.astype(float) def predict(self, input_data): input_data = format_data(input_data, is_feature1d=False) raw_output = input_data.dot(self.weight) + self.bias return np.where(raw_output > 0, 1, 0) def train(self, input_train, *args, **kwargs): input_train = format_data(input_train, is_feature1d=False) return super(BaseStepAssociative, self).train(input_train, *args, **kwargs) def train_epoch(self, input_train, target_train): weight = self.weight n_unconditioned = self.n_unconditioned predict = self.predict weight_delta = self.weight_delta for input_row in input_train: input_row = np.reshape(input_row, (1, input_row.size)) layer_output = predict(input_row) weight[n_unconditioned:, :] += weight_delta( input_row, layer_output)
class BatchNorm(BaseLayer): """ Batch-normalization layer. Parameters ---------- axes : int, tuple with int or None The axis or axes along which normalization is applied. ``None`` means that normalization will be applied over all axes except the first one. In case of 4D tensor it will be equal to ``(0, 2, 3)``. Defaults to ``None``. epsilon : float Epsilon is a positive constant that adds to the standard deviation to prevent the division by zero. Defaults to ``1e-5``. alpha : float Coefficient for the exponential moving average of batch-wise means and standard deviations computed during training; the closer to one, the more it will depend on the last batches seen. Value needs to be between ``0`` and ``1``. Defaults to ``0.1``. gamma : array-like, Theano variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=1)``. beta : array-like, Theano variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0)``. running_mean : array-like, Theano variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0)``. running_inv_std : array-like, Theano variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=1)``. {BaseLayer.Parameters} Methods ------- {BaseLayer.Methods} Attributes ---------- {BaseLayer.Attributes} References ---------- .. [1] Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, http://arxiv.org/pdf/1502.03167v3.pdf """ axes = AxesProperty(default=None) epsilon = NumberProperty(default=1e-5, minval=0) alpha = ProperFractionProperty(default=0.1) beta = ParameterProperty(default=init.Constant(value=0)) gamma = ParameterProperty(default=init.Constant(value=1)) running_mean = ParameterProperty(default=init.Constant(value=0)) running_inv_std = ParameterProperty(default=init.Constant(value=1)) def initialize(self): super(BatchNorm, self).initialize() input_shape = as_tuple(None, self.input_shape) ndim = len(input_shape) if self.axes is None: # If ndim == 4 then axes = (0, 2, 3) # If ndim == 2 then axes = (0,) self.axes = tuple(axis for axis in range(ndim) if axis != 1) if any(axis >= ndim for axis in self.axes): raise ValueError("Cannot apply batch normalization on the axis " "that doesn't exist.") opposite_axes = find_opposite_axes(self.axes, ndim) parameter_shape = [input_shape[axis] for axis in opposite_axes] if any(parameter is None for parameter in parameter_shape): unknown_dim_index = parameter_shape.index(None) raise ValueError("Cannot apply batch normalization on the axis " "with unknown size over the dimension #{} " "(0-based indeces).".format(unknown_dim_index)) self.add_parameter(value=self.running_mean, shape=parameter_shape, name='running_mean', trainable=False) self.add_parameter(value=self.running_inv_std, shape=parameter_shape, name='running_inv_std', trainable=False) self.add_parameter(value=self.gamma, name='gamma', shape=parameter_shape, trainable=True) self.add_parameter(value=self.beta, name='beta', shape=parameter_shape, trainable=True) def output(self, input_value): epsilon = asfloat(self.epsilon) alpha = asfloat(self.alpha) gamma, beta = self.gamma, self.beta ndim = input_value.ndim axes = self.axes running_mean = self.running_mean running_inv_std = self.running_inv_std input_mean = input_value.mean(axes) input_var = 
input_value.var(axes) input_inv_std = T.inv(T.sqrt(input_var + epsilon)) self.updates = [( running_inv_std, asfloat(1 - alpha) * running_inv_std + alpha * input_inv_std ), ( running_mean, asfloat(1 - alpha) * running_mean + alpha * input_mean )] if not self.training_state: mean = running_mean inv_std = running_inv_std else: mean = input_mean inv_std = input_inv_std opposite_axes = find_opposite_axes(axes, ndim) beta = dimshuffle(beta, ndim, opposite_axes) gamma = dimshuffle(gamma, ndim, opposite_axes) mean = dimshuffle(mean, ndim, opposite_axes) inv_std = dimshuffle(inv_std, ndim, opposite_axes) normalized_value = (input_value - mean) * inv_std return gamma * normalized_value + beta
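# Usage sketch for the batch-normalization layer above; it assumes the class
# is exposed as layers.BatchNorm and the network layout is illustrative.
from neupy import layers, init

network = layers.join(
    layers.Input(784),
    layers.Linear(256),
    # With a 2D input the default axes=None becomes (0,), so gamma, beta and
    # both running statistics get the shape (256,).
    layers.BatchNorm(gamma=init.Constant(1), beta=init.Constant(0)),
)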
class RBM(BaseAlgorithm, BaseNetwork, MinibatchTrainingMixin, DumpableObject): """ Boolean/Bernoulli Restricted Boltzmann Machine (RBM). Algorithm assumes that inputs are either binary values or values between 0 and 1. Parameters ---------- n_visible : int Number of visible units. Number of features (columns) in the input data. n_hidden : int Number of hidden units. The larger the number, the more information the network can capture from the data, but it also means that the network is more likely to overfit. batch_size : int Size of the mini-batch. Defaults to ``10``. weight : array-like, Tensorflow variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Normal <neupy.init.Normal>`. hidden_bias : array-like, Tensorflow variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(value=0) <neupy.init.Constant>`. visible_bias : array-like, Tensorflow variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(value=0) <neupy.init.Constant>`. {BaseNetwork.Parameters} Methods ------- train(input_train, epochs=100) Trains network. {BaseSkeleton.fit} visible_to_hidden(visible_input) Propagates data through the network and returns output from the hidden layer. hidden_to_visible(hidden_input) Propagates output from the hidden layer backward to the visible. gibbs_sampling(visible_input, n_iter=1) Makes Gibbs sampling ``n`` times using visible input. Examples -------- >>> import numpy as np >>> from neupy import algorithms >>> >>> data = np.array([ ... [1, 0, 1, 0], ... [1, 0, 1, 0], ... [1, 0, 0, 0], # incomplete sample ... [1, 0, 1, 0], ... ... [0, 1, 0, 1], ... [0, 0, 0, 1], # incomplete sample ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... ]) >>> >>> rbm = algorithms.RBM(n_visible=4, n_hidden=1) >>> rbm.train(data, epochs=100) >>> >>> hidden_states = rbm.visible_to_hidden(data) >>> hidden_states.round(2) array([[ 0.99], [ 0.99], [ 0.95], [ 0.99], [ 0. ], [ 0.01], [ 0. ], [ 0. ], [ 0. ], [ 0. ]]) References ---------- [1] G. Hinton, A Practical Guide to Training Restricted Boltzmann Machines, 2010. 
http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf """ n_visible = IntProperty(minval=1) n_hidden = IntProperty(minval=1) batch_size = IntProperty(minval=1, default=10) weight = ParameterProperty(default=init.Normal()) hidden_bias = ParameterProperty(default=init.Constant(value=0)) visible_bias = ParameterProperty(default=init.Constant(value=0)) def __init__(self, n_visible, n_hidden, **options): options.update({'n_visible': n_visible, 'n_hidden': n_hidden}) super(RBM, self).__init__(**options) def init_input_output_variables(self): with tf.variable_scope('rbm'): self.weight = create_shared_parameter(value=self.weight, name='weight', shape=(self.n_visible, self.n_hidden)) self.hidden_bias = create_shared_parameter( value=self.hidden_bias, name='hidden-bias', shape=(self.n_hidden, ), ) self.visible_bias = create_shared_parameter( value=self.visible_bias, name='visible-bias', shape=(self.n_visible, ), ) self.variables.update(network_input=tf.placeholder( tf.float32, (None, self.n_visible), name="network-input", ), network_hidden_input=tf.placeholder( tf.float32, (None, self.n_hidden), name="network-hidden-input", )) def init_variables(self): with tf.variable_scope('rbm'): self.variables.update(h_samples=tf.Variable( tf.zeros([self.batch_size, self.n_hidden]), name="hidden-samples", dtype=tf.float32, ), ) def init_methods(self): def free_energy(visible_sample): with tf.name_scope('free-energy'): wx = tf.matmul(visible_sample, self.weight) wx_b = wx + self.hidden_bias visible_bias_term = dot(visible_sample, self.visible_bias) # We can get infinity when wx_b is a relatively large number # (maybe 100). Taking exponent makes it even larger and # for with float32 it can convert it to infinity. But because # number is so large we don't care about +1 value before taking # logarithms and therefore we can just pick value as it is # since our operation won't change anything. hidden_terms = tf.where( # exp(30) is such a big number that +1 won't # make any difference in the outcome. tf.greater(wx_b, 30), wx_b, tf.log1p(tf.exp(wx_b)), ) hidden_term = tf.reduce_sum(hidden_terms, axis=1) return -(visible_bias_term + hidden_term) def visible_to_hidden(visible_sample): with tf.name_scope('visible-to-hidden'): wx = tf.matmul(visible_sample, self.weight) wx_b = wx + self.hidden_bias return tf.nn.sigmoid(wx_b) def hidden_to_visible(hidden_sample): with tf.name_scope('hidden-to-visible'): wx = tf.matmul(hidden_sample, self.weight, transpose_b=True) wx_b = wx + self.visible_bias return tf.nn.sigmoid(wx_b) def sample_hidden_from_visible(visible_sample): with tf.name_scope('sample-hidden-to-visible'): hidden_prob = visible_to_hidden(visible_sample) hidden_sample = random_binomial(hidden_prob) return hidden_sample def sample_visible_from_hidden(hidden_sample): with tf.name_scope('sample-visible-to-hidden'): visible_prob = hidden_to_visible(hidden_sample) visible_sample = random_binomial(visible_prob) return visible_sample network_input = self.variables.network_input network_hidden_input = self.variables.network_hidden_input input_shape = tf.shape(network_input) n_samples = input_shape[0] weight = self.weight h_bias = self.hidden_bias v_bias = self.visible_bias h_samples = self.variables.h_samples step = asfloat(self.step) with tf.name_scope('positive-values'): # We have to use `cond` instead of `where`, because # different if-else cases might have different shapes # and it triggers exception in tensorflow. 
v_pos = tf.cond( tf.equal(n_samples, self.batch_size), lambda: network_input, lambda: random_sample(network_input, self.batch_size)) h_pos = visible_to_hidden(v_pos) with tf.name_scope('negative-values'): v_neg = sample_visible_from_hidden(h_samples) h_neg = visible_to_hidden(v_neg) with tf.name_scope('weight-update'): weight_update = ( tf.matmul(v_pos, h_pos, transpose_a=True) - tf.matmul(v_neg, h_neg, transpose_a=True)) / asfloat(n_samples) with tf.name_scope('hidden-bias-update'): h_bias_update = tf.reduce_mean(h_pos - h_neg, axis=0) with tf.name_scope('visible-bias-update'): v_bias_update = tf.reduce_mean(v_pos - v_neg, axis=0) with tf.name_scope('flipped-input-features'): # Each row will have random feature marked with number 1 # Other values will be equal to 0 possible_feature_corruptions = tf.eye(self.n_visible) corrupted_features = random_sample(possible_feature_corruptions, n_samples) rounded_input = tf.round(network_input) # If we scale input values from [0, 1] range to [-1, 1] # than it will be easier to flip feature values with simple # multiplication. scaled_rounded_input = 2 * rounded_input - 1 scaled_flipped_rounded_input = ( # for corrupted_features we convert 0 to 1 and 1 to -1 # in this way after multiplication we will flip all # signs where -1 in the transformed corrupted_features (-2 * corrupted_features + 1) * scaled_rounded_input) # Scale it back to the [0, 1] range flipped_rounded_input = (scaled_flipped_rounded_input + 1) / 2 with tf.name_scope('pseudo-likelihood-loss'): # Stochastic pseudo-likelihood error = tf.reduce_mean(self.n_visible * tf.log_sigmoid( free_energy(flipped_rounded_input) - free_energy(rounded_input))) with tf.name_scope('gibbs-sampling'): gibbs_sampling = sample_visible_from_hidden( sample_hidden_from_visible(network_input)) initialize_uninitialized_variables() self.methods.update(train_epoch=function( [network_input], error, name='rbm/train-epoch', updates=[ (weight, weight + step * weight_update), (h_bias, h_bias + step * h_bias_update), (v_bias, v_bias + step * v_bias_update), (h_samples, random_binomial(p=h_neg)), ]), prediction_error=function( [network_input], error, name='rbm/prediction-error', ), diff1=function( [network_input], free_energy(flipped_rounded_input), name='rbm/diff1-error', ), diff2=function( [network_input], free_energy(rounded_input), name='rbm/diff2-error', ), visible_to_hidden=function( [network_input], visible_to_hidden(network_input), name='rbm/visible-to-hidden', ), hidden_to_visible=function( [network_hidden_input], hidden_to_visible(network_hidden_input), name='rbm/hidden-to-visible', ), gibbs_sampling=function( [network_input], gibbs_sampling, name='rbm/gibbs-sampling', )) def train(self, input_train, input_test=None, epochs=100, summary='table'): """ Train RBM. Parameters ---------- input_train : 1D or 2D array-like input_test : 1D or 2D array-like or None Defaults to ``None``. epochs : int Number of training epochs. Defaults to ``100``. summary : {'table', 'inline'} Training summary type. Defaults to ``'table'``. """ return super(RBM, self).train(input_train=input_train, target_train=None, input_test=input_test, target_test=None, epochs=epochs, epsilon=None, summary=summary) def train_epoch(self, input_train, target_train=None): """ Train one epoch. 
Parameters ---------- input_train : array-like (n_samples, n_features) Returns ------- float """ errors = self.apply_batches( function=self.methods.train_epoch, input_data=input_train, description='Training batches', show_error_output=True, ) n_samples = len(input_train) return average_batch_errors(errors, n_samples, self.batch_size) def visible_to_hidden(self, visible_input): """ Propagates data through the network and returns output from the hidden layer. Parameters ---------- visible_input : array-like (n_samples, n_visible_features) Returns ------- array-like """ is_input_feature1d = (self.n_visible == 1) visible_input = format_data(visible_input, is_input_feature1d) outputs = self.apply_batches( function=self.methods.visible_to_hidden, input_data=visible_input, description='Hidden from visible batches', show_progressbar=True, show_error_output=False, scalar_output=False, ) return np.concatenate(outputs, axis=0) def hidden_to_visible(self, hidden_input): """ Propagates output from the hidden layer backward to the visible. Parameters ---------- hidden_input : array-like (n_samples, n_hidden_features) Returns ------- array-like """ is_input_feature1d = (self.n_hidden == 1) hidden_input = format_data(hidden_input, is_input_feature1d) outputs = self.apply_batches( function=self.methods.hidden_to_visible, input_data=hidden_input, description='Visible from hidden batches', show_progressbar=True, show_error_output=False, scalar_output=False, ) return np.concatenate(outputs, axis=0) def prediction_error(self, input_data, target_data=None): """ Compute the pseudo-likelihood of input samples. Parameters ---------- input_data : array-like Values of the visible layer Returns ------- float Value of the pseudo-likelihood. """ is_input_feature1d = (self.n_visible == 1) input_data = format_data(input_data, is_input_feature1d) errors = self.apply_batches( function=self.methods.prediction_error, input_data=input_data, description='Validation batches', show_error_output=True, ) return average_batch_errors( errors, n_samples=len(input_data), batch_size=self.batch_size, ) def gibbs_sampling(self, visible_input, n_iter=1): """ Makes Gibbs sampling n times using visible input. Parameters ---------- visible_input : 1d or 2d array n_iter : int Number of Gibbs sampling iterations. Defaults to ``1``. Returns ------- array-like Output from the visible units after performing n Gibbs samples. Array will contain only binary units (0 and 1). """ is_input_feature1d = (self.n_visible == 1) visible_input = format_data(visible_input, is_input_feature1d) gibbs_sampling = self.methods.gibbs_sampling input_ = visible_input for iteration in range(n_iter): input_ = gibbs_sampling(input_) return input_
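# Hedged usage sketch for the RBM class defined above, mirroring the data
# layout from its docstring (epoch count, batch size, and variable names are
# illustrative). ``gibbs_sampling`` samples plausible binary completions for
# the "incomplete" rows, while ``visible_to_hidden`` exposes the learned
# hidden-unit probabilities.
import numpy as np
from neupy import algorithms

demo_data = np.array([
    [1, 0, 1, 0],
    [1, 0, 0, 0],  # incomplete sample
    [0, 1, 0, 1],
    [0, 0, 0, 1],  # incomplete sample
])

demo_rbm = algorithms.RBM(n_visible=4, n_hidden=1, batch_size=4)
demo_rbm.train(demo_data, epochs=50)

hidden_probabilities = demo_rbm.visible_to_hidden(demo_data)   # shape (4, 1)
reconstructions = demo_rbm.gibbs_sampling(demo_data, n_iter=5)  # binary, shape (4, 4)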
def test_initializer_get_value_exception(self): initializer = init.Constant() with self.assertRaises(init.UninitializedException): initializer.get_value()
class RBM(BaseAlgorithm, BaseNetwork, MinibatchTrainingMixin): """ Boolean/Bernoulli Restricted Boltzmann Machine (RBM). Algorithm assumes that inputs are either binary values or values between 0 and 1. Parameters ---------- n_visible : int Number of visible units. n_hidden : int Number of hidden units. {MinibatchTrainingMixin.batch_size} weight : array-like, Theano variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`XavierNormal <neupy.init.XavierNormal>`. hidden_bias : array-like, Theano variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(value=0) <neupy.init.Constant>`. visible_bias : array-like, Theano variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(value=0) <neupy.init.Constant>`. {BaseNetwork.Parameters} Methods ------- train(input_train, epochs=100) Trains network. {BaseSkeleton.fit} visible_to_hidden(visible_input) Populates data throught the network and returns output from the hidden layer. hidden_to_visible(hidden_input) Propagates output from the hidden layer backward to the visible. gibbs_sampling(visible_input, n_iter=1) Makes Gibbs sampling ``n`` times using visible input. Examples -------- >>> import numpy as np >>> from neupy import algorithms >>> >>> data = np.array([ ... [1, 0, 1, 0], ... [1, 0, 1, 0], ... [1, 0, 0, 0], # incomplete sample ... [1, 0, 1, 0], ... ... [0, 1, 0, 1], ... [0, 0, 0, 1], # incomplete sample ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... ]) >>> >>> rbm = algorithms.RBM(n_visible=4, n_hidden=1) >>> rbm.train(data, epochs=100) >>> >>> hidden_states = rbm.visible_to_hidden(data) >>> hidden_states.round(2) array([[ 0.99], [ 0.99], [ 0.95], [ 0.99], [ 0. ], [ 0.01], [ 0. ], [ 0. ], [ 0. ], [ 0. ]]) References ---------- [1] G. Hinton, A Practical Guide to Training Restricted Boltzmann Machines, 2010. 
http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf """ n_visible = IntProperty(minval=1) n_hidden = IntProperty(minval=1) weight = ParameterProperty(default=init.XavierNormal()) hidden_bias = ParameterProperty(default=init.Constant(value=0)) visible_bias = ParameterProperty(default=init.Constant(value=0)) def __init__(self, n_visible, n_hidden, **options): self.theano_random = theano_random_stream() super(ConfigurableABC, self).__init__(n_hidden=n_hidden, n_visible=n_visible, **options) self.weight = create_shared_parameter(value=self.weight, name='algo:rbm/matrix:weight', shape=(n_visible, n_hidden)) self.hidden_bias = create_shared_parameter( value=self.hidden_bias, name='algo:rbm/vector:hidden-bias', shape=(n_hidden, ), ) self.visible_bias = create_shared_parameter( value=self.visible_bias, name='algo:rbm/vector:visible-bias', shape=(n_visible, ), ) super(RBM, self).__init__(**options) def init_input_output_variables(self): self.variables.update( network_input=T.matrix(name='algo:rbm/var:network-input'), ) def init_variables(self): self.variables.update(h_samples=theano.shared( name='algo:rbm/matrix:hidden-samples', value=asint(np.zeros((self.batch_size, self.n_hidden))), ), ) def init_methods(self): def free_energy(visible_sample): wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias visible_bias_term = T.dot(visible_sample, self.visible_bias) hidden_term = T.log(asfloat(1) + T.exp(wx_b)).sum(axis=1) return -visible_bias_term - hidden_term def visible_to_hidden(visible_sample): wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias return T.nnet.sigmoid(wx_b) def hidden_to_visible(hidden_sample): wx_b = T.dot(hidden_sample, self.weight.T) + self.visible_bias return T.nnet.sigmoid(wx_b) def sample_hidden_from_visible(visible_sample): theano_random = self.theano_random hidden_prob = visible_to_hidden(visible_sample) hidden_sample = theano_random.binomial(n=1, p=hidden_prob, dtype=theano.config.floatX) return hidden_sample def sample_visible_from_hidden(hidden_sample): theano_random = self.theano_random visible_prob = hidden_to_visible(hidden_sample) visible_sample = theano_random.binomial(n=1, p=visible_prob, dtype=theano.config.floatX) return visible_sample network_input = self.variables.network_input n_samples = asfloat(network_input.shape[0]) theano_random = self.theano_random weight = self.weight h_bias = self.hidden_bias v_bias = self.visible_bias h_samples = self.variables.h_samples step = asfloat(self.step) sample_indeces = theano_random.random_integers( low=0, high=n_samples - 1, size=(self.batch_size, )) v_pos = ifelse( T.eq(n_samples, self.batch_size), network_input, # In case if final batch has less number of # samples then expected network_input[sample_indeces]) h_pos = visible_to_hidden(v_pos) v_neg = sample_visible_from_hidden(h_samples) h_neg = visible_to_hidden(v_neg) weight_update = v_pos.T.dot(h_pos) - v_neg.T.dot(h_neg) h_bias_update = (h_pos - h_neg).mean(axis=0) v_bias_update = (v_pos - v_neg).mean(axis=0) # Stochastic pseudo-likelihood feature_index_to_flip = theano_random.random_integers( low=0, high=self.n_visible - 1, ) rounded_input = T.round(network_input) rounded_input = network_input rounded_input_flip = T.set_subtensor( rounded_input[:, feature_index_to_flip], 1 - rounded_input[:, feature_index_to_flip]) error = T.mean(self.n_visible * T.log( T.nnet.sigmoid( free_energy(rounded_input_flip) - free_energy(rounded_input)))) self.methods.update(train_epoch=theano.function( [network_input], error, name='algo:rbm/func:train-epoch', updates=[ 
(weight, weight + step * weight_update / n_samples), (h_bias, h_bias + step * h_bias_update), (v_bias, v_bias + step * v_bias_update), (h_samples, asint(theano_random.binomial(n=1, p=h_neg))), ]), prediction_error=theano.function( [network_input], error, name='algo:rbm/func:prediction-error', ), visible_to_hidden=theano.function( [network_input], visible_to_hidden(network_input), name='algo:rbm/func:visible-to-hidden', ), hidden_to_visible=theano.function( [network_input], hidden_to_visible(network_input), name='algo:rbm/func:hidden-to-visible', ), gibbs_sampling=theano.function( [network_input], sample_visible_from_hidden( sample_hidden_from_visible(network_input)), name='algo:rbm/func:gibbs-sampling', )) def train(self, input_train, input_test=None, epochs=100, summary='table'): """ Train RBM. Parameters ---------- input_train : 1D or 2D array-like input_test : 1D or 2D array-like or None Defaults to ``None``. epochs : int Number of training epochs. Defaults to ``100``. summary : {'table', 'inline'} Training summary type. Defaults to ``'table'``. """ return super(RBM, self).train(input_train=input_train, target_train=None, input_test=input_test, target_test=None, epochs=epochs, epsilon=None, summary=summary) def train_epoch(self, input_train, target_train=None): """ Train one epoch. Parameters ---------- input_train : array-like (n_samples, n_features) Returns ------- float """ errors = self.apply_batches( function=self.methods.train_epoch, input_data=input_train, description='Training batches', show_error_output=True, ) n_samples = len(input_train) return average_batch_errors(errors, n_samples, self.batch_size) def visible_to_hidden(self, visible_input): """ Populates data throught the network and returns output from the hidden layer. Parameters ---------- visible_input : array-like (n_samples, n_visible_features) Returns ------- array-like """ is_input_feature1d = (self.n_visible == 1) visible_input = format_data(visible_input, is_input_feature1d) outputs = self.apply_batches(function=self.methods.visible_to_hidden, input_data=visible_input, description='Hidden from visible batches', show_progressbar=True, show_error_output=False) return np.concatenate(outputs, axis=0) def hidden_to_visible(self, hidden_input): """ Propagates output from the hidden layer backward to the visible. Parameters ---------- hidden_input : array-like (n_samples, n_hidden_features) Returns ------- array-like """ is_input_feature1d = (self.n_hidden == 1) hidden_input = format_data(hidden_input, is_input_feature1d) outputs = self.apply_batches(function=self.methods.hidden_to_visible, input_data=hidden_input, description='Visible from hidden batches', show_progressbar=True, show_error_output=False) return np.concatenate(outputs, axis=0) def prediction_error(self, input_data, target_data=None): """ Compute the pseudo-likelihood of input samples. Parameters ---------- input_data : array-like Values of the visible layer Returns ------- float Value of the pseudo-likelihood. """ is_input_feature1d = (self.n_visible == 1) input_data = format_data(input_data, is_input_feature1d) errors = self.apply_batches( function=self.methods.prediction_error, input_data=input_data, description='Validation batches', show_error_output=True, ) return average_batch_errors(errors, n_samples=len(input_data), batch_size=self.batch_size) def gibbs_sampling(self, visible_input, n_iter=1): """ Makes Gibbs sampling n times using visible input. Parameters ---------- visible_input : 1d or 2d array n_iter : int Number of Gibbs sampling iterations. 
Defaults to ``1``. Returns ------- array-like Output from the visible units after performing n Gibbs samples. Array will contain only binary units (0 and 1). """ is_input_feature1d = (self.n_visible == 1) visible_input = format_data(visible_input, is_input_feature1d) gibbs_sampling = self.methods.gibbs_sampling input_ = visible_input for iteration in range(n_iter): input_ = gibbs_sampling(input_) return input_
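# NumPy sketch of the free energy and the stochastic pseudo-likelihood loss
# computed inside both RBM implementations above (function and argument names
# here are illustrative, not part of the library). Shapes: ``visible`` is
# (n_samples, n_visible) and ``weight`` is (n_visible, n_hidden).
import numpy as np

def np_free_energy(visible, weight, visible_bias, hidden_bias):
    # F(v) = -(v . b_v + sum_j log(1 + exp((v W + b_h)_j)))
    wx_b = visible.dot(weight) + hidden_bias
    visible_term = visible.dot(visible_bias)
    hidden_term = np.log1p(np.exp(wx_b)).sum(axis=1)
    return -(visible_term + hidden_term)

def np_pseudo_likelihood(visible, weight, visible_bias, hidden_bias, flip_index=0):
    # Flip one visible feature (chosen at random during training, fixed here
    # for illustration) and compare free energies of the original and the
    # corrupted samples, scaled by the number of visible units.
    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    visible = np.round(visible)
    flipped = visible.copy()
    flipped[:, flip_index] = 1 - flipped[:, flip_index]
    n_visible = visible.shape[1]
    energy_gap = (np_free_energy(flipped, weight, visible_bias, hidden_bias)
                  - np_free_energy(visible, weight, visible_bias, hidden_bias))
    return np.mean(n_visible * np.log(sigmoid(energy_gap)))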
class PRelu(ActivationLayer): """ The layer with the parametrized ReLu activation function. Parameters ---------- alpha_axes : int or tuple Axes that will not include unique alpha parameter. Single integer value defines the same as a tuple with one value. Defaults to ``-1``. alpha : array-like, Tensorflow variable, scalar or Initializer Alpha parameter per each non-shared axis for the ReLu. Scalar value means that each element in the tensor will be equal to the specified value. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0.25)``. {ActivationLayer.Parameters} Methods ------- {ActivationLayer.Methods} Attributes ---------- {ActivationLayer.Attributes} Examples -------- Feedforward Neural Networks (FNN) >>> from neupy.layers import * >>> network = Input(10) > PRelu(20) > PRelu(1) Convolutional Neural Networks (CNN) >>> from neupy.layers import * >>> network = join( ... Input((32, 32, 3)), ... Convolution((3, 3, 16)) > PRelu(), ... Convolution((3, 3, 32)) > PRelu(), ... Reshape(), ... Softmax(10), ... ) References ---------- .. [1] https://arxiv.org/pdf/1502.01852v1.pdf """ alpha_axes = AxesProperty(default=-1) alpha = ParameterProperty(default=init.Constant(value=0.25)) def __init__(self, *args, **options): super(PRelu, self).__init__(*args, **options) if 0 in self.alpha_axes: raise ValueError("Cannot specify alpha for 0-axis") def validate(self, input_shape): if max(self.alpha_axes) > len(input_shape): max_axis_index = len(input_shape) - 1 raise ValueError("Cannot specify alpha for the axis #{}. " "Maximum available axis is #{} (0-based indices)." "".format(max(self.alpha_axes), max_axis_index)) def initialize(self): super(PRelu, self).initialize() output_shape = as_tuple(None, self.output_shape) alpha_shape = [output_shape[axis] for axis in self.alpha_axes] self.add_parameter( value=self.alpha, name='alpha', shape=alpha_shape, trainable=True, ) def activation_function(self, input_value): input_value = tf.convert_to_tensor(input_value, dtype=tf.float32) ndim = len(input_value.get_shape()) dimensions = np.arange(ndim) alpha_axes = dimensions[list(self.alpha_axes)] alpha = dimshuffle(self.alpha, ndim, alpha_axes) return tf.nn.leaky_relu(tf.to_float(input_value), tf.to_float(alpha))
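# NumPy sketch of the parametric ReLU computed by ``activation_function``
# above: positive inputs pass through unchanged, negative inputs are scaled
# by the (trainable) alpha. The helper name and the input values are
# illustrative only.
import numpy as np

def np_prelu(x, alpha=0.25):
    return np.where(x >= 0, x, alpha * x)

x = np.array([[-2.0, 3.0], [0.5, -1.0]])
print(np_prelu(x))
# [[-0.5   3.  ]
#  [ 0.5  -0.25]]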