class Momentum(MinibatchGradientDescent): """ Momentum algorithm. Parameters ---------- momentum : float Control previous gradient ratio. Defaults to ``0.9``. nesterov : bool Instead of classic momentum computes Nesterov momentum. Defaults to ``False``. {MinibatchGradientDescent.Parameters} Attributes ---------- {MinibatchGradientDescent.Attributes} Methods ------- {MinibatchGradientDescent.Methods} Examples -------- >>> import numpy as np >>> from neupy import algorithms >>> >>> x_train = np.array([[1, 2], [3, 4]]) >>> y_train = np.array([[1], [0]]) >>> >>> mnet = algorithms.Momentum((2, 3, 1)) >>> mnet.train(x_train, y_train) See Also -------- :network:`GradientDescent` : GradientDescent algorithm. """ momentum = ProperFractionProperty(default=0.9) nesterov = Property(default=False, expected_type=bool) def init_param_updates(self, layer, parameter): step = self.variables.step parameter_shape = parameter.get_value().shape previous_velocity = theano.shared( name="{}/previous-velocity".format(parameter.name), value=asfloat(np.zeros(parameter_shape)), ) gradient = T.grad(self.variables.error_func, wrt=parameter) velocity = self.momentum * previous_velocity - step * gradient if self.nesterov: velocity = self.momentum * velocity - step * gradient return [ (parameter, parameter + velocity), (previous_velocity, velocity), ]
class Momentum(GradientDescent): """ Momentum algorithm. Parameters ---------- momentum : float Control previous gradient ratio. Defaults to ``0.9``. nesterov : bool Instead of classic momentum computes Nesterov momentum. Defaults to ``False``. {GradientDescent.Parameters} Attributes ---------- {GradientDescent.Attributes} Methods ------- {GradientDescent.Methods} Examples -------- >>> import numpy as np >>> from neupy import algorithms >>> from neupy.layers import * >>> >>> x_train = np.array([[1, 2], [3, 4]]) >>> y_train = np.array([[1], [0]]) >>> >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1) >>> optimizer = algorithms.Momentum(network) >>> optimizer.train(x_train, y_train) See Also -------- :network:`GradientDescent` : GradientDescent algorithm. """ momentum = ProperFractionProperty(default=0.9) nesterov = Property(default=False, expected_type=bool) def init_train_updates(self): optimizer = tf.train.MomentumOptimizer( use_nesterov=self.nesterov, momentum=self.momentum, learning_rate=self.step, ) self.functions.optimizer = optimizer return [optimizer.minimize(self.variables.loss)]
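# A minimal NumPy sketch of the update rule that both Momentum implementations
# above apply (classic vs. Nesterov). The function name and signature are
# illustrative only, not part of the NeuPy API.
import numpy as np

def momentum_step(parameter, gradient, velocity, step=0.1,
                  momentum=0.9, nesterov=False):
    # Classic momentum: velocity is a decaying sum of past gradients.
    velocity = momentum * velocity - step * gradient
    if nesterov:
        # Nesterov variant applies the momentum term once more, which
        # approximates evaluating the gradient at the look-ahead point.
        velocity = momentum * velocity - step * gradient
    return parameter + velocity, velocity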
class GlobalPooling(BaseLayer): """ Global pooling layer. Parameters ---------- function : callable Function that aggregates over dimensions. Defaults to ``theano.tensor.mean``. .. code-block:: python def agg_func(x, axis=None): pass {BaseLayer.Parameters} Methods ------- {BaseLayer.Methods} Attributes ---------- {BaseLayer.Attributes} Examples -------- >>> from neupy import layers >>> >>> network = layers.join( ... layers.Input((16, 4, 4)), ... layers.GlobalPooling(), ... ) >>> network.output_shape (16,) """ function = Property(default=T.mean) @property def output_shape(self): if self.input_shape is not None: return as_tuple(self.input_shape[0]) def output(self, input_value): if input_value.ndim in (1, 2): return input_value agg_axis = range(2, input_value.ndim) return self.function(input_value, axis=list(agg_axis))
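# Quick NumPy illustration of what GlobalPooling computes: the aggregation
# function (mean by default) is applied over every dimension after the channel
# axis, so a (batch, channels, height, width) tensor collapses to
# (batch, channels). The shapes below are made up for the example.
import numpy as np

images = np.random.random((10, 16, 4, 4))
pooled = images.mean(axis=(2, 3))
print(pooled.shape)  # (10, 16)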
class BaseRNNLayer(BaseLayer):
    """
    Base class for recurrent layers.

    Parameters
    ----------
    n_units : int
        Number of hidden units in the layer.

    only_return_final : bool
        If ``True``, only return the final sequential output
        (e.g. for tasks where a single target value for the entire
        sequence is desired). In this case, Tensorflow makes an
        optimization which saves memory. Defaults to ``True``.

    {BaseLayer.name}
    """
    n_units = IntProperty(minval=1)
    only_return_final = Property(expected_type=bool)

    def __init__(self, n_units, only_return_final=True, name=None):
        super(BaseRNNLayer, self).__init__(name=name)
        self.only_return_final = only_return_final
        self.n_units = n_units

    def fail_if_shape_invalid(self, input_shape):
        if input_shape and input_shape.ndims != 3:
            clsname = self.__class__.__name__
            raise LayerConnectionError(
                "{} layer expected input with three dimensions, "
                "but got input with {} dimensions instead. Layer: {}"
                "".format(clsname, input_shape.ndims, self))

    def get_output_shape(self, input_shape):
        input_shape = tf.TensorShape(input_shape)
        n_samples = input_shape[0]

        self.fail_if_shape_invalid(input_shape)

        if self.only_return_final:
            return tf.TensorShape((n_samples, self.n_units))

        n_time_steps = input_shape[1]
        return tf.TensorShape((n_samples, n_time_steps, self.n_units))
class BaseRNNLayer(BaseLayer):
    """
    Base class for recurrent layers.

    Parameters
    ----------
    size : int
        Number of hidden units in the layer.

    only_return_final : bool
        If ``True``, only return the final sequential output
        (e.g. for tasks where a single target value for the entire
        sequence is desired). In this case, Theano makes an
        optimization which saves memory. Defaults to ``True``.

    {BaseLayer.Parameters}
    """
    size = IntProperty(minval=1)
    only_return_final = Property(default=True, expected_type=bool)

    def __init__(self, size, **kwargs):
        super(BaseRNNLayer, self).__init__(size=size, **kwargs)

    def validate(self, input_shape):
        n_input_dims = len(input_shape) + 1  # +1 for batch dimension
        clsname = self.__class__.__name__

        if n_input_dims < 3:
            raise LayerConnectionError(
                "{} layer expected input with at least three "
                "dimensions, got input with {} dimensions instead"
                "".format(clsname, n_input_dims))

    @property
    def output_shape(self):
        if self.only_return_final:
            return as_tuple(self.size)

        n_time_steps = self.input_shape[0]
        return as_tuple(n_time_steps, self.size)
class A(Configurable): int_property = Property(expected_type=int)
class BaseNetwork(BaseSkeleton):
    """
    Base class for Neural Network algorithms.

    Parameters
    ----------
    step : float
        Learning rate, defaults to ``0.1``.

    show_epoch : int or str
        This property controls how often the network will display
        information about training. There are two main syntaxes for
        this property.

        - You can define it as a positive integer number. It defines
          how often you would like to see summary output in the
          terminal. For instance, number ``100`` means that the network
          shows a summary at the 100th, 200th, 300th ... epochs.

        - String defines the number of times you want to see output in
          the terminal. For instance, value ``'2 times'`` means that the
          network will show output twice with an approximately equal
          period of epochs and one additional output after the final
          epoch.

        Defaults to ``1``.

    shuffle_data : bool
        If it's ``True``, the class shuffles all training data before
        training the network, defaults to ``True``.

    epoch_end_signal : function
        Calls this function when a training epoch finishes.

    train_end_signal : function
        Calls this function when the training process finishes.

    {Verbose.Parameters}

    Attributes
    ----------
    errors : ErrorHistoryList
        Contains list of training errors. This object has the same
        properties as list and in addition there are three additional
        useful methods: `last`, `previous` and `normalized`.

    train_errors : ErrorHistoryList
        Alias to the ``errors`` attribute.

    validation_errors : ErrorHistoryList
        The same as `errors` attribute, but it contains only validation
        errors.

    last_epoch : int
        Value equal to the last trained epoch. After initialization
        it is equal to ``0``.
    """
    step = NumberProperty(default=0.1, minval=0)
    show_epoch = ShowEpochProperty(minval=1, default=1)
    shuffle_data = Property(default=False, expected_type=bool)
    epoch_end_signal = Property(expected_type=types.FunctionType)
    train_end_signal = Property(expected_type=types.FunctionType)

    def __init__(self, *args, **options):
        self.errors = self.train_errors = ErrorHistoryList()
        self.validation_errors = ErrorHistoryList()
        self.training = AttributeKeyDict()
        self.last_epoch = 0

        super(BaseNetwork, self).__init__(*args, **options)

        if self.verbose:
            show_network_options(self, highlight_options=options)

    def predict(self, input_data):
        """
        Return prediction results for the input data.

        Parameters
        ----------
        input_data : array-like

        Returns
        -------
        array-like
        """
        raise NotImplementedError

    def on_epoch_start_update(self, epoch):
        """
        Function that triggers before all training procedures
        related to the current epoch start.

        Parameters
        ----------
        epoch : int
            Current epoch number.
        """
        self.last_epoch = epoch

    def train_epoch(self, input_train, target_train=None):
        raise NotImplementedError()

    def prediction_error(self, input_test, target_test):
        raise NotImplementedError()

    def train(self, input_train, target_train=None, input_test=None,
              target_test=None, epochs=100, epsilon=None, summary='table'):
        """
        Method that trains the neural network.

        Parameters
        ----------
        input_train : array-like

        target_train : array-like or None

        input_test : array-like or None

        target_test : array-like or None

        epochs : int
            Defaults to ``100``.

        epsilon : float or None
            Defaults to ``None``.
        """
        show_epoch = self.show_epoch
        logs = self.logs
        training = self.training = AttributeKeyDict()

        if epochs <= 0:
            raise ValueError("Number of epochs needs to be greater than 0.")

        if epsilon is not None and epochs <= 2:
            raise ValueError("Network should train at least 3 epochs before "
                             "checking the difference between errors")

        logging_info_about_the_data(self, input_train, input_test)
        logging_info_about_training(self, epochs, epsilon)
        logs.newline()

        if summary == 'table':
            summary = SummaryTable(
                table_builder=table.TableBuilder(
                    table.Column(name="Epoch #"),
                    table.NumberColumn(name="Train err", places=4),
                    table.NumberColumn(name="Valid err", places=4),
                    table.TimeColumn(name="Time", width=10),
                    stdout=logs.write
                ),
                network=self,
                delay_limit=1.,
                delay_history_length=10,
            )

        elif summary == 'inline':
            summary = InlineSummary(network=self)

        else:
            raise ValueError("`{}` is unknown summary type"
                             "".format(summary))

        iterepochs = create_training_epochs_iterator(self, epochs, epsilon)
        show_epoch = parse_show_epoch_property(self, epochs, epsilon)
        training.show_epoch = show_epoch

        # Storing attributes and methods in local variables prevents a lot
        # of useless __getattr__ calls inside the loop. These variables
        # speed up the loop when the number of iterations is huge.
        training_errors = self.errors
        validation_errors = self.validation_errors

        shuffle_data = self.shuffle_data
        train_epoch = self.train_epoch
        epoch_end_signal = self.epoch_end_signal
        train_end_signal = self.train_end_signal
        on_epoch_start_update = self.on_epoch_start_update

        is_first_iteration = True
        can_compute_validation_error = (input_test is not None)
        last_epoch_shown = 0

        #############################################
        # Helper that computes eigenvalues and eigenvectors of a symmetric
        # matrix. It is used below to inspect the Hessian during training.
        symMatrix = tt.dmatrix("symMatrix")
        symEigenvalues, eigenvectors = tt.nlinalg.eig(symMatrix)
        get_Eigen = theano.function(
            [symMatrix], [symEigenvalues, eigenvectors])
        #############################################

        with logs.disable_user_input():
            for epoch in iterepochs:
                validation_error = None
                epoch_start_time = time.time()
                on_epoch_start_update(epoch)

                if shuffle_data:
                    data = shuffle(*as_tuple(input_train, target_train))
                    input_train, target_train = data[:-1], data[-1]

                try:
                    train_error = train_epoch(input_train, target_train)

                    print(epoch)
                    name = str(self)

                    if name.split('(')[0] == 'Hessian':
                        H = self.variables.hessian.get_value()
                        ev, _ = get_Eigen(H)

                        print("positive EV ", np.sum(ev > 0))
                        print("Just zero EV", np.sum(ev == 0))
                        print("Zero EV ", np.sum(ev == 0) + np.sum(
                            (ev < 0) & (ev > (np.min(ev) / 2.0))))
                        print("Neg EV ", np.sum(ev < 0))
                        print("Max EV ", np.max(ev))
                        print("Min EV ", np.min(ev))

                        s = str(self.itr) + '.npy'
                        np.save(s, ev)

                    if can_compute_validation_error:
                        validation_error = self.prediction_error(input_test,
                                                                 target_test)

                    training_errors.append(train_error)
                    validation_errors.append(validation_error)

                    epoch_finish_time = time.time()
                    training.epoch_time = epoch_finish_time - epoch_start_time

                    if epoch % training.show_epoch == 0 or is_first_iteration:
                        summary.show_last()
                        last_epoch_shown = epoch

                    if epoch_end_signal is not None:
                        epoch_end_signal(self)

                    is_first_iteration = False

                except StopTraining as err:
                    # TODO: This notification breaks table view in terminal.
                    # I need to show it in a different way.
                    logs.message("TRAIN", "Epoch #{} stopped. {}"
                                          "".format(epoch, str(err)))
                    break

            if epoch != last_epoch_shown:
                summary.show_last()

            if train_end_signal is not None:
                train_end_signal(self)

            summary.finish()

        logs.newline()
class D(A): property_d = Property()
class A(object): # Doesn't have Configurable as a parent class property_a = Property()
class BaseNetwork(BaseSkeleton): """ Base class for Neural Network algorithms. Parameters ---------- step : float Learning rate, defaults to ``0.1``. show_epoch : int This property controls how often the network will display information about training. It has to be defined as positive integer. For instance, number ``100`` mean that network shows summary at 1st, 100th, 200th, 300th ... and last epochs. Defaults to ``1``. shuffle_data : bool If it's ``True`` than training data will be shuffled before the training. Defaults to ``True``. signals : dict, list or function Function that will be triggered after certain events during the training. {Verbose.Parameters} Methods ------- {BaseSkeleton.fit} predict(X) Propagates input ``X`` through the network and returns produced output. plot_errors(logx=False, show=True, **figkwargs) Using errors collected during the training this method generates plot that can give additional insight into the performance reached during the training. Attributes ---------- errors : list Information about errors. It has two main attributes, namely ``train`` and ``valid``. These attributes provide access to the training and validation errors respectively. last_epoch : int Value equals to the last trained epoch. After initialization it is equal to ``0``. n_updates_made : int Number of training updates applied to the network. """ step = NumberProperty(default=0.1, minval=0) show_epoch = IntProperty(minval=1, default=1) shuffle_data = Property(default=False, expected_type=bool) signals = Property(expected_type=object) def __init__(self, *args, **options): super(BaseNetwork, self).__init__(*args, **options) self.last_epoch = 0 self.n_updates_made = 0 self.errors = base_signals.ErrorCollector() signals = list( as_tuple( base_signals.ProgressbarSignal(), base_signals.PrintLastErrorSignal(), self.errors, self.signals, )) for i, signal in enumerate(signals): if inspect.isfunction(signal): signals[i] = base_signals.EpochEndSignal(signal) elif inspect.isclass(signal): signals[i] = signal() self.events = Events(network=self, signals=signals) def one_training_update(self, X_train, y_train=None): """ Function would be trigger before run all training procedure related to the current epoch. Parameters ---------- epoch : int Current epoch number. """ raise NotImplementedError() def score(self, X, y): raise NotImplementedError() def plot_errors(self, logx=False, show=True, **figkwargs): return plot_optimizer_errors(optimizer=self, logx=logx, show=show, **figkwargs) def train(self, X_train, y_train=None, X_test=None, y_test=None, epochs=100, batch_size=None): """ Method train neural network. Parameters ---------- X_train : array-like y_train : array-like or None X_test : array-like or None y_test : array-like or None epochs : int Defaults to ``100``. epsilon : float or None Defaults to ``None``. 
""" if epochs <= 0: raise ValueError("Number of epochs needs to be a positive number") epochs = int(epochs) first_epoch = self.last_epoch + 1 batch_size = batch_size or getattr(self, 'batch_size', None) self.events.trigger( name='train_start', X_train=X_train, y_train=y_train, epochs=epochs, batch_size=batch_size, store_data=False, ) try: for epoch in range(first_epoch, first_epoch + epochs): self.events.trigger('epoch_start') self.last_epoch = epoch iterator = iters.minibatches( (X_train, y_train), batch_size, self.shuffle_data, ) for X_batch, y_batch in iterator: self.events.trigger('update_start') update_start_time = time.time() train_error = self.one_training_update(X_batch, y_batch) self.n_updates_made += 1 self.events.trigger( name='train_error', value=train_error, eta=time.time() - update_start_time, epoch=epoch, n_updates=self.n_updates_made, n_samples=iters.count_samples(X_batch), store_data=True, ) self.events.trigger('update_end') if X_test is not None: test_start_time = time.time() validation_error = self.score(X_test, y_test) self.events.trigger( name='valid_error', value=validation_error, eta=time.time() - test_start_time, epoch=epoch, n_updates=self.n_updates_made, n_samples=iters.count_samples(X_test), store_data=True, ) self.events.trigger('epoch_end') except StopTraining as err: self.logs.message( "TRAIN", "Epoch #{} was stopped. Message: {}".format(epoch, str(err))) self.events.trigger('train_end')
class B(A): property_b = Property()
def test_property_repr_with_name(self): prop = Property(default=3) prop.name = 'test' self.assertEqual('Property(name="test")', repr(prop))
class DiscreteHopfieldNetwork(DiscreteMemory): """ Discrete Hopfield Network. It can memorize binary samples and reconstruct them from corrupted samples. Notes ----- - Works only with binary data. Input matrix should contain only zeros and ones. Parameters ---------- {DiscreteMemory.mode} {DiscreteMemory.n_times} check_limit : bool Option enable a limit of patterns control for the network using logarithmically proportion rule. Defaults to ``True``. .. math:: \\frac{{n_{{features}}}}{{2 \\cdot log_{{e}}(n_{{features}})}} Methods ------- energy(input_data) Compute Discrete Hopfield Energy. train(input_data) Save input data pattern into the network memory. predict(input_data, n_times=None) Recover data from the memory using input pattern. For the prediction procedure you can control number of iterations. If you set up this value equal to ``None`` then the value would be equal to the value that you set up for the property with the same name - ``n_times``. Examples -------- >>> import numpy as np >>> from neupy import algorithms >>> >>> def draw_bin_image(image_matrix): ... for row in image_matrix.tolist(): ... print('| ' + ' '.join(' *'[val] for val in row)) ... >>> zero = np.matrix([ ... 0, 1, 1, 1, 0, ... 1, 0, 0, 0, 1, ... 1, 0, 0, 0, 1, ... 1, 0, 0, 0, 1, ... 1, 0, 0, 0, 1, ... 0, 1, 1, 1, 0 ... ]) >>> >>> one = np.matrix([ ... 0, 1, 1, 0, 0, ... 0, 0, 1, 0, 0, ... 0, 0, 1, 0, 0, ... 0, 0, 1, 0, 0, ... 0, 0, 1, 0, 0, ... 0, 0, 1, 0, 0 ... ]) >>> >>> two = np.matrix([ ... 1, 1, 1, 0, 0, ... 0, 0, 0, 1, 0, ... 0, 0, 0, 1, 0, ... 0, 1, 1, 0, 0, ... 1, 0, 0, 0, 0, ... 1, 1, 1, 1, 1, ... ]) >>> >>> half_zero = np.matrix([ ... 0, 1, 1, 1, 0, ... 1, 0, 0, 0, 1, ... 1, 0, 0, 0, 1, ... 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, ... ]) >>> >>> draw_bin_image(zero.reshape((6, 5))) | * * * | * * | * * | * * | * * | * * * >>> draw_bin_image(half_zero.reshape((6, 5))) | * * * | * * | * * | | | >>> data = np.concatenate([zero, one, two], axis=0) >>> >>> dhnet = algorithms.DiscreteHopfieldNetwork() >>> dhnet.train(data) >>> >>> result = dhnet.predict(half_zero) >>> draw_bin_image(result.reshape((6, 5))) | * * * | * * | * * | * * | * * | * * * See Also -------- :ref:`password-recovery`: Password recovery with Discrete Hopfield Network. :ref:`discrete-hopfield-network`: Discrete Hopfield Network article. """ check_limit = Property(default=True, expected_type=bool) def __init__(self, **options): super(DiscreteHopfieldNetwork, self).__init__(**options) self.n_memorized_samples = 0 def train(self, input_data): self.discrete_validation(input_data) input_data = bin2sign(input_data) input_data = format_data(input_data, is_feature1d=False) n_rows, n_features = input_data.shape n_rows_after_update = self.n_memorized_samples + n_rows if self.check_limit: memory_limit = math.ceil(n_features / (2 * math.log(n_features))) if n_rows_after_update > memory_limit: raise ValueError("You can't memorize more than {0} " "samples".format(memory_limit)) weight_shape = (n_features, n_features) if self.weight is None: self.weight = np.zeros(weight_shape, dtype=int) if self.weight.shape != weight_shape: n_features_expected = self.weight.shape[1] raise ValueError("Input data has invalid number of features. " "Got {} features instead of {}." 
"".format(n_features, n_features_expected)) self.weight = input_data.T.dot(input_data) np.fill_diagonal(self.weight, np.zeros(len(self.weight))) self.n_memorized_samples = n_rows_after_update def predict(self, input_data, n_times=None): self.discrete_validation(input_data) input_data = format_data(bin2sign(input_data), is_feature1d=False) if self.mode == 'async': if n_times is None: n_times = self.n_times _, n_features = input_data.shape output_data = input_data for _ in range(n_times): position = np.random.randint(0, n_features - 1) raw_new_value = output_data.dot(self.weight[:, position]) output_data[:, position] = np.sign(raw_new_value) else: output_data = input_data.dot(self.weight) return step_function(output_data).astype(int) def energy(self, input_data): self.discrete_validation(input_data) input_data = bin2sign(input_data) input_data = format_data(input_data, is_feature1d=False) n_rows, n_features = input_data.shape if n_rows == 1: return hopfield_energy(self.weight, input_data, input_data) output = np.zeros(n_rows) for i, row in enumerate(input_data): output[i] = hopfield_energy(self.weight, row, row) return output
def test_property_get_method(self): prop = Property(default=3) self.assertEqual(None, prop.__get__(None, None))
def test_property_repr(self): prop = Property(default=3) self.assertEqual('Property()', repr(prop))
class A(Configurable): required_prop = Property(required=True)
class B(Configurable): int_property = Property(expected_type=(str, set))
class A(Configurable): prop = Property(default=3)
class BaseLayer(BaseConnection, Configurable):
    """
    Base class for all layers.

    Parameters
    ----------
    name : str or None
        Layer's identifier. If name is equal to ``None`` then the name
        will be generated automatically. Defaults to ``None``.

    Methods
    -------
    disable_training_state()
        Switch off training state.

    initialize()
        Set up important configurations related to the layer.

    Attributes
    ----------
    input_shape : tuple
        Layer's input shape.

    output_shape : tuple
        Layer's output shape.

    training_state : bool
        Defines whether the layer is in training state or not.

    parameters : dict
        Trainable parameters.

    graph : LayerGraph instance
        Graph that stores all relations between layers.
    """
    name = Property(expected_type=six.string_types)

    # Stores global identifier index for each layer class
    global_identifiers_map = {}

    def __new__(cls, *args, **kwargs):
        if cls not in cls.global_identifiers_map:
            cls.global_identifiers_map[cls] = 1
        return super(BaseLayer, cls).__new__(cls)

    def __init__(self, *args, **options):
        super(BaseLayer, self).__init__(*args)

        self.updates = []
        self.parameters = OrderedDict()

        self.name = generate_layer_name(layer=self)
        self.input_shape_ = None

        self.graph.add_layer(self)
        Configurable.__init__(self, **options)

    def validate(self, input_shape):
        """
        Validate input shape value before assigning it.

        Parameters
        ----------
        input_shape : tuple with int
        """

    @property
    def input_shape(self):
        return self.input_shape_

    @input_shape.setter
    def input_shape(self, shape):
        self.validate(shape)
        self.input_shape_ = shape

    @property
    def output_shape(self):
        return self.input_shape

    def output(self, input_value):
        return input_value

    def add_parameter(self, value, name, shape=None, trainable=True):
        theano_name = 'layer:{layer_name}/{parameter_name}'.format(
            layer_name=self.name,
            parameter_name=name.replace('_', '-'))

        parameter = create_shared_parameter(value, theano_name, shape)
        parameter.trainable = trainable

        self.parameters[name] = parameter
        setattr(self, name, parameter)

        return parameter

    def __repr__(self):
        classname = self.__class__.__name__
        return '{name}()'.format(name=classname)
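# A hypothetical minimal layer built on BaseLayer, to illustrate how
# ``initialize`` and ``add_parameter`` are meant to be used (the same pattern
# appears in the LSTM/GRU layers below). The ``Scale`` layer itself is just
# an example, not part of NeuPy.
from neupy import init

class Scale(BaseLayer):
    def initialize(self):
        super(Scale, self).initialize()
        # Single trainable parameter that multiplies the layer's input.
        self.scale = self.add_parameter(
            value=init.Constant(1), name='scale', shape=(1,))

    def output(self, input_value):
        return self.scale * input_value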
class LSTM(BaseRNNLayer): """ Long Short Term Memory (LSTM) Layer. Parameters ---------- {BaseRNNLayer.size} input_weights : Initializer, ndarray Weight parameters for input connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. hidden_weights : Initializer, ndarray Weight parameters for hidden connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. cell_weights : Initializer, ndarray Weight parameters for cell connection. Require only when ``peepholes=True`` otherwise it will be ignored. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. bias : Initializer, ndarray Bias parameters for all gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import tensorflow as tf dict( ingate=tf.nn.sigmoid, forgetgate=tf.nn.sigmoid, outgate=tf.nn.sigmoid, cell=tf.tanh, ) If application requires modification to only one parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(ingate=tf.tanh) Other parameters like ``forgetgate`` or ``outgate`` will be equal to their default values. learn_init : bool If ``True``, make ``cell_init`` and ``hidden_init`` trainable variables. Defaults to ``False``. cell_init : array-like, Tensorfow variable, scalar or Initializer Initializer for initial cell state (:math:`c_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. hidden_init : array-like, Tensorfow variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False`` {BaseRNNLayer.only_return_final} peepholes : bool If ``True``, the LSTM uses peephole connections. When ``False``, cell parameters are ignored. Defaults to ``False``. unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. gradient_clipping : float or int If nonzero, the gradient messages are clipped to the given value during the backward pass. Defaults to ``0``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. 
code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.LSTM(20), layers.Sigmoid(1), ] ) """ input_weights = ParameterProperty(default=init.HeNormal()) hidden_weights = ParameterProperty(default=init.HeNormal()) cell_weights = ParameterProperty(default=init.HeNormal()) biases = ParameterProperty(default=init.Constant(0)) activation_functions = MultiCallableProperty(default=dict( ingate=tf.nn.sigmoid, forgetgate=tf.nn.sigmoid, outgate=tf.nn.sigmoid, cell=tf.tanh, )) learn_init = Property(default=False, expected_type=bool) cell_init = ParameterProperty(default=init.Constant(0)) hidden_init = ParameterProperty(default=init.Constant(0)) unroll_scan = Property(default=False, expected_type=bool) backwards = Property(default=False, expected_type=bool) peepholes = Property(default=False, expected_type=bool) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(LSTM, self).initialize() n_inputs = np.prod(self.input_shape[1:]) # If peephole (cell to gate) connections were enabled, initialize # peephole connections. These are elementwise products with the cell # state, so they are represented as vectors. if self.peepholes: self.weight_cell_to_ingate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_ingate', shape=(self.size, )) self.weight_cell_to_forgetgate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_forgetgate', shape=(self.size, )) self.weight_cell_to_outgate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_outgate', shape=(self.size, )) self.input_weights = self.add_parameter( value=self.input_weights, name='input_weights', shape=(n_inputs, 4 * self.size), ) self.hidden_weights = self.add_parameter( value=self.hidden_weights, name='hidden_weights', shape=(self.size, 4 * self.size), ) self.biases = self.add_parameter( value=self.biases, name='biases', shape=(4 * self.size, ), ) # Initialization parameters self.add_parameter( value=self.cell_init, shape=(1, self.size), name="cell_init", trainable=self.learn_init, ) self.add_parameter( value=self.hidden_init, shape=(1, self.size), name="hidden_init", trainable=self.learn_init, ) def output(self, input_value): # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = tf.transpose(input_value, [1, 0, 2]) input_shape = tf.shape(input_value) n_batch = input_shape[1] def one_lstm_step(states, input_n): with tf.name_scope('lstm-cell'): cell_previous, hid_previous = states input_n = tf.matmul(input_n, self.input_weights) + self.biases # Calculate gates pre-activations and slice gates = input_n + tf.matmul(hid_previous, self.hidden_weights) # Clip gradients if self.gradient_clipping != 0: gates = clip_gradient(gates, self.gradient_clipping) # Extract the pre-activation gate values ingate, forgetgate, cell_input, outgate = tf.split(gates, 4, axis=1) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.weight_cell_to_ingate forgetgate += (cell_previous * self.weight_cell_to_forgetgate) # Apply nonlinearities ingate = self.activation_functions.ingate(ingate) forgetgate = self.activation_functions.forgetgate(forgetgate) cell_input = self.activation_functions.cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.weight_cell_to_outgate 
outgate = self.activation_functions.outgate(outgate) # Compute new hidden unit activation hid = outgate * tf.tanh(cell) return [cell, hid] cell_init = tf.tile(self.cell_init, (n_batch, 1)) hidden_init = tf.tile(self.hidden_init, (n_batch, 1)) sequence = input_value if self.backwards: sequence = tf.reverse(sequence, axis=[0]) if self.unroll_scan: # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan( fn=one_lstm_step, sequence=sequence, outputs_info=[cell_init, hidden_init], ) else: _, hid_out = tf.scan( fn=one_lstm_step, elems=input_value, initializer=[cell_init, hidden_init], name='lstm-scan', ) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # if scan is backward reverse the output if self.backwards: hid_out = tf.reverse(hid_out, axis=[0]) # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = tf.transpose(hid_out, [1, 0, 2]) return hid_out
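# A NumPy sketch of a single ``one_lstm_step`` from above, with peephole
# connections and gradient clipping left out. The gate ordering in the
# concatenated weight matrices (ingate, forgetgate, cell, outgate) matches
# the tf.split call in the implementation.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W_in, W_hid, b):
    gates = x_t.dot(W_in) + b + h_prev.dot(W_hid)
    ingate, forgetgate, cell_input, outgate = np.split(gates, 4, axis=-1)

    c_t = sigmoid(forgetgate) * c_prev + sigmoid(ingate) * np.tanh(cell_input)
    h_t = sigmoid(outgate) * np.tanh(c_t)
    return h_t, c_t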
class LVQ(BaseNetwork): """ Learning Vector Quantization (LVQ) algorithm. Notes ----- - Input data needs to be normalized, because LVQ uses Euclidian distance to find clusters. - Training error is just a ratio of miscassified samples Parameters ---------- n_inputs : int Number of input units. It should be equal to the number of features in the input data set. n_subclasses : int, None Defines total number of subclasses. Values should be greater or equal to the number of classes. ``None`` will set up number of subclasses equal to the number of classes. Defaults to ``None`` (or the same as ``n_classes``). n_classes : int Number of classes in the data set. prototypes_per_class : list, None Defines number of prototypes per each class. For instance, if ``n_classes=3`` and ``n_subclasses=8`` then there are can be 3 subclasses for the first class, 3 for the second one and 2 for the third one (3 + 3 + 2 == 8). The following example can be specified as ``prototypes_per_class=[3, 3, 2]``. There are two rules that apply to this parameter: 1. ``sum(prototypes_per_class) == n_subclasses`` 2. ``len(prototypes_per_class) == n_classes`` The ``None`` value will distribute approximately equal number of subclasses per each class. It's approximately, because in casses when ``n_subclasses % n_classes != 0`` there is no way to distribute equal number of subclasses per each class. Defaults to ``None``. {BaseNetwork.step} n_updates_to_stepdrop : int or None If this options is not equal to ``None`` then after every update LVQ reduces step size and do it until number of applied updates would reach the ``n_updates_to_stepdrop`` value. The minimum possible step size defined in the ``minstep`` parameter. Be aware that number of updates is not the same as number of epochs. LVQ applies update after each propagated sample through the network. Relations between this parameter and maximum number of epochs is following .. code-block:: python n_updates_to_stepdrop = n_samples * n_max_epochs If parameter equal to ``None`` then step size wouldn't be reduced after each update. Defaults to ``None``. minstep : float Step size would never be lower than this value. This property useful only in case if ``n_updates_to_stepdrop`` is not ``None``. Defaults to ``1e-5``. {BaseNetwork.show_epoch} {BaseNetwork.shuffle_data} {BaseNetwork.epoch_end_signal} {BaseNetwork.train_end_signal} {Verbose.verbose} Methods ------- {BaseSkeleton.predict} {BaseSkeleton.fit} """ n_inputs = IntProperty(minval=1) n_subclasses = IntProperty(minval=2, default=None, allow_none=True) n_classes = IntProperty(minval=2) prototypes_per_class = TypedListProperty(allow_none=True, default=None) weight = Property(expected_type=(np.ndarray, init.Initializer), allow_none=True, default=None) n_updates_to_stepdrop = IntProperty(default=None, allow_none=True, minval=1) minstep = NumberProperty(minval=0, default=1e-5) def __init__(self, **options): self.initialized = False super(LVQ, self).__init__(**options) self.n_updates = 0 if self.n_subclasses is None: self.n_subclasses = self.n_classes if isinstance(self.weight, init.Initializer): weight_shape = (self.n_inputs, self.n_subclasses) self.weight = self.weight.sample(weight_shape) if self.weight is not None: self.initialized = True if self.n_subclasses < self.n_classes: raise ValueError("Number of subclasses should be greater " "or equal to the number of classes. 
Network " "was defined with {} subclasses and {} classes" "".format(self.n_subclasses, self.n_classes)) if self.prototypes_per_class is None: whole, reminder = divmod(self.n_subclasses, self.n_classes) self.prototypes_per_class = [whole] * self.n_classes if reminder: # Since we have reminder left, it means that we cannot # have an equal number of subclasses per each class, # therefor we will add +1 to randomly selected class. class_indeces = np.random.choice(self.n_classes, reminder, replace=False) for class_index in class_indeces: self.prototypes_per_class[class_index] += 1 if len(self.prototypes_per_class) != self.n_classes: raise ValueError("LVQ defined for classification problem that has " "{} classes, but the `prototypes_per_class` " "variable has defined data for {} classes." "".format(self.n_classes, len(self.prototypes_per_class))) if sum(self.prototypes_per_class) != self.n_subclasses: raise ValueError("Invalid distribution of subclasses for the " "`prototypes_per_class` variable. Got total " "of {} subclasses ({}) instead of {} expected" "".format(sum(self.prototypes_per_class), self.prototypes_per_class, self.n_subclasses)) self.subclass_to_class = [] for class_id, n_prototypes in enumerate(self.prototypes_per_class): self.subclass_to_class.extend([class_id] * n_prototypes) @property def training_step(self): if self.n_updates_to_stepdrop is None: return self.step updates_ratio = (1 - self.n_updates / self.n_updates_to_stepdrop) return self.minstep + (self.step - self.minstep) * updates_ratio def predict(self, input_data): if not self.initialized: raise NotTrained("LVQ network hasn't been trained yet") input_data = format_data(input_data) subclass_to_class = self.subclass_to_class weight = self.weight predictions = [] for input_row in input_data: output = euclid_distance(input_row, weight) winner_subclass = int(output.argmin(axis=1)) predicted_class = subclass_to_class[winner_subclass] predictions.append(predicted_class) return np.array(predictions) def train(self, input_train, target_train, *args, **kwargs): input_train = format_data(input_train) target_train = format_data(target_train) n_input_samples = len(input_train) if n_input_samples <= self.n_subclasses: raise ValueError("Number of training input samples should be " "greater than number of sublcasses. Training " "method recived {} input samples." "".format(n_input_samples)) if not self.initialized: target_classes = sorted(np.unique(target_train).astype(np.int)) expected_classes = list(range(self.n_classes)) if target_classes != expected_classes: raise ValueError("All classes should be integers from the " "range [0, {}], but got the following " "classes instead {}".format( self.n_classes - 1, target_classes)) weights = [] iterator = zip(target_classes, self.prototypes_per_class) for target_class, n_prototypes in iterator: is_valid_class = (target_train[:, 0] == target_class) is_valid_class = is_valid_class.astype('float64') n_samples_per_class = sum(is_valid_class) is_valid_class /= n_samples_per_class if n_samples_per_class <= n_prototypes: raise ValueError("Input data has {0} samples for class-{1}" ". Number of samples per specified " "class-{1} should be greater than {2}." 
"".format(n_samples_per_class, target_class, n_prototypes)) class_weight_indeces = np.random.choice( np.arange(n_input_samples), n_prototypes, replace=False, p=is_valid_class) class_weight = input_train[class_weight_indeces] weights.extend(class_weight) self.weight = np.array(weights) self.initialized = True super(LVQ, self).train(input_train, target_train, *args, **kwargs) def train_epoch(self, input_train, target_train): weight = self.weight subclass_to_class = self.subclass_to_class n_correct_predictions = 0 for input_row, target in zip(input_train, target_train): step = self.training_step output = euclid_distance(input_row, weight) winner_subclass = int(output.argmin()) predicted_class = subclass_to_class[winner_subclass] weight_update = input_row - weight[winner_subclass, :] is_correct_prediction = (predicted_class == target) if is_correct_prediction: weight[winner_subclass, :] += step * weight_update else: weight[winner_subclass, :] -= step * weight_update n_correct_predictions += is_correct_prediction self.n_updates += 1 n_samples = len(input_train) return 1 - n_correct_predictions / n_samples
class GRU(BaseRNNLayer): """ Gated Recurrent Unit (GRU) Layer. Parameters ---------- {BaseRNNLayer.size} input_weights : Initializer, ndarray Weight parameters for input connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. hidden_weights : Initializer, ndarray Weight parameters for hidden connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. bias : Initializer, ndarray Bias parameters for all gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import tensorflow as tf dict( resetgate=tf.nn.sigmoid, updategate=tf.nn.sigmoid, hidden_update=tf.tanh, ) If application requires modification to only one parameter then it's better to specify the one that you need to modify and ignore other parameters .. code-block:: python dict(resetgate=tf.tanh) Other parameters like ``updategate`` or ``hidden_update`` will be equal to their default values. learn_init : bool If ``True``, make ``hidden_init`` trainable variable. Defaults to ``False``. hidden_init : array-like, Tensorfow variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. {BaseRNNLayer.only_return_final} backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False``. unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. 
code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.GRU(20), layers.Sigmoid(1), ] ) """ input_weights = ParameterProperty(default=init.HeNormal()) hidden_weights = ParameterProperty(default=init.HeNormal()) biases = ParameterProperty(default=init.Constant(0)) activation_functions = MultiCallableProperty(default=dict( resetgate=tf.nn.sigmoid, updategate=tf.nn.sigmoid, hidden_update=tf.tanh, )) learn_init = Property(default=False, expected_type=bool) hidden_init = ParameterProperty(default=init.Constant(0)) backwards = Property(default=False, expected_type=bool) unroll_scan = Property(default=False, expected_type=bool) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(GRU, self).initialize() n_inputs = np.prod(self.input_shape[1:]) self.input_weights = self.add_parameter( value=self.input_weights, name='input_weights', shape=(n_inputs, 3 * self.size), ) self.hidden_weights = self.add_parameter( value=self.hidden_weights, name='hidden_weights', shape=(self.size, 3 * self.size), ) self.biases = self.add_parameter( value=self.biases, name='biases', shape=(3 * self.size, ), ) self.add_parameter(value=self.hidden_init, shape=(1, self.size), name="hidden_init", trainable=self.learn_init) def output(self, input_value): # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = tf.transpose(input_value, [1, 0, 2]) input_shape = tf.shape(input_value) n_batch = input_shape[1] # Create single recurrent computation step function # input_n is the n'th vector of the input def one_gru_step(states, input_n): with tf.name_scope('gru-cell'): hid_previous, = states input_n = tf.matmul(input_n, self.input_weights) + self.biases # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, # and W_{hc} h_{t - 1} hid_input = tf.matmul(hid_previous, self.hidden_weights) if self.gradient_clipping != 0: input_n = clip_gradient(input_n, self.gradient_clipping) hid_input = clip_gradient(hid_input, self.gradient_clipping) hid_resetgate, hid_updategate, hid_hidden = tf.split(hid_input, 3, axis=1) in_resetgate, in_updategate, in_hidden = tf.split(input_n, 3, axis=1) # Reset and update gates resetgate = self.activation_functions.resetgate(hid_resetgate + in_resetgate) updategate = self.activation_functions.updategate( hid_updategate + in_updategate) # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) hidden_update = in_hidden + resetgate * hid_hidden if self.gradient_clipping != 0: hidden_update = clip_gradient(hidden_update, self.gradient_clipping) hidden_update = self.activation_functions.hidden_update( hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t return [ hid_previous - updategate * (hid_previous - hidden_update) ] hidden_init = tf.tile(self.hidden_init, (n_batch, 1)) sequence = input_value if self.backwards: sequence = tf.reverse(sequence, axis=[0]) if self.unroll_scan: # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan(fn=one_gru_step, sequence=sequence, outputs_info=[hidden_init]) else: hid_out, = tf.scan( fn=one_gru_step, elems=input_value, initializer=[hidden_init], name='gru-scan', ) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # if scan is backward reverse the output if self.backwards: hid_out = 
tf.reverse(hid_out, axis=[0]) # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = tf.transpose(hid_out, [1, 0, 2]) return hid_out
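# NumPy sketch of ``one_gru_step`` from above (gradient clipping omitted).
# The three chunks of the concatenated weights correspond to the reset gate,
# update gate and hidden update, in that order.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x_t, h_prev, W_in, W_hid, b):
    in_reset, in_update, in_hidden = np.split(x_t.dot(W_in) + b, 3, axis=-1)
    hid_reset, hid_update, hid_hidden = np.split(h_prev.dot(W_hid), 3, axis=-1)

    resetgate = sigmoid(hid_reset + in_reset)
    updategate = sigmoid(hid_update + in_update)
    hidden_update = np.tanh(in_hidden + resetgate * hid_hidden)

    return h_prev - updategate * (h_prev - hidden_update)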
class BaseOptimizer(BaseNetwork): """ Gradient descent algorithm. Parameters ---------- network : list, tuple or LayerConnection instance Network's architecture. There are a few ways to define it. - List of layers. For instance, ``[Input(2), Tanh(4), Relu(1)]``. - Constructed layers. For instance, ``Input(2) >> Tanh(4) >> Relu(1)``. regularizer : function or None Network's regularizer. loss : str or function Error/loss function. Defaults to ``mse``. - ``mae`` - Mean Absolute Error. - ``mse`` - Mean Squared Error. - ``rmse`` - Root Mean Squared Error. - ``msle`` - Mean Squared Logarithmic Error. - ``rmsle`` - Root Mean Squared Logarithmic Error. - ``categorical_crossentropy`` - Categorical cross entropy. - ``binary_crossentropy`` - Binary cross entropy. - ``binary_hinge`` - Binary hinge entropy. - ``categorical_hinge`` - Categorical hinge entropy. - Custom function which accepts two mandatory arguments. The first one is expected value and the second one is predicted value. Example: .. code-block:: python def custom_func(expected, predicted): return expected - predicted step : float, Variable Learning rate, defaults to ``0.1``. {BaseNetwork.show_epoch} {BaseNetwork.shuffle_data} {BaseNetwork.signals} {BaseNetwork.verbose} Attributes ---------- {BaseNetwork.Attributes} Methods ------- {BaseSkeleton.predict} train(X_train, y_train, X_test=None, y_test=None, epochs=100) Train network. You can control network's training procedure with ``epochs`` parameter. The ``X_test`` and ``y_test`` should be presented both in case network's validation required after each training epoch. {BaseSkeleton.fit} """ step = ScalarVariableProperty(default=0.1) target = Property(default=None, allow_none=True) regularizer = Property(default=None, allow_none=True) loss = FunctionWithOptionsProperty(default='mse', choices={ 'mae': objectives.mae, 'mse': objectives.mse, 'rmse': objectives.rmse, 'msle': objectives.msle, 'rmsle': objectives.rmsle, 'binary_crossentropy': objectives.binary_crossentropy, 'categorical_crossentropy': objectives.categorical_crossentropy, 'binary_hinge': objectives.binary_hinge, 'categorical_hinge': objectives.categorical_hinge, }) def __init__(self, network, options=None, **kwargs): options = options or kwargs if isinstance(network, (list, tuple)): network = layers.join(*network) self.network = network if len(self.network.output_layers) != 1: n_outputs = len(network.output_layers) raise InvalidConnection("Connection should have one output " "layer, got {}".format(n_outputs)) target = options.get('target') if target is not None and isinstance(target, (list, tuple)): options['target'] = tf.placeholder(tf.float32, shape=target) self.target = self.network.targets super(BaseOptimizer, self).__init__(**options) start_init_time = time.time() self.logs.message("TENSORFLOW", "Initializing Tensorflow variables and functions.") self.variables = AttributeKeyDict() self.functions = AttributeKeyDict() self.network.outputs self.init_functions() self.logs.message( "TENSORFLOW", "Initialization finished successfully. 
It took {:.2f} seconds" "".format(time.time() - start_init_time)) def init_train_updates(self): raise NotImplementedError() def init_functions(self): loss = self.loss(self.target, self.network.outputs) val_loss = self.loss(self.target, self.network.training_outputs) if self.regularizer is not None: loss += self.regularizer(self.network) self.variables.update( step=self.step, loss=loss, val_loss=val_loss, ) with tf.name_scope('training-updates'): update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): training_updates = self.init_train_updates() training_updates.extend(update_ops) tf_utils.initialize_uninitialized_variables() self.functions.update( predict=tf_utils.function(inputs=as_tuple(self.network.inputs), outputs=self.network.outputs, name='optimizer/predict'), one_training_update=tf_utils.function( inputs=as_tuple(self.network.inputs, self.target), outputs=loss, updates=training_updates, name='optimizer/one-update-step'), score=tf_utils.function(inputs=as_tuple(self.network.inputs, self.target), outputs=val_loss, name='optimizer/score')) def format_input(self, X): X = as_tuple(X) X_formatted = [] if len(X) != len(self.network.input_layers): raise ValueError("Number of inputs doesn't match number " "of input layers in the network.") for input, input_layer in zip(X, self.network.input_layers): input_shape = tf.TensorShape(input_layer.input_shape) is_feature1d = (input_shape.ndims == 2 and input_shape[1] == 1) formatted_input = format_data(input, is_feature1d=is_feature1d) if (formatted_input.ndim + 1) == input_shape.ndims: # We assume that when one dimension was missed than user # wants to propagate single sample through the network formatted_input = np.expand_dims(formatted_input, axis=0) X_formatted.append(formatted_input) return X_formatted def format_target(self, y): output_shape = tf.TensorShape(self.network.output_shape) is_feature1d = (output_shape.ndims == 2 and output_shape[1] == 1) formatted_target = format_data(y, is_feature1d=is_feature1d) if (formatted_target.ndim + 1) == len(output_shape): # We assume that when one dimension was missed than user # wants to propagate single sample through the network formatted_target = np.expand_dims(formatted_target, axis=0) return formatted_target def score(self, X, y): """ Calculate prediction accuracy for input data. Parameters ---------- X : array-like y : array-like Returns ------- float Prediction error. """ X = self.format_input(X) y = self.format_target(y) return self.functions.score(*as_tuple(X, y)) def predict(self, *X, **kwargs): """ Makes a raw prediction. Parameters ---------- X : array-like Returns ------- array-like """ default_batch_size = getattr(self, 'batch_size', None) predict_kwargs = dict( batch_size=kwargs.pop('batch_size', default_batch_size), verbose=self.verbose, ) # We require do to this check for python 2 compatibility if kwargs: raise TypeError("Unknown arguments: {}".format(kwargs)) return self.network.predict(*self.format_input(X), **predict_kwargs) def train(self, X_train, y_train, X_test=None, y_test=None, *args, **kwargs): is_test_data_partialy_missing = ( (X_test is None and y_test is not None) or (X_test is not None and y_test is None)) if is_test_data_partialy_missing: raise ValueError("Input or target test samples are missed. 
They " "must be defined together or none of them.") X_train = self.format_input(X_train) y_train = self.format_target(y_train) if X_test is not None: X_test = self.format_input(X_test) y_test = self.format_target(y_test) return super(BaseOptimizer, self).train(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, *args, **kwargs) def one_training_update(self, X_train, y_train): return self.functions.one_training_update(*as_tuple(X_train, y_train)) def get_params(self, deep=False, with_network=True): params = super(BaseOptimizer, self).get_params() if with_network: params['network'] = self.network return params def __reduce__(self): parameters = self.get_params(with_network=False) # We only need to know placeholders shape # in order to be able to reconstruct it parameters['target'] = tf_utils.shape_to_tuple( parameters['target'].shape) args = (self.network, parameters) return (self.__class__, args) def __repr__(self): return "{}({}, {})".format(self.__class__.__name__, self.network, self.repr_options())
class BaseNetwork(BaseSkeleton): """ Base class for Neural Network algorithms. Parameters ---------- step : float Learning rate, defaults to ``0.1``. show_epoch : int or str This property controls how often the network will display information about training. There are two main syntaxes for this property. You can describe it as positive integer number and it will describe how offen would you like to see summary output in terminal. For instance, number `100` mean that network will show you summary in 100, 200, 300 ... epochs. String value should be in a specific format. It should contain the number of times that the output will be displayed in the terminal. The second part is just a syntax word ``time`` or ``times`` just to make text readable. For instance, value ``'2 times'`` mean that the network will show output twice with approximately equal period of epochs and one additional output would be after the finall epoch. Defaults to ``1``. shuffle_data : bool If it's ``True`` class shuffles all your training data before training your network, defaults to ``True``. epoch_end_signal : function Calls this function when train epoch finishes. train_end_signal : function Calls this function when train process finishes. {Verbose.verbose} Attributes ---------- errors : ErrorHistoryList Contains list of training errors. This object has the same properties as list and in addition there are three additional useful methods: `last`, `previous` and `normalized`. train_errors : ErrorHistoryList Alias to `errors` attribute. validation_errors : ErrorHistoryList The same as `errors` attribute, but it contains only validation errors. last_epoch : int Value equals to the last trained epoch. After initialization it is equal to ``0``. """ step = NumberProperty(default=0.1, minval=0) show_epoch = ShowEpochProperty(minval=1, default=1) shuffle_data = Property(default=False, expected_type=bool) epoch_end_signal = Property(expected_type=types.FunctionType) train_end_signal = Property(expected_type=types.FunctionType) def __init__(self, *args, **options): self.errors = self.train_errors = ErrorHistoryList() self.validation_errors = ErrorHistoryList() self.training = AttributeKeyDict() self.last_epoch = 0 super(BaseNetwork, self).__init__(*args, **options) self.init_properties() if self.verbose: show_network_options(self, highlight_options=options) def init_properties(self): """ Setup default values before populate the options. """ def predict(self, input_data): """ Return prediction results for the input data. Output result includes post-processing step related to the final layer that transforms output to convenient format for end-use. Parameters ---------- input_data : array-like Returns ------- array-like """ def on_epoch_start_update(self, epoch): """ Function would be trigger before run all training procedure related to the current epoch. Parameters ---------- epoch : int Current epoch number. """ self.last_epoch = epoch def train_epoch(self, input_train, target_train=None): raise NotImplementedError() def prediction_error(self, input_test, target_test): raise NotImplementedError() def train(self, input_train, target_train=None, input_test=None, target_test=None, epochs=100, epsilon=None, summary_type='table'): """ Method train neural network. Parameters ---------- input_train : array-like target_train : array-like or Npne input_test : array-like or None target_test : array-like or None epochs : int Defaults to `100`. epsilon : float or None Defaults to ``None``. 
""" show_epoch = self.show_epoch logs = self.logs training = self.training = AttributeKeyDict() if epochs <= 0: raise ValueError("Number of epochs needs to be greater than 0.") if epsilon is not None and epochs <= 2: raise ValueError("Network should train at teast 3 epochs before " "check the difference between errors") if summary_type == 'table': logging_info_about_the_data(self, input_train, input_test) logging_info_about_training(self, epochs, epsilon) logs.newline() summary = SummaryTable( table_builder=table.TableBuilder( table.Column(name="Epoch #"), table.NumberColumn(name="Train err"), table.NumberColumn(name="Valid err"), table.TimeColumn(name="Time", width=10), stdout=logs.write ), network=self, delay_limit=1., delay_history_length=10, ) elif summary_type == 'inline': summary = InlineSummary(network=self) else: raise ValueError("`{}` is unknown summary type" "".format(summary_type)) iterepochs = create_training_epochs_iterator(self, epochs, epsilon) show_epoch = parse_show_epoch_property(self, epochs, epsilon) training.show_epoch = show_epoch # Storring attributes and methods in local variables we prevent # useless __getattr__ call a lot of times in each loop. # This variables speed up loop in case on huge amount of # iterations. training_errors = self.errors validation_errors = self.validation_errors shuffle_data = self.shuffle_data train_epoch = self.train_epoch epoch_end_signal = self.epoch_end_signal train_end_signal = self.train_end_signal on_epoch_start_update = self.on_epoch_start_update is_first_iteration = True can_compute_validation_error = (input_test is not None) last_epoch_shown = 0 with logs.disable_user_input(): for epoch in iterepochs: validation_error = np.nan epoch_start_time = time.time() on_epoch_start_update(epoch) if shuffle_data: input_train, target_train = shuffle(input_train, target_train) try: train_error = train_epoch(input_train, target_train) if can_compute_validation_error: validation_error = self.prediction_error(input_test, target_test) training_errors.append(train_error) validation_errors.append(validation_error) epoch_finish_time = time.time() training.epoch_time = epoch_finish_time - epoch_start_time if epoch % training.show_epoch == 0 or is_first_iteration: summary.show_last() last_epoch_shown = epoch if epoch_end_signal is not None: epoch_end_signal(self) is_first_iteration = False except StopNetworkTraining as err: # TODO: This notification breaks table view in terminal. # I need to show it in a different way. logs.message("TRAIN", "Epoch #{} stopped. {}" "".format(epoch, str(err))) break if epoch != last_epoch_shown: summary.show_last() if train_end_signal is not None: train_end_signal(self) summary.finish() logs.newline() logs.message("TRAIN", "Trainig finished")
class A(Configurable): property_a = Property()
class C(B): property_c = Property()
class LSTM(BaseRNNLayer):
    """
    Long Short Term Memory (LSTM) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    weights : dict or Initializer
        Weight parameters for different gates.
        Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`.

        - If the application requires the same initialization method
          for all weights, then it's possible to specify an
          initialization method that would be automatically applied
          to all weight parameters in the LSTM layer.

          .. code-block:: python

              layers.LSTM(2, weights=init.Normal(0.1))

        - If the application requires different initialization values
          for different weights, then it's possible to specify an
          exact weight by name.

          .. code-block:: python

              dict(
                  weight_in_to_ingate=init.XavierUniform(),
                  weight_hid_to_ingate=init.XavierUniform(),
                  weight_cell_to_ingate=init.XavierUniform(),

                  weight_in_to_forgetgate=init.XavierUniform(),
                  weight_hid_to_forgetgate=init.XavierUniform(),
                  weight_cell_to_forgetgate=init.XavierUniform(),

                  weight_in_to_outgate=init.XavierUniform(),
                  weight_hid_to_outgate=init.XavierUniform(),
                  weight_cell_to_outgate=init.XavierUniform(),

                  weight_in_to_cell=init.XavierUniform(),
                  weight_hid_to_cell=init.XavierUniform(),
              )

          If the application requires modifying only one (or a few)
          parameters, then it's better to specify only those and
          leave the others with their default values.

          .. code-block:: python

              dict(weight_in_to_ingate=init.Normal(0.1))

          Other parameters like ``weight_cell_to_outgate`` will be
          equal to their default values.

    biases : dict or Initializer
        Bias parameters for different gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

        - If the application requires the same initialization method
          for all biases, then it's possible to specify an
          initialization method that would be automatically applied
          to all bias parameters in the LSTM layer.

          .. code-block:: python

              layers.LSTM(2, biases=init.Constant(1))

        - If the application requires different initialization values
          for different biases, then it's possible to specify an
          exact bias by name.

          .. code-block:: python

              dict(
                  bias_ingate=init.Constant(0),
                  bias_forgetgate=init.Constant(0),
                  bias_cell=init.Constant(0),
                  bias_outgate=init.Constant(0),
              )

          If the application requires modifying only one (or a few)
          parameters, then it's better to specify only those and
          leave the others with their default values.

          .. code-block:: python

              dict(bias_ingate=init.Constant(1))

          Other parameters like ``bias_cell`` will be equal to their
          default values.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import theano.tensor as T
            dict(
                ingate=T.nnet.sigmoid,
                forgetgate=T.nnet.sigmoid,
                outgate=T.nnet.sigmoid,
                cell=T.tanh,
            )

        If the application requires modifying only one parameter,
        then it's better to specify only that one and leave the
        others with their default values.

        .. code-block:: python

            dict(ingate=T.tanh)

        Other parameters like ``forgetgate`` or ``outgate`` will be
        equal to their default values.

    learn_init : bool
        If ``True``, make ``cell_init`` and ``hid_init`` trainable
        variables. Defaults to ``False``.

    cell_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial cell state (:math:`c_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    hid_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    backwards : bool
        If ``True``, process the sequence backwards and then reverse
        the output again such that the output from the layer is
        always from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    {BaseRNNLayer.only_return_final}

    precompute_input : bool
        If ``True``, precompute ``input_to_hid`` before iterating
        through the sequence. This can result in a speed up at the
        expense of an increase in memory usage.
        Defaults to ``True``.

    peepholes : bool
        If ``True``, the LSTM uses peephole connections.
        When ``False``, cell parameters are ignored.
        Defaults to ``False``.

    unroll_scan : bool
        If ``True``, the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e., cannot
        be given as ``None``). Defaults to ``False``.

    gradient_clipping : float or int
        If nonzero, the gradient messages are clipped to the given
        value during the backward pass. Defaults to ``0``.

    n_gradient_steps : int
        Number of timesteps to include in the backpropagated
        gradient. If ``-1``, backpropagate through the entire
        sequence. Defaults to ``-1``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.LSTM(20),
                layers.Sigmoid(1),
            ]
        )
    """
    weights = MultiParameterProperty(
        default=dict(
            weight_in_to_ingate=init.XavierUniform(),
            weight_hid_to_ingate=init.XavierUniform(),
            weight_cell_to_ingate=init.XavierUniform(),

            weight_in_to_forgetgate=init.XavierUniform(),
            weight_hid_to_forgetgate=init.XavierUniform(),
            weight_cell_to_forgetgate=init.XavierUniform(),

            weight_in_to_outgate=init.XavierUniform(),
            weight_hid_to_outgate=init.XavierUniform(),
            weight_cell_to_outgate=init.XavierUniform(),

            weight_in_to_cell=init.XavierUniform(),
            weight_hid_to_cell=init.XavierUniform(),
        ))
    biases = MultiParameterProperty(
        default=dict(
            bias_ingate=init.Constant(0),
            bias_forgetgate=init.Constant(0),
            bias_cell=init.Constant(0),
            bias_outgate=init.Constant(0),
        ))
    activation_functions = MultiCallableProperty(
        default=dict(
            ingate=T.nnet.sigmoid,
            forgetgate=T.nnet.sigmoid,
            outgate=T.nnet.sigmoid,
            cell=T.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    cell_init = ParameterProperty(default=init.Constant(0))
    hid_init = ParameterProperty(default=init.Constant(0))

    unroll_scan = Property(default=False, expected_type=bool)
    backwards = Property(default=False, expected_type=bool)
    precompute_input = Property(default=True, expected_type=bool)
    peepholes = Property(default=False, expected_type=bool)

    n_gradient_steps = IntProperty(default=-1)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(LSTM, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])
        weights = self.weights
        biases = self.biases

        # Input gate parameters
        self.weight_in_to_ingate = self.add_parameter(
            value=weights.weight_in_to_ingate,
            name='weight_in_to_ingate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_ingate = self.add_parameter(
            value=weights.weight_hid_to_ingate,
            name='weight_hid_to_ingate',
            shape=(self.size, self.size))
        self.bias_ingate = self.add_parameter(
            value=biases.bias_ingate, name='bias_ingate',
            shape=(self.size,))

        # Forget gate parameters
        self.weight_in_to_forgetgate = self.add_parameter(
            value=weights.weight_in_to_forgetgate,
            name='weight_in_to_forgetgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_forgetgate = self.add_parameter(
            value=weights.weight_hid_to_forgetgate,
            name='weight_hid_to_forgetgate',
            shape=(self.size, self.size))
        self.bias_forgetgate = self.add_parameter(
            value=biases.bias_forgetgate, name='bias_forgetgate',
            shape=(self.size,))

        # Cell parameters
        self.weight_in_to_cell = self.add_parameter(
            value=weights.weight_in_to_cell,
            name='weight_in_to_cell',
            shape=(n_inputs, self.size))
        self.weight_hid_to_cell = self.add_parameter(
            value=weights.weight_hid_to_cell,
            name='weight_hid_to_cell',
            shape=(self.size, self.size))
        self.bias_cell = self.add_parameter(
            value=biases.bias_cell, name='bias_cell',
            shape=(self.size,))

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections. These are elementwise products with the
        # cell state, so they are represented as vectors.
        if self.peepholes:
            self.weight_cell_to_ingate = self.add_parameter(
                value=weights.weight_cell_to_ingate,
                name='weight_cell_to_ingate',
                shape=(self.size,))
            self.weight_cell_to_forgetgate = self.add_parameter(
                value=weights.weight_cell_to_forgetgate,
                name='weight_cell_to_forgetgate',
                shape=(self.size,))
            self.weight_cell_to_outgate = self.add_parameter(
                value=weights.weight_cell_to_outgate,
                name='weight_cell_to_outgate',
                shape=(self.size,))

        # Output gate parameters
        self.weight_in_to_outgate = self.add_parameter(
            value=weights.weight_in_to_outgate,
            name='weight_in_to_outgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_outgate = self.add_parameter(
            value=weights.weight_hid_to_outgate,
            name='weight_hid_to_outgate',
            shape=(self.size, self.size))
        self.bias_outgate = self.add_parameter(
            value=biases.bias_outgate, name='bias_outgate',
            shape=(self.size,))

        # Initialization parameters
        self.add_parameter(value=self.cell_init, shape=(1, self.size),
                           name="cell_init", trainable=self.learn_init)
        self.add_parameter(value=self.hid_init, shape=(1, self.size),
                           name="hid_init", trainable=self.learn_init)

    def output(self, input_value):
        # Treat all dimensions after the second as flattened
        # feature dimensions
        if input_value.ndim > 3:
            input_value = T.flatten(input_value, 3)

        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = input_value.dimshuffle(1, 0, 2)
        seq_len, n_batch, _ = input_value.shape

        # Stack input weight matrices into a (num_inputs, 4 * num_units)
        # matrix, which speeds up computation
        weight_in_stacked = T.concatenate([
            self.weight_in_to_ingate,
            self.weight_in_to_forgetgate,
            self.weight_in_to_cell,
            self.weight_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        weight_hid_stacked = T.concatenate([
            self.weight_hid_to_ingate,
            self.weight_hid_to_forgetgate,
            self.weight_hid_to_cell,
            self.weight_hid_to_outgate], axis=1)

        # Stack biases into a (4 * num_units) vector
        bias_stacked = T.concatenate([
            self.bias_ingate,
            self.bias_forgetgate,
            self.bias_cell,
            self.bias_outgate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the dot product of the inputs and the weight
            # matrices before scanning.
            # weight_in_stacked is (n_features, 4 * num_units).
            # Input: (n_time_steps, n_batch, 4 * num_units).
            input_value = T.dot(input_value, weight_in_stacked) + bias_stacked

        # When theano.scan calls step, input_n will be
        # (n_batch, 4 * num_units). We define a slicing function
        # that extracts the input to each LSTM gate.
        def slice_w(x, n):
            return x[:, n * self.size:(n + 1) * self.size]

        def one_lstm_step(input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, weight_in_stacked) + bias_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, weight_hid_stacked)

            # Clip gradients
            if self.gradient_clipping:
                gates = theano.gradient.grad_clip(
                    gates,
                    -self.gradient_clipping,
                    self.gradient_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.weight_cell_to_ingate
                forgetgate += cell_previous * self.weight_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.activation_functions.ingate(ingate)
            forgetgate = self.activation_functions.forgetgate(forgetgate)
            cell_input = self.activation_functions.cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.weight_cell_to_outgate

            outgate = self.activation_functions.outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * T.tanh(cell)
            return [cell, hid]

        ones = T.ones((n_batch, 1))
        cell_init = T.dot(ones, self.cell_init)
        hid_init = T.dot(ones, self.hid_init)

        non_sequences = [weight_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_sequences += [weight_in_stacked, bias_stacked]

        # The "peephole" weight matrices are only used
        # when self.peepholes=True
        if self.peepholes:
            non_sequences += [self.weight_cell_to_ingate,
                              self.weight_cell_to_forgetgate,
                              self.weight_cell_to_outgate]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            n_time_steps = self.input_shape[0]

            # Explicitly unroll the recurrence instead of using scan
            _, hid_out = unroll_scan(
                fn=one_lstm_step,
                sequences=[input_value],
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                n_steps=n_time_steps)

        else:
            (_, hid_out), _ = theano.scan(
                fn=one_lstm_step,
                sequences=input_value,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.n_gradient_steps,
                non_sequences=non_sequences,
                strict=True)

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # If scan is backward, reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
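

# The NumPy sketch below mirrors the arithmetic of ``one_lstm_step`` for a
# single time step, with peephole connections disabled and the default
# activation functions. It is an illustrative example, not part of the
# library; the slicing order matches the stacked-weight layout used above
# (ingate, forgetgate, cell, outgate).
def _numpy_lstm_step_example(x_t, h_prev, c_prev, weight_in_stacked,
                             weight_hid_stacked, bias_stacked, n_units):
    import numpy as np

    def sigmoid(x):
        return 1. / (1. + np.exp(-x))

    # Pre-activations for all four gates at once, exactly like the
    # stacked matrices inside ``output``
    gates = x_t.dot(weight_in_stacked) + bias_stacked
    gates += h_prev.dot(weight_hid_stacked)

    ingate = sigmoid(gates[:, 0 * n_units:1 * n_units])
    forgetgate = sigmoid(gates[:, 1 * n_units:2 * n_units])
    cell_input = np.tanh(gates[:, 2 * n_units:3 * n_units])
    outgate = sigmoid(gates[:, 3 * n_units:4 * n_units])

    # c_t = f_t * c_{t-1} + i_t * \tilde{c}_t
    c_t = forgetgate * c_prev + ingate * cell_input
    # h_t = o_t * tanh(c_t)
    h_t = outgate * np.tanh(c_t)
    return c_t, h_t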
class A(Configurable): correct_property = Property()
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    weights : dict or Initializer
        Weight parameters for different gates.
        Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`.

        - If the application requires the same initialization method
          for all weights, then it's possible to specify an
          initialization method that would be automatically applied
          to all weight parameters in the GRU layer.

          .. code-block:: python

              layers.GRU(2, weights=init.Normal(0.1))

        - If the application requires different initialization values
          for different weights, then it's possible to specify an
          exact weight by name.

          .. code-block:: python

              dict(
                  weight_in_to_updategate=init.XavierUniform(),
                  weight_hid_to_updategate=init.XavierUniform(),

                  weight_in_to_resetgate=init.XavierUniform(),
                  weight_hid_to_resetgate=init.XavierUniform(),

                  weight_in_to_hidden_update=init.XavierUniform(),
                  weight_hid_to_hidden_update=init.XavierUniform(),
              )

          If the application requires modifying only one (or a few)
          parameters, then it's better to specify only those and
          leave the others with their default values.

          .. code-block:: python

              dict(weight_in_to_updategate=init.Normal(0.1))

          Other parameters like ``weight_in_to_resetgate`` will be
          equal to their default values.

    biases : dict or Initializer
        Bias parameters for different gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

        - If the application requires the same initialization method
          for all biases, then it's possible to specify an
          initialization method that would be automatically applied
          to all bias parameters in the GRU layer.

          .. code-block:: python

              layers.GRU(2, biases=init.Constant(1))

        - If the application requires different initialization values
          for different biases, then it's possible to specify an
          exact bias by name.

          .. code-block:: python

              dict(
                  bias_updategate=init.Constant(0),
                  bias_resetgate=init.Constant(0),
                  bias_hidden_update=init.Constant(0),
              )

          If the application requires modifying only one (or a few)
          parameters, then it's better to specify only those and
          leave the others with their default values.

          .. code-block:: python

              dict(bias_resetgate=init.Constant(1))

          Other parameters like ``bias_updategate`` will be equal to
          their default values.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import theano.tensor as T
            dict(
                resetgate=T.nnet.sigmoid,
                updategate=T.nnet.sigmoid,
                hidden_update=T.tanh,
            )

        If the application requires modifying only one parameter,
        then it's better to specify only that one and leave the
        others with their default values.

        .. code-block:: python

            dict(resetgate=T.tanh)

        Other parameters like ``updategate`` or ``hidden_update``
        will be equal to their default values.

    learn_init : bool
        If ``True``, make ``hid_init`` a trainable variable.
        Defaults to ``False``.

    hid_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    {BaseRNNLayer.only_return_final}

    backwards : bool
        If ``True``, process the sequence backwards and then reverse
        the output again such that the output from the layer is
        always from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    precompute_input : bool
        If ``True``, precompute ``input_to_hid`` before iterating
        through the sequence. This can result in a speed up at the
        expense of an increase in memory usage.
        Defaults to ``True``.

    unroll_scan : bool
        If ``True``, the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e., cannot
        be given as ``None``). Defaults to ``False``.

    gradient_clipping : float or int
        If nonzero, the gradient messages are clipped to the given
        value during the backward pass. Defaults to ``0``.

    n_gradient_steps : int
        Number of timesteps to include in the backpropagated
        gradient. If ``-1``, backpropagate through the entire
        sequence. Defaults to ``-1``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.GRU(20),
                layers.Sigmoid(1),
            ]
        )
    """
    weights = MultiParameterProperty(
        default=dict(
            weight_in_to_updategate=init.XavierUniform(),
            weight_hid_to_updategate=init.XavierUniform(),

            weight_in_to_resetgate=init.XavierUniform(),
            weight_hid_to_resetgate=init.XavierUniform(),

            weight_in_to_hidden_update=init.XavierUniform(),
            weight_hid_to_hidden_update=init.XavierUniform(),
        ))
    biases = MultiParameterProperty(
        default=dict(
            bias_updategate=init.Constant(0),
            bias_resetgate=init.Constant(0),
            bias_hidden_update=init.Constant(0),
        ))
    activation_functions = MultiCallableProperty(
        default=dict(
            resetgate=T.nnet.sigmoid,
            updategate=T.nnet.sigmoid,
            hidden_update=T.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    hid_init = ParameterProperty(default=init.Constant(0))

    backwards = Property(default=False, expected_type=bool)
    unroll_scan = Property(default=False, expected_type=bool)
    precompute_input = Property(default=True, expected_type=bool)

    n_gradient_steps = IntProperty(default=-1)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(GRU, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])
        weights = self.weights
        biases = self.biases

        # Update gate parameters
        self.weight_in_to_updategate = self.add_parameter(
            value=weights.weight_in_to_updategate,
            name='weight_in_to_updategate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_updategate = self.add_parameter(
            value=weights.weight_hid_to_updategate,
            name='weight_hid_to_updategate',
            shape=(self.size, self.size))
        self.bias_updategate = self.add_parameter(
            value=biases.bias_updategate, name='bias_updategate',
            shape=(self.size,))

        # Reset gate parameters
        self.weight_in_to_resetgate = self.add_parameter(
            value=weights.weight_in_to_resetgate,
            name='weight_in_to_resetgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_resetgate = self.add_parameter(
            value=weights.weight_hid_to_resetgate,
            name='weight_hid_to_resetgate',
            shape=(self.size, self.size))
        self.bias_resetgate = self.add_parameter(
            value=biases.bias_resetgate, name='bias_resetgate',
            shape=(self.size,))

        # Hidden update gate parameters
        self.weight_in_to_hidden_update = self.add_parameter(
            value=weights.weight_in_to_hidden_update,
            name='weight_in_to_hidden_update',
            shape=(n_inputs, self.size))
        self.weight_hid_to_hidden_update = self.add_parameter(
            value=weights.weight_hid_to_hidden_update,
            name='weight_hid_to_hidden_update',
            shape=(self.size, self.size))
        self.bias_hidden_update = self.add_parameter(
            value=biases.bias_hidden_update, name='bias_hidden_update',
            shape=(self.size,))

        self.add_parameter(value=self.hid_init, shape=(1, self.size),
                           name="hid_init", trainable=self.learn_init)

    def output(self, input_value):
        # Treat all dimensions after the second as flattened
        # feature dimensions
        if input_value.ndim > 3:
            input_value = T.flatten(input_value, 3)

        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = input_value.dimshuffle(1, 0, 2)
        seq_len, n_batch, _ = input_value.shape

        # Stack input weight matrices into a (num_inputs, 3 * num_units)
        # matrix, which speeds up computation
        weight_in_stacked = T.concatenate([
            self.weight_in_to_updategate,
            self.weight_in_to_resetgate,
            self.weight_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        weight_hid_stacked = T.concatenate([
            self.weight_hid_to_updategate,
            self.weight_hid_to_resetgate,
            self.weight_hid_to_hidden_update], axis=1)

        # Stack biases into a (3 * num_units) vector
        bias_stacked = T.concatenate([
            self.bias_updategate,
            self.bias_resetgate,
            self.bias_hidden_update], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the dot product of the inputs and the weight
            # matrices before scanning.
            # weight_in_stacked is (n_features, 3 * num_units).
            # Input: (n_time_steps, n_batch, 3 * num_units).
            input_value = T.dot(input_value, weight_in_stacked) + bias_stacked

        # When theano.scan calls step, input_n will be
        # (n_batch, 3 * num_units). We define a slicing function
        # that extracts the input to each GRU gate.
        def slice_w(x, n):
            s = x[:, n * self.size:(n + 1) * self.size]
            if self.size == 1:
                # Theano cannot infer this by itself
                s = T.addbroadcast(s, 1)
            return s

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def one_gru_step(input_n, hid_previous, *args):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
            # and W_{hc} h_{t - 1}
            hid_input = T.dot(hid_previous, weight_hid_stacked)

            if self.gradient_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n,
                    -self.gradient_clipping,
                    self.gradient_clipping)

                hid_input = theano.gradient.grad_clip(
                    hid_input,
                    -self.gradient_clipping,
                    self.gradient_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u,
                # and W_{xc}x_t + b_c
                input_n = T.dot(input_n, weight_in_stacked) + bias_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            resetgate = self.activation_functions.resetgate(resetgate)

            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            updategate = self.activation_functions.updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid

            if self.gradient_clipping:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update,
                    -self.gradient_clipping,
                    self.gradient_clipping)

            hidden_update = self.activation_functions.hidden_update(
                hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update
            return hid

        hid_init = T.dot(T.ones((n_batch, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_sequences = [weight_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_sequences += [weight_in_stacked, bias_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            n_time_steps = self.input_shape[0]

            # Explicitly unroll the recurrence instead of using scan
            hid_out, = unroll_scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                n_steps=n_time_steps)
        else:
            # Scan op iterates over first dimension of input and
            # repeatedly applies the step function
            hid_out, _ = theano.scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                truncate_gradient=self.n_gradient_steps,
                strict=True)

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # If scan is backward, reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
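

# The NumPy sketch below mirrors the update rule implemented in
# ``one_gru_step`` for a single time step. It is an illustrative example,
# not part of the library, and it uses separate (unstacked) weight matrices
# for readability instead of the concatenated ones used above.
def _numpy_gru_step_example(x_t, h_prev, weights, biases):
    # ``weights`` and ``biases`` are plain dicts with the same key names as
    # the layer parameters, e.g. weights['weight_in_to_resetgate'] has
    # shape (n_inputs, n_units).
    import numpy as np

    def sigmoid(x):
        return 1. / (1. + np.exp(-x))

    # r_t = sigmoid(W_{xr} x_t + W_{hr} h_{t-1} + b_r)
    resetgate = sigmoid(
        x_t.dot(weights['weight_in_to_resetgate'])
        + h_prev.dot(weights['weight_hid_to_resetgate'])
        + biases['bias_resetgate'])

    # u_t = sigmoid(W_{xu} x_t + W_{hu} h_{t-1} + b_u)
    updategate = sigmoid(
        x_t.dot(weights['weight_in_to_updategate'])
        + h_prev.dot(weights['weight_hid_to_updategate'])
        + biases['bias_updategate'])

    # c_t = tanh(W_{xc} x_t + r_t * (W_{hc} h_{t-1}) + b_c)
    hidden_update = np.tanh(
        x_t.dot(weights['weight_in_to_hidden_update'])
        + resetgate * h_prev.dot(weights['weight_hid_to_hidden_update'])
        + biases['bias_hidden_update'])

    # h_t = (1 - u_t) * h_{t-1} + u_t * c_t
    return (1 - updategate) * h_prev + updategate * hidden_update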
class B(Configurable): property_b = Property()