Example #1
 def __init__(self, params):
     self.params = params
     cell = GRUCell(units=self.params['rnn_size'])
     self.rnn = RNN(cell, return_state=True, return_sequences=True)
     self.bn = BatchNormalization()
     cell2 = GRUCell(units=self.params['rnn_size2'])
     self.rnn2 = RNN(cell2, return_state=True, return_sequences=True)
     self.dense = Dense(units=self.params['dense_size'])
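These constructor snippets wrap a GRUCell in a Keras RNN layer but do not show how it is called. A minimal, self-contained sketch of the same pattern (TensorFlow 2.x is assumed; the shapes and unit counts are placeholders, not the author's params dict):

import tensorflow as tf
from tensorflow.keras.layers import GRUCell, RNN, Dense

cell = GRUCell(units=16)
rnn = RNN(cell, return_state=True, return_sequences=True)

x = tf.random.normal([4, 10, 8])        # (batch, time, features)
outputs, final_state = rnn(x)           # outputs: (4, 10, 16), final_state: (4, 16)
projected = Dense(units=1)(outputs)     # per-timestep projection: (4, 10, 1)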
Example #2
            def __init__(self):
                super().__init__()

                # TODO(lemmatizer_noattn): Define
                # - source_embeddings as a masked embedding layer of source chars into args.cle_dim dimensions
                self.source_embeddings = Embedding(num_source_chars, args.cle_dim, mask_zero=True)

                # TODO: Define
                # - source_rnn as a bidirectional GRU with args.rnn_dim units, returning _whole sequences_, summing opposite directions
                self.source_rnn = Bidirectional(GRU(args.rnn_dim, return_sequences=True), merge_mode='sum')

                # TODO(lemmatizer_noattn): Define
                # - target_embedding as an unmasked embedding layer of target chars into args.cle_dim dimensions
                # - target_rnn_cell as a GRUCell with args.rnn_dim units
                # - target_output_layer as a Dense layer into `num_target_chars`
                self.target_embeddings = Embedding(num_target_chars, args.cle_dim)
                self.target_rnn_cell = GRUCell(args.rnn_dim)
                self.target_output_layer = Dense(num_target_chars, activation=None)

                # TODO: Define
                # - attention_source_layer as a Dense layer with args.rnn_dim outputs
                # - attention_state_layer as a Dense layer with args.rnn_dim outputs
                # - attention_weight_layer as a Dense layer with 1 output
                self.attention_source_layer = Dense(args.rnn_dim)
                self.attention_state_layer = Dense(args.rnn_dim)
                self.attention_weight_layer = Dense(1)
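The three attention layers above are the building blocks of additive (Bahdanau-style) attention. A hedged sketch of how such layers are typically combined (the function name and tensor shapes are assumptions, not part of the assignment code):

import tensorflow as tf

def additive_attention(source_encoded, decoder_state,
                       attention_source_layer, attention_state_layer,
                       attention_weight_layer):
    # source_encoded: (batch, src_len, rnn_dim); decoder_state: (batch, rnn_dim)
    keys = attention_source_layer(source_encoded)              # (batch, src_len, rnn_dim)
    query = attention_state_layer(decoder_state)[:, None, :]   # (batch, 1, rnn_dim)
    scores = attention_weight_layer(tf.tanh(keys + query))     # (batch, src_len, 1)
    weights = tf.nn.softmax(scores, axis=1)                    # normalize over source positions
    return tf.reduce_sum(weights * source_encoded, axis=1)     # context vector: (batch, rnn_dim)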
Example #3
def make_network():
    inputs = Input(shape=(image_size, image_size, 1), name='input_data')
    state = Input(shape=(hidden_size, ), name='state')
    x = Conv2D(32,
               kernel_size=(3, 3),
               strides=(2, 2),
               padding='same',
               activation='relu')(inputs)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(32, kernel_size=(3, 3), strides=(2, 2), activation='relu')(x)
    # x = MaxPooling2D(pool_size=(2, 2))(x)
    # x = Conv2D(32, kernel_size=(3, 3), strides=(2, 2), activation='relu')(x)

    # print(x.shape)
    x = Flatten()(x)
    x, new_state = GRUCell(hidden_size)(x, state)

    action_out = Dense(7, activation='softmax', name='action')(x)
    # x = Concatenate()([x, action_out])
    value_out = Dense(1, activation='linear', name='reward')(x)

    model = Model(inputs=[inputs, state],
                  outputs=[action_out, value_out, new_state])
    # model.compile(optimizer='adam', loss=[loss_function, 'mse'])
    # model.summary()
    return model
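Because the GRU state is exposed as a separate model input and output here, the caller must thread it through manually. A hypothetical usage sketch (image_size=84 and hidden_size=256 are assumed module-level values, not taken from the original):

import numpy as np

image_size, hidden_size = 84, 256          # assumed values used by make_network()
model = make_network()

frames = np.zeros((1, image_size, image_size, 1), dtype=np.float32)
state = np.zeros((1, hidden_size), dtype=np.float32)    # initial recurrent state
action_probs, value, new_state = model.predict([frames, state])
# On the next step, pass new_state back in to carry the recurrent context forward.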
Example #4
 def __init__(self, lr=1e-4, dropout_rate=0.2, units=300, beam_width=12, vocab_size=9000):
     super().__init__()
     self.embedding = tf.Variable(np.load('../data/embedding.npy'),
                                  dtype=tf.float32,
                                  name='pretrained_embedding',
                                  trainable=False)
     self.encoder = Encoder(units=units, dropout_rate=dropout_rate)
     self.attention_mechanism = BahdanauAttention(units=units)
     self.decoder_cell = AttentionWrapper(
         GRUCell(units),
         self.attention_mechanism,
         attention_layer_size=units)
     self.projected_layer = ProjectedLayer(self.embedding)
     self.sampler = tfa.seq2seq.sampler.TrainingSampler()
     self.decoder = BasicDecoder(
         self.decoder_cell,
         self.sampler,
         output_layer=self.projected_layer)
     self.beam_search = BeamSearchDecoder(
         self.decoder_cell,
         beam_width=beam_width,
         embedding_fn=lambda x: tf.nn.embedding_lookup(self.embedding, x),
         output_layer=self.projected_layer)
     self.vocab_size = vocab_size
     self.optimizer = Adam(lr)
     self.accuracy = tf.keras.metrics.Accuracy()
     self.mean = tf.keras.metrics.Mean()
     self.decay_lr = tf.optimizers.schedules.ExponentialDecay(lr, 1000, 0.95)
     self.logger = logging.getLogger('tensorflow')
     self.logger.setLevel(logging.INFO)
Example #5
 def __init__(self, params):
     self.params = params
     self.rnn_cell = GRUCell(self.params['rnn_size'])
     self.dense = Dense(units=1)
     self.attention = Attention(hidden_size=32,
                                num_heads=2,
                                attention_dropout=0.8)
Example #6
    def __init__(self, encoder=None, vocab_size=1, embedding_size=32, hidden_size=64):
        """Initialization method.

        Args:
            encoder (IntegerEncoder): An index to vocabulary encoder.
            vocab_size (int): The size of the vocabulary.
            embedding_size (int): The size of the embedding layer.
            hidden_size (int): The number of hidden neurons.

        """

        logger.info('Overriding class: Generator -> GRUGenerator.')

        # Overrides its parent class with any custom arguments if needed
        super(GRUGenerator, self).__init__(name='G_gru')

        # Creates a property for holding the used encoder
        self.encoder = encoder

        # Creates an embedding layer
        self.embedding = Embedding(
            vocab_size, embedding_size, name='embedding')

        # Creates a GRU cell
        self.cell = GRUCell(hidden_size, name='gru')

        # Creates the RNN loop itself
        self.rnn = RNN(self.cell, name='rnn_layer',
                       return_sequences=True,
                       stateful=True)

        # Creates the linear (Dense) layer
        self.linear = Dense(vocab_size, name='out')

        logger.info('Class overrided.')
Example #7
 def __init__(self, params):
     self.params = params
     self.rnn_cell = GRUCell(self.params['rnn_size'])
     self.rnn = RNN(self.rnn_cell, return_state=True, return_sequences=True)
     self.dense = Dense(units=1)
     self.attention = Attention(hidden_size=32,
                                num_heads=2,
                                attention_dropout=0.8)
Example #8
    def __init__(self, units, dropout, output_steps, output_size):
        super().__init__()
        self.output_steps = output_steps
        self.units = units
        self.lstm_cell = GRUCell(units, dropout=dropout)  # a GRU cell, despite the attribute name

        self.lstm_rnn = RNN(self.lstm_cell, return_state=True)
        self.dense = Dense(output_size, activation="softmax")
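A cell kept alongside an RNN wrapper like this is commonly used for autoregressive decoding: the RNN warms up on the input window, and the bare cell is then stepped manually, feeding each prediction back in. A sketch of that loop (assuming output_size matches the input feature dimension; the function is illustrative, not the author's call method):

import tensorflow as tf

def autoregressive_rollout(rnn, cell, dense, inputs, output_steps):
    # Warm up on the input window; rnn is RNN(cell, return_state=True).
    prediction, state = rnn(inputs)
    prediction = dense(prediction)                 # (batch, output_size)
    predictions = [prediction]
    # Feed each prediction back into the bare cell for the remaining steps.
    for _ in range(1, output_steps):
        prediction, state = cell(prediction, states=state)
        prediction = dense(prediction)
        predictions.append(prediction)
    return tf.stack(predictions, axis=1)           # (batch, output_steps, output_size)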
Example #9
 def __init__(self, params):
     self.params = params
     self.rnn_cell = GRUCell(self.params['hidden_size'])
     self.rnn = RNN(self.rnn_cell, return_state=True, return_sequences=True)
     self.dense = Dense(units=self.params['inputdim'])
     self.attention = Attention(
         hidden_size=self.params['hidden_size'],
         num_heads=self.params['num_heads'],
         attention_dropout=self.params['attention_dropout'])
Example #10
 def __init__(self, num_classes, hparams):
     super().__init__()
     self.state_size = hparams.hidden_dim
     self.hidden_dim = hparams.hidden_dim
     self.decoder_rnn = GRUCell(hparams.hidden_dim,
                                recurrent_initializer='glorot_uniform')
     self.dense_layer = Dense(num_classes)
     self.attention = LocationSensitiveAttention(hparams)
     self.prev_context, self.prev_align = None, None
     self.alignments = []
Example #11
    def __init__(self, K, conv_dim):
        super(CBHG, self).__init__()
        self.K = K
        self.conv_bank = []
        for k in range(1, self.K + 1):
            x = Conv1D(128, kernel_size=k, padding='same')
            self.conv_bank.append(x)

        self.bn = BatchNormalization()
        self.conv1 = Conv1D(conv_dim[0], kernel_size=3, padding='same')
        self.bn1 = BatchNormalization()
        self.conv2 = Conv1D(conv_dim[1], kernel_size=3, padding='same')
        self.bn2 = BatchNormalization()

        self.proj = Dense(128)
        self.dense1 = Dense(128)
        self.dense2 = Dense(128,
                            bias_initializer=tf.constant_initializer(-1.0))

        self.gru_fw = GRUCell(128)
        self.gru_bw = GRUCell(128)
Example #12
    def init_layers(self,
                    param_units=10,
                    tensor_units=5,
                    global_units=5,
                    init_lr=(1e-6, 1e-2),
                    timescales=1,
                    epsilon=1e-10,
                    momentum_decay_bias_init=logit(0.9),
                    variance_decay_bias_init=logit(0.999),
                    use_gradient_shortcut=True,
                    **kwargs):
        """Initialize layers."""
        assert (init_lr[0] > 0 and init_lr[1] > 0 and epsilon > 0)
        self.timescales = timescales
        self.init_lr = init_lr
        self.epsilon = epsilon

        # Parameter, Tensor, & Global RNNs (may have different size)
        self.param_rnn = GRUCell(param_units, name="param_rnn", **kwargs)
        self.tensor_rnn = GRUCell(tensor_units, name="tensor_rnn", **kwargs)
        self.global_rnn = GRUCell(global_units, name="global_rnn", **kwargs)

        # Parameter change
        self.d_theta = Dense(1,
                             input_shape=(param_units, ),
                             name="d_theta",
                             kernel_initializer="zeros")
        # Learning rate change
        self.delta_nu = Dense(1,
                              input_shape=(param_units, ),
                              name="delta_nu",
                              kernel_initializer="zeros")
        # Momentum decay rate
        self.beta_g = Dense(1,
                            input_shape=(param_units, ),
                            kernel_initializer="zeros",
                            bias_initializer=tf.constant_initializer(
                                value=momentum_decay_bias_init),
                            activation="sigmoid",
                            name="beta_g")
        # Variance/scale decay rate
        self.beta_lambda = Dense(1,
                                 input_shape=(param_units, ),
                                 kernel_initializer="zeros",
                                 bias_initializer=tf.constant_initializer(
                                     value=variance_decay_bias_init),
                                 activation="sigmoid",
                                 name="beta_lambda")
        # Momentum shortcut
        if use_gradient_shortcut:
            self.gradient_shortcut = Dense(1,
                                           input_shape=(timescales, ),
                                           name="gradient_shortcut",
                                           kernel_initializer="zeros")
        else:
            self.gradient_shortcut = None

        # Gamma parameter
        # Stored as a logit - the actual gamma used will be sigmoid(gamma)
        self.gamma = tf.Variable(tf.zeros(()), trainable=True, name="gamma")
Example #13
 def __init__(self, rnn_utils=32, rnn_layer_num=2, dense_num=3, embedding_size=32):
     super(LSPD, self).__init__()
     self.input_dim = None
     self.time_step = 0
     self.sample_size = 0
     self.rnn_utils = rnn_utils
     self.gru_cell_left = []
     self.gru_cell_right = []
     self.batchnormals = []
     self.denses = []
     self.embedding = None
     self.embedding_size = embedding_size
     self.denses_batchnormal = []
     self.rnn_layer_num = rnn_layer_num
     for i in range(rnn_layer_num):
         self.gru_cell_left.append(GRUCell(units=rnn_utils, activation='relu'))
         self.gru_cell_right.append(GRUCell(units=rnn_utils, activation='relu'))
         self.batchnormals.append(BatchNormalization())
     for i in range(dense_num-1):
         self.denses.append(Dense(units=32, activation='relu'))
         self.denses_batchnormal.append(BatchNormalization())
     self.dense = Dense(units=11, activation='softmax')
Example #14
def test_gru_cell():
    n_inputs = 3
    n_units = 4
    batch_size = 1
    inputs = tx.Input(n_units=n_inputs)

    gru0 = tx.GRUCell(inputs,
                      n_units,
                      activation=tf.tanh,
                      gate_activation=tf.sigmoid)

    # Applies the gate after the matrix multiplication and uses recurrent
    # biases, which makes it compatible with the cuDNN implementation.
    gru1 = GRUCell(n_units,
                   activation='tanh',
                   recurrent_activation='sigmoid',
                   reset_after=False,
                   implementation=1,
                   use_bias=True)

    assert not hasattr(gru1, "kernel")

    state0 = [s() for s in gru0.previous_state]
    # get_initial_state from Keras returns either a tuple or a single state
    # (see test_rnn_cell), but the __call__ API requires an iterable.
    state1 = gru1.get_initial_state(inputs, batch_size=1)

    assert tx.tensor_equal(state1, state0[0])

    inputs.value = tf.ones([batch_size, n_inputs])

    res1 = gru1(inputs, state0)
    res1_ = gru1(inputs, state0)

    for r1, r2 in zip(res1, res1_):
        assert tx.tensor_equal(r1, r2)

    # the only difference is that keras kernels are fused together
    kernel = tf.concat([w.weights.value() for w in gru0.layer_state.w],
                       axis=-1)
    recurrent_kernel = tf.concat([u.weights for u in gru0.layer_state.u],
                                 axis=-1)
    bias = tf.concat([w.bias for w in gru0.layer_state.w], axis=-1)

    assert tx.same_shape(kernel, gru1.kernel)
    assert tx.same_shape(recurrent_kernel, gru1.recurrent_kernel)
    assert tx.same_shape(bias, gru1.bias)

    gru1.kernel = kernel
    gru1.recurrent_kernel = recurrent_kernel
    gru1.bias = bias

    res2 = gru1(inputs, state0)
    for i in range(len(res1)):
        assert not tx.tensor_equal(res1[i], res2[i])
    res0 = gru0()
    # res0_ = gru0.state[0]()
    assert tx.tensor_equal(res0, res2[0])
Example #15
    def __init__(self, name='gru', **kwargs):
        super(GRU, self).__init__(name=name, **kwargs)

        self.input_layer = InputLayer((28, 28, 1),
                                      name='{}_input'.format(name))

        # Sequence of 28 elements of size 28.
        self.reshape = Reshape([28, 28], name='{}_reshape'.format(name))
        self.rnn = RNN(input_shape=[28, 28],
                       cell=GRUCell(units=128, name='{}_rnn_gru'.format(name)),
                       name='{}_rnn'.format(name))

        self.dense = Dense(10,
                           activation=tf.nn.softmax,
                           name='{}_output'.format(name))
Example #16
            def __init__(self):
                super().__init__()

                self.source_embeddings = Embedding(num_source_chars,
                                                   args.cle_dim,
                                                   mask_zero=True)
                self.source_rnn = Bidirectional(GRU(args.rnn_dim,
                                                    return_sequences=True),
                                                merge_mode='sum')

                self.target_embeddings = Embedding(num_target_chars,
                                                   args.cle_dim)
                self.target_rnn_cell = GRUCell(args.rnn_dim)
                self.target_output_layer = Dense(num_target_chars,
                                                 activation=None)

                self.attention_source_layer = Dense(args.rnn_dim)
                self.attention_state_layer = Dense(args.rnn_dim)
                self.attention_weight_layer = Dense(1)
Example #17
 def build(self, input_shape):
     assert len(input_shape) >= 2
     self.kernel = self.add_weight(
         name="kernel",
         shape=(self.n_layers, self.channels, self.channels),
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint,
     )
     self.rnn = GRUCell(
         self.channels,
         kernel_initializer=self.kernel_initializer,
         bias_initializer=self.bias_initializer,
         kernel_regularizer=self.kernel_regularizer,
         bias_regularizer=self.bias_regularizer,
         activity_regularizer=self.activity_regularizer,
         kernel_constraint=self.kernel_constraint,
         bias_constraint=self.bias_constraint,
         use_bias=self.use_bias,
     )
     self.built = True
Example #18
            def __init__(self):
                super().__init__()

                # TODO: Define
                # - source_embeddings as a masked embedding layer of source chars into args.cle_dim dimensions
                # - source_rnn as a bidirectional GRU with args.rnn_dim units, returning only the last state, summing opposite directions
                self.source_embeddings = Embedding(num_source_chars,
                                                   args.cle_dim,
                                                   mask_zero=True)
                self.source_rnn = Bidirectional(GRU(args.rnn_dim,
                                                    return_sequences=False),
                                                merge_mode='sum')

                # - target_embedding as an unmasked embedding layer of target chars into args.cle_dim dimensions
                # - target_rnn_cell as a GRUCell with args.rnn_dim units
                # - target_output_layer as a Dense layer into `num_target_chars`
                self.target_embeddings = Embedding(num_target_chars,
                                                   args.cle_dim)
                self.target_rnn_cell = GRUCell(args.rnn_dim)
                self.target_output_layer = Dense(num_target_chars,
                                                 activation=None)
Example #19
    def __init__(
        self,
        vocab_size: Optional[int] = 1,
        embedding_size: Optional[int] = 32,
        hidden_size: Optional[int] = 64,
    ):
        """Initialization method.

        Args:
            vocab_size: Vocabulary size.
            embedding_size: Embedding layer units.
            hidden_size: Hidden layer units.

        """

        logger.info("Overriding class: Base -> GRU.")

        super(GRU, self).__init__(name="gru")

        # Embedding layer
        self.embedding = Embedding(vocab_size,
                                   embedding_size,
                                   name="embedding")

        # GRU cell
        self.cell = GRUCell(hidden_size, name="gru_cell")

        # RNN layer
        self.rnn = RNN(self.cell, name="rnn_layer", return_sequences=True)

        # Linear (dense) layer
        self.fc = Dense(vocab_size, name="out")

        logger.info("Class overrided.")
        logger.debug(
            "Embedding: %d | Hidden: %d | Output: %d.",
            embedding_size,
            hidden_size,
            vocab_size,
        )
Example #20
    def build(self, input_shape):
        assert len(input_shape) >= 2
        F = input_shape[0][1]
        if F > self.channels:
            raise ValueError('channels ({}) must be greater than the number of '
                             'input features ({}).'.format(self.channels, F))

        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.n_layers, self.channels, self.channels),
                                      initializer=self.kernel_initializer,
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        self.rnn = GRUCell(self.channels,
                           kernel_initializer=self.kernel_initializer,
                           bias_initializer=self.bias_initializer,
                           kernel_regularizer=self.kernel_regularizer,
                           bias_regularizer=self.bias_regularizer,
                           activity_regularizer=self.activity_regularizer,
                           kernel_constraint=self.kernel_constraint,
                           bias_constraint=self.bias_constraint,
                           use_bias=self.use_bias)
        self.built = True
Example #21
 def __init__(self, params):
     self.params = params
     self.rnn_cell = GRUCell(units=self.params['hidden_size'])
     self.rnn = RNN(self.rnn_cell, return_state=True, return_sequences=True)
     self.dense = Dense(units=self.params['inputdim'])
Example #22
 def __init__(self, params):
     self.params = params
     cell = GRUCell(units=self.params['rnn_size'])
     self.rnn = RNN(cell, return_state=True, return_sequences=True)
     self.dense = Dense(units=1)
Example #23
 def gru_cell(self):
     return GRUCell(num_units=self.hidden_dim)
Example #24
class ScaleHierarchicalOptimizer(BaseHierarchicalPolicy):
    """Hierarchical optimizer.

    Described in
    "Learned Optimizers that Scale and Generalize" (Wichrowska et. al, 2017)

    Keyword Args
    ------------
    param_units : int
        Number of hidden units for parameter RNN.
    tensor_units : int
        Number of hidden units for tensor RNN.
    global_units : int
        Number of hidden units for global RNN.
    init_lr : float[2]
        Learning rate initialization range. Actual learning rate values are
        IID exp(unif(log(init_lr))).
    timescales : int
        Number of timescales to compute momentum for.
    epsilon : float
        Denominator epsilon for normalization operation in case input is 0.
    momentum_decay_bias_init : float
        Constant initializer for EMA momentum decay rate logit beta_g. Should
        correspond to beta_1 in an Adam teacher.
    variance_decay_bias_init : float
        Constant initializer for EMA variance decay rate logit beta_lambda.
        Should correspond to beta_2 in an Adam teacher.
    use_gradient_shortcut : bool
        Whether to add a shortcut connection that applies a linear transformation
        of the momentum at various timescales and adds it to the direction output.
    name : str
        Name of the optimizer network.
    **kwargs : dict
        Passed onto tf.keras.layers.GRUCell
    """

    default_name = "ScaleHierarchicalOptimizer"

    def init_layers(self,
                    param_units=10,
                    tensor_units=5,
                    global_units=5,
                    init_lr=(1e-6, 1e-2),
                    timescales=1,
                    epsilon=1e-10,
                    momentum_decay_bias_init=logit(0.9),
                    variance_decay_bias_init=logit(0.999),
                    use_gradient_shortcut=True,
                    **kwargs):
        """Initialize layers."""
        assert (init_lr[0] > 0 and init_lr[1] > 0 and epsilon > 0)
        self.timescales = timescales
        self.init_lr = init_lr
        self.epsilon = epsilon

        # Parameter, Tensor, & Global RNNs (may have different size)
        self.param_rnn = GRUCell(param_units, name="param_rnn", **kwargs)
        self.tensor_rnn = GRUCell(tensor_units, name="tensor_rnn", **kwargs)
        self.global_rnn = GRUCell(global_units, name="global_rnn", **kwargs)

        # Parameter change
        self.d_theta = Dense(1,
                             input_shape=(param_units, ),
                             name="d_theta",
                             kernel_initializer="zeros")
        # Learning rate change
        self.delta_nu = Dense(1,
                              input_shape=(param_units, ),
                              name="delta_nu",
                              kernel_initializer="zeros")
        # Momentum decay rate
        self.beta_g = Dense(1,
                            input_shape=(param_units, ),
                            kernel_initializer="zeros",
                            bias_initializer=tf.constant_initializer(
                                value=momentum_decay_bias_init),
                            activation="sigmoid",
                            name="beta_g")
        # Variance/scale decay rate
        self.beta_lambda = Dense(1,
                                 input_shape=(param_units, ),
                                 kernel_initializer="zeros",
                                 bias_initializer=tf.constant_initializer(
                                     value=variance_decay_bias_init),
                                 activation="sigmoid",
                                 name="beta_lambda")
        # Momentum shortcut
        if use_gradient_shortcut:
            self.gradient_shortcut = Dense(1,
                                           input_shape=(timescales, ),
                                           name="gradient_shortcut",
                                           kernel_initializer="zeros")
        else:
            self.gradient_shortcut = None

        # Gamma parameter
        # Stored as a logit - the actual gamma used will be sigmoid(gamma)
        self.gamma = tf.Variable(tf.zeros(()), trainable=True, name="gamma")

    def call_global(self, states, global_state, training=False):
        """Equation 12.

        Global RNN. Inputs are prepared (except for final mean) in ``call``.
        """
        # [1, units] -> [num tensors, 1, units] -> [1, units]
        inputs = tf.reduce_mean(
            tf.stack([state["tensor"] for state in states]), 0)
        global_state_new, _ = self.global_rnn(inputs, global_state)
        return global_state_new

    def _new_momentum_variance(self, grads, states, states_new):
        """Equation 1, 2, 3, 13.

        Helper function for scaled momentum update
        """
        # Base decay
        # Eq 13
        # [var size, 1] -> [*var shape]
        shape = tf.shape(grads)
        beta_g = tf.reshape(self.beta_g(states["param"]), shape)
        beta_lambda = tf.reshape(self.beta_lambda(states["param"]), shape)

        # New momentum, variance
        # Eq 1, 2
        states_new["scaling"] = [
            rms_momentum(grads,
                         g_bar,
                         lambda_,
                         beta_1=beta_g**(0.5**s),
                         beta_2=beta_lambda**(0.5**s))
            for s, (g_bar, lambda_) in enumerate(states["scaling"])
        ]

        # Scaled momentum
        _m = [
            g_bar / tf.sqrt(lambda_ + self.epsilon)
            for g_bar, lambda_ in states_new["scaling"]
        ]

        # m_t: [timescales, *var shape] -> [var size, timescales]
        return tf.transpose(tf.reshape(tf.stack(_m), [self.timescales, -1]))

    def _relative_log_gradient_magnitude(self, states, states_new):
        """Equation 4.

        Helper function for relative log gradient magnitudes
        """
        log_lambdas = tf.math.log(
            tf.stack([lambda_ for g_bar, lambda_ in states_new["scaling"]]) +
            self.epsilon)
        _gamma = log_lambdas - tf.reduce_mean(log_lambdas, axis=0)

        # gamma_t: [timescales, *var shape] -> [var size, timescales]
        return tf.transpose(tf.reshape(_gamma, [self.timescales, -1]))

    def _parameterized_change(self, param, states, states_new, m):
        """Equation 5, 7, 8.

        Helper function for parameter change explicitly parameterized into
        direction and learning rate

        Notes
        -----
        (1) Direction is no longer explicitly parameterized, as specified by
            appendix D.3 in Wichrowska et al.
        (2) A shortcut connection is included as per appendix B.1.
        """
        # New learning rate
        # Eq 7, 8
        d_eta = tf.reshape(self.delta_nu(states_new["param"]), tf.shape(param))
        eta = d_eta + states["eta_bar"]
        sg = tf.nn.sigmoid(self.gamma)
        states_new["eta_bar"] = (sg * states["eta_bar"] + (1 - sg) * eta)

        # Relative log learning rate
        # Eq Unnamed, end of sec 3.2.4
        states_new["eta_rel"] = tf.reshape(eta - tf.math.reduce_mean(eta),
                                           [-1, 1])

        # Direction
        # Eq 5, using the update given in Appendix D.3
        d_theta = self.d_theta(states_new["param"])

        if self.gradient_shortcut:
            d_theta += self.gradient_shortcut(m)

        return tf.exp(eta) * tf.reshape(d_theta, tf.shape(param))

    def call(self, param, grads, states, global_state, training=False):
        """Optimizer Update.

        Notes
        -----
        The state indices in Wichrowska et al. are incorrect, and should be:
        (1) g_bar^n, lambda^n = EMA(g_bar^n-1, g^n), EMA(lambda^n-1, g^n)
            instead of EMA(..., g^n-1), etc
        (2) h^n = RNN(x^n, h^n-1) instead of h^n+1 = RNN(x^n, h^n)
        Then, the g^n -> g_bar^n, lambda^n -> m^n -> h^n -> d^n data flow
        occurs within the same step instead of across 2 steps. This fix is
        reflected in the original Scale code.

        In order to reduce state size, the state update computation is split:
        (1) Compute beta_g, beta_lambda, m.
        (2) Update Parameter & Tensor RNN.
        (3) Compute eta, d. This step only depends on the parameter RNN,
            so the Global RNN being updated after this does not matter.
        (4) Update Global RNN.
        eta_rel is the only "transient" (i.e. not RNN hidden states, momentum,
        variance, learning rate) product stored in the optimizer state.
        """
        states_new = {}

        # Prerequisites ("Momentum and variance at various timescales")
        # Eq 1, 2, 3, 13
        m = self._new_momentum_variance(grads, states, states_new)

        # Eq 4
        gamma = self._relative_log_gradient_magnitude(states, states_new)

        # Param RNN
        # inputs = [var size, features]
        param_in = tf.concat(
            [
                # x^n:
                m,
                gamma,
                states["eta_rel"],
                # h_tensor: [1, hidden size] -> [var size, hidden size]
                tf.tile(states["tensor"], [tf.size(param), 1]),
                # h_global: [1, hidden size] -> [var size, hidden size]
                tf.tile(global_state, [tf.size(param), 1]),
            ],
            1)

        # RNN Update
        # Eq 10
        states_new["param"], _ = self.param_rnn(param_in, states["param"])
        # Eq 11
        tensor_in = tf.concat([
            tf.math.reduce_mean(states_new["param"], 0, keepdims=True),
            global_state
        ], 1)
        states_new["tensor"], _ = self.tensor_rnn(tensor_in, states["tensor"])

        # Eq 5, 7, 8
        delta_theta = self._parameterized_change(param, states, states_new, m)

        return delta_theta, states_new

    def get_initial_state(self, var):
        """Get initial model state as a dictionary."""
        batch_size = tf.size(var)

        return {
            "scaling": [(tf.zeros(tf.shape(var)), tf.zeros(tf.shape(var)))
                        for s in range(self.timescales)],
            "param":
            self.param_rnn.get_initial_state(batch_size=batch_size,
                                             dtype=tf.float32),
            "tensor":
            self.tensor_rnn.get_initial_state(batch_size=1, dtype=tf.float32),
            "eta_bar":
            tf.random.uniform(shape=tf.shape(var),
                              minval=tf.math.log(self.init_lr[0]),
                              maxval=tf.math.log(self.init_lr[1])),
            "eta_rel":
            tf.zeros([batch_size, 1]),
        }

    def get_initial_state_global(self):
        """Initialize global hidden state."""
        return self.global_rnn.get_initial_state(batch_size=1,
                                                 dtype=tf.float32)
Example #25
 def __init__(self, params):
     self.params = params
     self.predict_window_sizes = params['predict_sequence_length']
     self.rnn_cell = GRUCell(self.params['rnn_size'])
     self.dense = Dense(units=1)
     self.attention = Attention(hidden_size=32, num_heads=2, attention_dropout=0.8)
Example #26
 def __init__(self, params):
     cell = GRUCell(units=params['rnn_units'])
     self.rnn = RNN(cell, return_state=True, return_sequences=True)
     self.dense = Dense(units=2)
     self.activate = Activate()