def step(self, x, states):
    """Advance the attention decoder by one timestep.

    The previous hidden state is scored against ``self._uxpb`` to obtain
    attention weights over ``self.timesteps`` positions, the weights are used
    to pool ``self.x_seq`` into a context vector, and a gated update (reset
    gate ``r``, update gate ``z``) yields the new hidden state and output.

    :param x: current-step input; not read by this implementation
    :param states: ``[ytm, stm]`` -- previous output and previous hidden state
    :return: ``(output, [yt, st])``; ``output`` is the attention weights when
        ``self.return_probabilities`` is truthy, otherwise ``yt``
    """
    ytm, stm = states

    # Tile the previous hidden state once per timestep and project it.
    tiled_state = K.repeat(stm, self.timesteps)
    projected_state = K.dot(tiled_state, self.W_a)

    # Alignment energies: how much each timestep contributes to this one.
    energy = K.dot(activations.tanh(projected_state + self._uxpb),
                   K.expand_dims(self.V_a))

    # Exponentiate and normalise over the timestep axis.
    exp_energy = K.exp(energy)
    normaliser = K.repeat(K.sum(exp_energy, axis=1), self.timesteps)
    attention = exp_energy / normaliser  # (batchsize, timesteps, 1)

    # Context vector: attention-weighted combination of the input sequence.
    context = K.squeeze(K.batch_dot(attention, self.x_seq, axes=1), axis=1)

    def pre_activation(W, U, C, b, state):
        # Shared affine form: previous output + a state term + context + bias.
        return K.dot(ytm, W) + K.dot(state, U) + K.dot(context, C) + b

    reset_gate = activations.sigmoid(
        pre_activation(self.W_r, self.U_r, self.C_r, self.b_r, stm))
    update_gate = activations.sigmoid(
        pre_activation(self.W_z, self.U_z, self.C_z, self.b_z, stm))

    # The proposal state sees the previous state through the reset gate.
    proposal = activations.tanh(
        pre_activation(self.W_p, self.U_p, self.C_p, self.b_p,
                       reset_gate * stm))

    st = (1 - update_gate) * stm + update_gate * proposal

    # The output is computed from the previous hidden state ``stm``.
    yt = activations.softmax(
        pre_activation(self.W_o, self.U_o, self.C_o, self.b_o, stm))

    output = attention if self.return_probabilities else yt
    return output, [yt, st]
예제 #2
0
    def _step(self,
              xi_t, xf_t, xc_t, xo_t,
              h_tm1, c_tm1,
              u_i, u_f, u_o, u_c):
        """Compute one LSTM step from pre-projected gate inputs.

        ``xi_t``, ``xf_t``, ``xc_t`` and ``xo_t`` are the input contributions
        to the input, forget, candidate and output terms; ``u_*`` are the
        matching recurrent matrices applied to ``h_tm1``.

        :return: ``(h_t, c_t)`` -- the new hidden state and new cell state
        """
        input_gate = hard_sigmoid(xi_t + T.dot(h_tm1, u_i))
        forget_gate = hard_sigmoid(xf_t + T.dot(h_tm1, u_f))
        candidate = tanh(xc_t + T.dot(h_tm1, u_c))
        new_cell = forget_gate * c_tm1 + input_gate * candidate
        output_gate = hard_sigmoid(xo_t + T.dot(h_tm1, u_o))
        new_hidden = output_gate * tanh(new_cell)
        return new_hidden, new_cell
예제 #3
0
def test_tanh():
    """Check the backend ``tanh`` activation against ``np.tanh``."""
    values = get_standard_values()

    # Compile a backend function that applies tanh to a rank-2 placeholder.
    placeholder = K.placeholder(ndim=2)
    tanh_fn = K.function([placeholder], [activations.tanh(placeholder)])

    outputs = tanh_fn([values])
    assert_allclose(outputs[0], np.tanh(values), rtol=1e-05)
            def step_backward(inputs, states):
                """Run one step of the backward-direction LSTM.

                Every projection contracts axis 2 of its left operand with
                axis 0 of the kernel, so ``inputs`` and the hidden state are
                expected to carry at least three axes.

                :param inputs: input tensor for this step
                :param states: ``[h_tm1, c_tm1]`` memory and carry states
                :return: ``(h, [h, c])``
                """
                prev_h = states[0]  # previous memory state
                prev_c = states[1]  # previous carry state
                contract = [[2], [0]]

                def recurrent(kernel):
                    # Project the previous memory state through one kernel.
                    return tf.tensordot(prev_h, kernel, axes=contract)

                # Input projections for the four LSTM terms.
                proj_i = tf.tensordot(inputs, self.kernel_i_backward, axes=contract)
                proj_f = tf.tensordot(inputs, self.kernel_f_backward, axes=contract)
                proj_c = tf.tensordot(inputs, self.kernel_c_backward, axes=contract)
                proj_o = tf.tensordot(inputs, self.kernel_o_backward, axes=contract)

                proj_i = K.bias_add(proj_i, self.bias_i_backward)
                proj_f = K.bias_add(proj_f, self.bias_f_backward)
                proj_c = K.bias_add(proj_c, self.bias_c_backward)
                proj_o = K.bias_add(proj_o, self.bias_o_backward)

                input_gate = activations.hard_sigmoid(
                    proj_i + recurrent(self.recurrent_kernel_i_backward))
                forget_gate = activations.hard_sigmoid(
                    proj_f + recurrent(self.recurrent_kernel_f_backward))
                carry = forget_gate * prev_c + input_gate * activations.tanh(
                    proj_c + recurrent(self.recurrent_kernel_c_backward))
                output_gate = activations.hard_sigmoid(
                    proj_o + recurrent(self.recurrent_kernel_o_backward))
                memory = output_gate * activations.tanh(carry)

                return memory, [memory, carry]
예제 #5
0
    def get_initial_state(self, inputs):
        """Build the initial ``[y0, s0]`` states from the input sequence.

        :param inputs: input tensor, indexed as (samples, timesteps, input_dims)
        :return: ``[y0, s0]`` -- an all-zero tensor tiled to
            ``self.output_dim`` columns, and the tanh projection of the first
            timestep through ``self.W_s``
        """
        print('inputs shape:', inputs.get_shape())

        # s0 comes from applying W_s to the first timestep only.
        first_timestep = inputs[:, 0]
        s0 = activations.tanh(K.dot(first_timestep, self.W_s))

        # y0 is derived from ``inputs`` itself (the trick used in
        # keras.layers.recurrent): collapse a zero tensor to one value per
        # sample, then tile it out to (samples, output_dim).
        zero_per_sample = K.sum(K.zeros_like(inputs), axis=(1, 2))  # (samples, )
        y0 = K.tile(K.expand_dims(zero_per_sample), [1, self.output_dim])

        return [y0, s0]
    def get_initial_state(self, inputs):
        """Return the starting decoder states ``[y0, s0]``.

        ``s0`` is ``tanh(inputs[:, 0] . W_s)``; ``y0`` is a zero tensor with
        one row per sample and ``self.output_dim`` columns.

        :param inputs: input tensor, indexed as (samples, timesteps, input_dims)
        """
        print('inputs shape:', inputs.get_shape())

        # Initial hidden state from the first timestep.
        s0 = K.dot(inputs[:, 0], self.W_s)
        s0 = activations.tanh(s0)

        # Initial output: zeros shaped from the inputs, as done in
        # keras.layers.recurrent.
        zeros = K.zeros_like(inputs)        # (samples, timesteps, input_dims)
        zeros = K.sum(zeros, axis=(1, 2))   # (samples, )
        column = K.expand_dims(zeros)       # (samples, 1)
        y0 = K.tile(column, [1, self.output_dim])

        return [y0, s0]
    def step(self, x, states):
        """Advance the attention decoder by one timestep.

        :param x: current-step input; not read by this implementation
        :param states: ``[y_prev, s_prev]`` -- previous output and hidden state
        :return: ``(output, [y_current, s_current])``; ``output`` is the
            attention weights when ``self.return_attention_weights`` is
            truthy, otherwise ``y_current``
        """
        y_prev, s_prev = states

        # Score the previous hidden state against every timestep.
        s_all = K.repeat(s_prev, self.timesteps)
        Wa_s_all = K.dot(s_all, self.W_a)
        et = K.dot(activations.tanh(Wa_s_all + self.uh),
                   K.expand_dims(self.V_a))

        # Normalise over the timestep axis (axis 1). ``et`` ends in an axis
        # of size 1, so a last-axis softmax would return 1.0 for every
        # timestep and the context would be an unweighted sum. The max is
        # subtracted first for numerical stability.
        et_exp = K.exp(et - K.max(et, axis=1, keepdims=True))
        a_current = et_exp / K.sum(et_exp, axis=1, keepdims=True)

        # Context vector: attention-weighted combination of the inputs.
        context = K.squeeze(K.batch_dot(a_current, self.x_seq, axes=1), axis=1)

        # calculate reset gate
        r_current = activations.sigmoid(
            K.dot(y_prev, self.W_r) + K.dot(s_prev, self.U_r) +
            K.dot(context, self.C_r) + self.b_r)

        # calculate update gate
        z_current = activations.sigmoid(
            K.dot(y_prev, self.W_z) + K.dot(s_prev, self.U_z) +
            K.dot(context, self.C_z) + self.b_z)

        # calculate s tilde (proposal state, gated by the reset gate)
        s_tilde = activations.tanh(
            K.dot(y_prev, self.W_c) + K.dot((r_current * s_prev), self.U_c) +
            K.dot(context, self.C_c) + self.b_c)

        s_current = (1 - z_current) * s_prev + z_current * s_tilde

        # calculate output (sigmoid, computed from the new hidden state)
        y_current = activations.sigmoid(
            K.dot(y_prev, self.W_o) + K.dot(s_current, self.U_o) +
            K.dot(context, self.C_o) + self.b_o)

        if self.return_attention_weights:
            return a_current, [y_current, s_current]
        return y_current, [y_current, s_current]
    def get_initial_state(self, inputs):
        """Build the initial ``[y0, s0, t]`` states from the input sequence.

        :param inputs: input tensor, indexed as (samples, timesteps, input_dims)
        :return: ``[y0, s0, t]`` -- a zero tensor with ``self.output_dim``
            columns, the tanh projection of the first timestep through
            ``self.W_s``, and an int32 backend variable initialised to 0
        """
        # s0: the first timestep projected by W_s.
        s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))

        # y0: zeros shaped from the inputs (as in keras.layers.recurrent),
        # reduced to one value per sample and tiled to (samples, output_dim).
        zero_per_sample = K.sum(K.zeros_like(inputs), axis=(1, 2))  # (samples, )
        y0 = K.tile(K.expand_dims(zero_per_sample), [1, self.output_dim])

        # Counter of decoding timestep (for enforcing causality)
        decode_step = K.variable(0, name='decode_t', dtype='int32')

        return [y0, s0, decode_step]
예제 #9
0
    def step(self, x_input, states):
        """Run one pointer-network step.

        The last entry of ``states`` is the encoder sequence; the remaining
        entries are passed to the parent class's ``step``.

        :return: ``(pointer, [h, c])`` where ``pointer`` is the softmax over
            ``vt * tanh(W1*e + W2*d)``
        """
        seq_shape = self.input_spec[0].shape
        encoder_seq = states[-1]
        _, (h, c) = super(PointerLSTM, self).step(x_input, states[:-1])

        # Repeat the decoder state along the sequence axis so it can be
        # combined with every encoder position.
        decoder_seq = K.repeat(h, seq_shape[1])
        encoder_term = time_distributed_dense(encoder_seq, self.W1, output_dim=1)
        decoder_term = time_distributed_dense(decoder_seq, self.W2, output_dim=1)

        # vt*tanh(W1*e+W2*d), with axis 2 squeezed away
        scores = K.squeeze(self.vt * tanh(encoder_term + decoder_term), 2)

        # make probability tensor
        return softmax(scores), [h, c]
    def _compute_energy(self, stm):
        """Compute the "concat" attention energy for hidden state ``stm``.

        energy_i = g * V / |V| * tanh([stm, h_i] * W + b) + r

        The ``g * V / |V|`` scaling is applied only when
        ``self.normalize_energy`` is truthy, and the offset ``r`` only when
        ``self.is_monotonic`` is truthy.
        """
        projected_state = K.dot(stm, self.W_a)

        if self.normalize_energy:
            energy_vector = self.Energy_g * K.l2_normalize(self.V_a)
        else:
            energy_vector = self.V_a

        # Insert an axis at position 1 so the state term broadcasts against
        # the precomputed ``self._uxpb``.
        hidden = activations.tanh(
            K.expand_dims(projected_state, axis=1) + self._uxpb)
        energy = K.dot(hidden, K.expand_dims(energy_vector))

        if self.is_monotonic:
            energy = energy + self.Energy_r

        return energy
예제 #11
0
 def calc_reduced_value(self, values):
     """Compute the REDUCE result from the two most recent stack vectors.

     Each stack entry is split at ``self.hidden_dim``: the leading columns
     feed the new cell value and the trailing columns feed the gate
     projection. The result is laid out the same way, ``[c | h]``.

     :param values: mapping holding ``'stack_current'`` and ``'stack_prev'``
     :return: ``K.concatenate([c, h], axis=1)``
     """
     d = self.hidden_dim
     top = values['stack_current']
     below = values['stack_prev']

     # Project the concatenated trailing halves of both entries.
     joined = K.concatenate([top[:, d:], below[:, d:]], axis=1)
     projection = K.dot(joined, self.W_R) + self.b_R

     # First 4*d columns are sigmoid gates; the rest is the tanh candidate.
     gates = sigmoid(projection[:, :4 * d])
     candidate = tanh(projection[:, 4 * d:])

     c = (gates[:, d:2 * d] * top[:, :d]
          + gates[:, 2 * d:3 * d] * below[:, :d]
          + gates[:, :d] * candidate)
     h = gates[:, 3 * d:] * c

     return K.concatenate([c, h], axis=1)
 def call(self, inputs):
     """Pool two sequences with attention weights derived from each other.

     A score matrix ``tanh(inputs[0] . kernel . inputs[1]^T)`` is averaged
     over its last axis and over axis 1, each average is exponentiated and
     normalised, and the resulting weights pool the matching input.

     :param inputs: list whose first two entries are the tensors to pool
     :return: ``[output0, output1]``
     """
     assert isinstance(inputs, list)
     swap_last_two = [0, 2, 1]

     projected = K.dot(inputs[0], self.kernel)
     scores = tf.matmul(projected, tf.transpose(inputs[1], perm=swap_last_two))
     scores = activations.tanh(scores)

     # Mean score per row (last axis) and per column (axis 1).
     row_mean = K.mean(scores, axis=-1, keepdims=True)
     col_mean = K.mean(scores, axis=1, keepdims=True)

     # Exponentiate and normalise each set of means along its own axis.
     row_weights = K.exp(row_mean)
     col_weights = K.exp(col_mean)
     row_weights = row_weights / K.sum(row_weights, axis=1, keepdims=True)
     col_weights = col_weights / K.sum(col_weights, axis=-1, keepdims=True)

     pooled0 = tf.matmul(tf.transpose(inputs[0], perm=swap_last_two),
                         row_weights)
     pooled1 = tf.matmul(tf.transpose(inputs[1], perm=swap_last_two),
                         tf.transpose(col_weights, perm=swap_last_two))

     return [tf.transpose(pooled0, perm=swap_last_two),
             tf.transpose(pooled1, perm=swap_last_two)]
예제 #13
0
def multiway_soft_attention_alignment(input_1, input_2, max_len, dim):
    """Align two text representations with four soft-attention variants.

    Attention scores are built four ways: a plain dot product of the two
    inputs, their element-wise difference, their element-wise product, and
    the tanh of their sum. The last three are each reduced by a ``Dense(1)``
    layer and squeezed. Every score tensor is handed to ``weighted`` together
    with both inputs, and the four aligned results per input are concatenated.

    :param input_1: first sequence tensor -- presumably (batch, max_len, dim);
        TODO confirm against callers
    :param input_2: second sequence tensor, same assumed layout as ``input_1``
    :param max_len: repeat count and length passed to ``RepeatVector``
    :param dim: feature size passed in ``RepeatVector``'s ``shape`` argument
    :return: ``(in1_aligned, in2_aligned)``, the concatenated alignments
    """

    # ----- Bilinear attention ----- #
    # NOTE: the Dense(dim) projection of input_2 is disabled below, so this
    # is currently a plain dot product over the last axis.
    # attention = Dot(axes=-1)([input_1,
    #                           Dense(dim)(input_2)])
    attention = Dot(axes=-1)([input_1, input_2])
    bilinear_in1_aligned, bilinear_in2_aligned = weighted([attention, input_1, input_2])
    # ----- Bilinear attention ----- #

    # Repeat each input max_len times (input_1 along axis 2, input_2 along
    # axis 1) so the pairwise operations below can combine them.
    x1 = RepeatVector(n=max_len, axis=2, shape=[-1, max_len, dim])(input_1)
    x2 = RepeatVector(n=max_len, axis=1, shape=[-1, max_len, dim])(input_2)

    # ----- Minus attention ----- #
    attention = Subtract()([x1, x2])
    # attention = Dense(int(dim / 2), activation='tanh')(attention)
    attention = Dense(1)(attention)
    # Debug output: shape before and after dropping the trailing axis.
    print(np.shape(attention))
    attention = Lambda(lambda x: K.squeeze(x, axis=-1), output_shape=squeeze_output_shape)(attention)
    print(np.shape(attention))
    minus_in1_aligned, minus_in2_aligned = weighted([attention, input_1, input_2])
    # ----- Minus attention ----- #

    # ----- Dot attention ----- #
    attention = Multiply()([x1, x2])
    # attention = Dense(int(dim / 2), activation='tanh')(attention)
    attention = Dense(1)(attention)
    attention = Lambda(lambda x: K.squeeze(x, axis=-1), output_shape=squeeze_output_shape)(attention)
    dot_in1_aligned, dot_in2_aligned = weighted([attention, input_1, input_2])
    # ----- Dot attention ----- #

    # ----- Concat attention ----- #
    # The dim/2 projections are disabled; tanh is applied directly to x1 + x2.
    # v1 = Dense(int(dim / 2))(x1)   # (?, 43, 43, dim / 2)
    # v2 = Dense(int(dim / 2))(x2)   # (?, 43, 43, dim / 2)
    # attention = Lambda(lambda x: tanh(x), output_shape=unchanged_shape)(Add()([v1, v2]))   # (?, 43, 43, dim / 2)
    attention = Lambda(lambda x: tanh(x), output_shape=unchanged_shape)(Add()([x1, x2]))   # same shape as x1 + x2
    attention = Dense(1)(attention)   # trailing axis reduced to 1
    attention = Lambda(lambda x: K.squeeze(x, axis=-1), output_shape=squeeze_output_shape)(attention)
    concat_in1_aligned, concat_in2_aligned = weighted([attention, input_1, input_2])
    # ----- Concat attention ----- #

    # Join the four alignments of each input.
    in1_aligned = Concatenate()([dot_in1_aligned, bilinear_in1_aligned, minus_in1_aligned, concat_in1_aligned])
    in2_aligned = Concatenate()([dot_in2_aligned, bilinear_in2_aligned, minus_in2_aligned, concat_in2_aligned])

    return in1_aligned, in2_aligned
예제 #14
0
    def step(self, x_input, states):
        """Run one pointer-network step on top of the wrapped cell.

        The last entry of ``states`` is the encoder sequence; the remaining
        entries are forwarded to ``self.cell.call``.

        :return: ``(pointer, [h, c])`` where ``pointer`` is the softmax over
            ``vt * tanh(W1*e + W2*d)``
        """
        seq_shape = self.input_spec[0].shape
        encoder_seq = states[-1]
        _, (h, c) = self.cell.call(x_input, states[:-1])

        # Repeat the decoder state along the sequence axis so it can be
        # combined with every encoder position.
        decoder_seq = K.repeat(h, seq_shape[1])
        encoder_term = _time_distributed_dense(encoder_seq, self.W1, output_dim=1)
        decoder_term = _time_distributed_dense(decoder_seq, self.W2, output_dim=1)

        # vt*tanh(W1*e+W2*d), with axis 2 squeezed away
        scores = K.squeeze(self.vt * tanh(encoder_term + decoder_term), 2)

        # make probability tensor
        return softmax(scores), [h, c]
예제 #15
0
    def step(self, x, states):
        """Compute the attention weights for one decoding step.

        :param x: current-step input; not read by this implementation
        :param states: two-element ``[ytm, stm]``; only ``stm`` is used
        :return: attention weights of size (batchsize, timesteps, 1)
        """
        _, prev_state = states

        # Tile the previous hidden state once per timestep and project it.
        tiled_state = K.repeat(prev_state, self.timesteps)
        projected_state = K.dot(tiled_state, self.W_a)

        # Alignment energies: how much each timestep contributes to this one.
        energy = K.dot(activations.tanh(projected_state + self._uxpb),
                       K.expand_dims(self.V_a))

        # Exponentiate and normalise over the timestep axis.
        exp_energy = K.exp(energy)
        normaliser = K.repeat(K.sum(exp_energy, axis=1), self.timesteps)

        return exp_energy / normaliser
def keras_linear_kernel(args, normalize=True, tanh_activation=False):
    """
    Linear kernel:

    $k(x, y) = x^Ty$

    :param args: list of size 2 containing x and y
    :param normalize: if True, l2-normalize both inputs along the last axis
        before taking the product
    :param tanh_activation: if True, pass the kernel output through tanh
    :return: The linear kernel between args[0] and args[1]
    """
    X, Y = args[0], args[1]
    if normalize:
        X = K.l2_normalize(X, axis=-1)
        Y = K.l2_normalize(Y, axis=-1)

    gram = K.dot(X, K.transpose(Y))
    return tanh(gram) if tanh_activation else gram
예제 #17
0
    def step(self, x_input, states):
        """Run one pointer-network step, printing intermediate tensors.

        The last entry of ``states`` is the encoder sequence; the remaining
        entries are passed to the parent class's ``step``.

        :return: ``(pointer, [h, c])`` where ``pointer`` is the softmax over
            ``vt * tanh(W1*e + W2*d)``
        """
        seq_shape = self.input_spec[0].shape
        encoder_seq = states[-1]
        _, (h, c) = super(PointerLSTM, self).step(x_input, states[:-1])

        # vt*tanh(W1*e+W2*d)
        decoder_seq = K.repeat(h, seq_shape[1])
        print('dec_seq')
        print(decoder_seq)

        encoder_term = time_distributed_dense(encoder_seq, self.W1, output_dim=1)
        decoder_term = time_distributed_dense(decoder_seq, self.W2, output_dim=1)
        scores = self.vt * tanh(encoder_term + decoder_term)
        print('U')
        print(scores)

        scores = K.squeeze(scores, 2)
        print('U squeezed')
        print(scores)

        # make probability tensor
        return softmax(scores), [h, c]
예제 #18
0
    def get_mixture_coef(self, out_tensor):
        """Split the output tensor into mixture-density coefficients.

        Follows eqns 18 -> 23 of http://arxiv.org/abs/1308.0850. Along the
        last axis, channels 0-2 are the pen-state logits, followed by six
        consecutive groups of ``self.hps['num_mixture']`` channels.

        :return: ``[z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_pen,
            z_pen_logits]``
        """
        # Pen states:
        pen_logits = out_tensor[:, :, 0:3]

        # Slice the six parameter groups that follow the pen logits.
        n_mix = self.hps['num_mixture']
        group_starts = [3 + n_mix * k for k in range(6)]
        pi, mu1, mu2, sigma1, sigma2, corr = [
            out_tensor[:, :, start:start + n_mix] for start in group_starts
        ]

        # Softmax all the pi's and pen states:
        pi = softmax(pi)
        pen = softmax(pen_logits)

        # Exponentiate the sigmas and squash corr with tanh into (-1, 1).
        sigma1 = exponential(sigma1)
        sigma2 = exponential(sigma2)
        corr = tanh(corr)

        return [pi, mu1, mu2, sigma1, sigma2, corr, pen, pen_logits]
def keras_chi_square_CPD(args,
                         epsilon=None,
                         tanh_activation=True,
                         normalize=False):
    """
    Chi square kernel (equivalent to `additive_chi2_kernel` in scikit-learn):

    $k(x, y) = -Sum [(x - y)^2 / (x + y)]$

    :param args: list of size 2 containing x and y
    :param epsilon: small value added to the denominator to avoid zeros;
        nothing is added when it is None
    :param tanh_activation: if True, pass the kernel output through tanh
    :param normalize: if True, l2-normalize both inputs along the last axis
        before computing the kernel
    :return: The chi square kernel between args[0] and args[1]
    """
    X, Y = args[0], args[1]
    if normalize:
        X = K.l2_normalize(X, axis=-1)
        Y = K.l2_normalize(Y, axis=-1)

    # X gains an axis at position 1 and Y at position 0, so the two
    # broadcast against each other pairwise.
    x_expanded = K.expand_dims(X, axis=1)
    y_expanded = K.expand_dims(Y, axis=0)

    squared_diff = K.square((x_expanded - y_expanded))
    pair_sum = x_expanded + y_expanded
    if epsilon is not None:
        pair_sum = pair_sum + epsilon

    # NaN entries in the ratio are passed through ``replace_nan`` before
    # the sum over axis 2.
    ratio = replace_nan(squared_diff / pair_sum)
    kernel = -K.sum(ratio, axis=2)

    return tanh(kernel) if tanh_activation else kernel
예제 #20
0
# Apply each Keras activation to the same tensor and plot the result.
# TensorFlow tensors have no values until they are "run", so eval() is
# called to turn them into numpy arrays before plotting.
for fn_name, plot_title in (
        ('linear', 'linear activation function'),
        ('relu', 'rectified linear (relu)'),
        ('sigmoid', 'sigmoid'),
        ('hard_sigmoid', 'hard sigmoid'),
        ('tanh', 'tanh'),
        ('softsign', 'softsign'),
):
    acttf = getattr(kact, fn_name)(nettf)
    plt_act(nettf.eval(), acttf.eval(), plot_title)

# close the TensorFlow session
session.close()

# done
print('Done!')
예제 #21
0
    'celu':
    Lambda(lambda x: tf.nn.crelu(x) * 1.270926833152771),
    'elu':
    Lambda(lambda x: elu(x) * 1.2716004848480225),
    'gelu':
    Lambda(lambda x: gelu(x) * 1.7015043497085571),
    #     'glu': lambda x: jax.nn.glu(x) * 1.8484294414520264,
    'leaky_relu':
    Lambda(lambda x: tf.nn.leaky_relu(x) * 1.70590341091156),
    'log_sigmoid':
    Lambda(lambda x: tf.math.log(tf.nn.sigmoid(x)) * 1.9193484783172607),
    'log_softmax':
    Lambda(lambda x: tf.math.log(tf.nn.softmax(x)) * 1.0002083778381348),
    'relu':
    Lambda(lambda x: relu(x) * 1.7139588594436646),
    'relu6':
    Lambda(lambda x: tf.nn.relu6(x) * 1.7131484746932983),
    'selu':
    Lambda(lambda x: selu(x) * 1.0008515119552612),
    'sigmoid':
    Lambda(lambda x: sigmoid(x) * 4.803835391998291),
    'silu':
    Lambda(lambda x: tf.nn.silu(x) * 1.7881293296813965),
    'soft_sign':
    Lambda(lambda x: tf.nn.softsign(x) * 2.338853120803833),
    'softplus':
    Lambda(lambda x: softplus(x) * 1.9203323125839233),
    'tanh':
    Lambda(lambda x: tanh(x) * 1.5939117670059204),
}
 def step_backward(X, states):
     """Run one step of the backward simple recurrent cell.

     Both projections contract axis 2 of their left operand with axis 0 of
     the weight, and their sum is passed through tanh.

     :return: ``(new_state, [new_state])``
     """
     contract = [[2], [0]]
     from_input = tf.tensordot(X, self.encoder_weight_backward, axes=contract)
     from_state = tf.tensordot(states[0], self.recurrent_weight_backward,
                               axes=contract)
     new_state = activations.tanh(from_input + from_state)
     return new_state, [new_state]
    def step(self, x, states):
        """Advance the embedding-based attention decoder by one timestep.

        :param x: current-step input; not read by this implementation
        :param states: ``[ytm, stm, timestep]``, plus the previous attention
            as a fourth entry when ``self.is_monotonic`` is truthy
        :return: ``(output, next_states)``; ``output`` is the attention when
            ``self.return_probabilities`` is truthy, otherwise the output
            distribution. ``next_states`` is ``[ys, st, timestep + 1]`` with
            the attention appended in the monotonic case.
        """
        if self.is_monotonic:
            prev_token, prev_state, timestep, previous_attention = states
        else:
            prev_token, prev_state, timestep = states

        # The previous output is carried as a token id; embed it.
        prev_embedded = self.embedding_sublayer(K.cast(prev_token, 'int32'))

        rate = self.recurrent_dropout
        if rate is not None and 0. < rate < 1.:
            # Dropout is applied in the training phase only.
            prev_state = K.in_train_phase(K.dropout(prev_state, rate),
                                          prev_state)
            prev_embedded = K.in_train_phase(K.dropout(prev_embedded, rate),
                                             prev_embedded)

        energy = self._compute_energy(prev_state)
        if self.is_monotonic:
            attention = self._compute_probabilities(energy, previous_attention)
        else:
            attention = self._compute_probabilities(energy)

        # Context vector: attention-weighted combination of the inputs.
        context = K.squeeze(K.batch_dot(attention, self.x_seq, axes=1), axis=1)

        def pre_activation(W, U, C, b, state):
            # Shared affine form: embedded token + state term + context + bias.
            return (K.dot(prev_embedded, W) + K.dot(state, U) +
                    K.dot(context, C) + b)

        reset_gate = activations.sigmoid(
            pre_activation(self.W_r, self.U_r, self.C_r, self.b_r, prev_state))
        update_gate = activations.sigmoid(
            pre_activation(self.W_z, self.U_z, self.C_z, self.b_z, prev_state))

        # The proposal state sees the previous state through the reset gate.
        proposal = activations.tanh(
            pre_activation(self.W_p, self.U_p, self.C_p, self.b_p,
                           reset_gate * prev_state))

        new_state = (1 - update_gate) * prev_state + update_gate * proposal

        # The output distribution is computed from the new hidden state.
        probs = activations.softmax(
            pre_activation(self.W_o, self.U_o, self.C_o, self.b_o, new_state))

        # Next-step token: the arg-max prediction, replaced in the training
        # phase by the ground-truth token when teacher forcing is on.
        if self.use_teacher_forcing:
            next_token = K.in_train_phase(self.y_true[:, timestep[0]],
                                          K.argmax(probs, axis=-1))
        else:
            next_token = K.argmax(probs, axis=-1)
        next_token = K.flatten(next_token)

        output = attention if self.return_probabilities else probs

        next_states = [next_token, new_state, timestep + 1]
        if self.is_monotonic:
            next_states.append(attention)
        return output, next_states
예제 #24
0
    def step(self, x, states):
        """Advance the attention decoder by one timestep (equations 1-8).

        :param x: current-step input; not read by this implementation
        :param states: ``[ytm, stm]`` -- previous output and hidden state
        :return: ``(output, [yt, st])``; ``output`` is the attention weights
            when ``self.return_probabilities`` is truthy, otherwise ``yt``
        """
        # Elements of the previous time step.
        prev_output, prev_state = states

        # -- equation 1: alignment energies --------------------------------
        # Repeat the hidden state over the sequence length, project it, and
        # score it against the precomputed ``self._uxpb``.
        tiled_state = K.repeat(prev_state, self.timesteps)
        projected_state = K.dot(tiled_state, self.W_a)
        scored = activations.tanh(projected_state + self._uxpb)
        energy = K.dot(scored, K.expand_dims(self.V_a))

        # -- equation 2: attention probabilities ---------------------------
        exp_energy = K.exp(energy)
        total = K.sum(exp_energy, axis=1)
        attention = exp_energy / K.repeat(total, self.timesteps)
        # attention has size (batchsize, timesteps, 1)

        # -- equation 3: context vector ------------------------------------
        weighted_inputs = K.batch_dot(attention, self.x_seq, axes=1)
        context = K.squeeze(weighted_inputs, axis=1)

        # -- equation 4: reset gate ----------------------------------------
        reset_pre = (K.dot(prev_output, self.W_r) + K.dot(prev_state, self.U_r) +
                     K.dot(context, self.C_r) + self.b_r)
        reset_gate = activations.sigmoid(reset_pre)

        # -- equation 5: update gate ---------------------------------------
        update_pre = (K.dot(prev_output, self.W_z) + K.dot(prev_state, self.U_z) +
                      K.dot(context, self.C_z) + self.b_z)
        update_gate = activations.sigmoid(update_pre)

        # -- equation 6: proposal state ------------------------------------
        proposal_pre = (K.dot(prev_output, self.W_p) +
                        K.dot((reset_gate * prev_state), self.U_p) +
                        K.dot(context, self.C_p) + self.b_p)
        proposal = activations.tanh(proposal_pre)

        # -- equation 7: new hidden state ----------------------------------
        new_state = (1 - update_gate) * prev_state + update_gate * proposal

        # -- equation 8: probability of each character ---------------------
        # Computed from the previous hidden state.
        output_pre = (K.dot(prev_output, self.W_o) + K.dot(prev_state, self.U_o) +
                      K.dot(context, self.C_o) + self.b_o)
        probs = activations.softmax(output_pre)

        # Switch so the attention can be returned for visualizations.
        emitted = attention if self.return_probabilities else probs
        return emitted, [probs, new_state]
def SummaRuNNer():
    """Build and compile the sentence-classification summarisation model.

    Reads its configuration from module-level names (``args``, ``vocab``,
    ``vocab_sz``, ``word_embed_dim``, ``rel_segments``). Words are embedded
    and encoded per sentence, sentences are encoded per document, and each
    sentence position gets a 2-way sigmoid output built from content,
    salience, novelty, absolute-position and relative-position terms.

    :return: a compiled ``Model`` mapping ``doc_input`` to an output reshaped
        to ``(args.doc_len, 2)``
    """

    # initialize embedding layers
    # Word embeddings are initialised from vocab.embedding_matrix and frozen.
    embed_layer = TimeDistributed(
        Embedding(vocab_sz,
                  word_embed_dim,
                  embeddings_initializer=Constant(vocab.embedding_matrix),
                  input_length=args.sent_len,
                  trainable=False))

    # Trainable position embeddings: absolute (one row per sentence position)
    # and relative (one row per segment).
    abs_embed_layer = Embedding(args.doc_len,
                                args.pos_embed_dim,
                                input_length=1,
                                trainable=True)
    rel_embed_layer = Embedding(rel_segments,
                                args.pos_embed_dim,
                                input_length=1,
                                trainable=True)

    # input shape [bs, doc length, sentence length]
    doc_input = Input(shape=(int(args.doc_len), int(args.sent_len)),
                      name='doc_input')

    # word embedding
    word_emb_seq = embed_layer(doc_input)

    # LSTM on each word - return sequence
    word_LSTM = TimeDistributed(
        Bidirectional(LSTM(args.hidden_sz, return_sequences=True)))
    enc_words = word_LSTM(word_emb_seq)

    # Average the word encodings of each sentence into one vector.
    avg_pooler = TimeDistributed(AveragePooling1D(args.sent_len))
    pooled_words = Reshape((args.doc_len, 2 * args.hidden_sz),
                           name='sent_pooler')(avg_pooler(enc_words))

    # run another LSTM over the pooled sentence vectors so that each sentence
    # is represented by a single vector
    sent_LSTM = Bidirectional(LSTM(args.hidden_sz, return_sequences=True))
    enc_sents = sent_LSTM(pooled_words)

    # create single vector for document
    doc_pooler = AveragePooling1D(args.doc_len)
    doc = Flatten(name='flatten_doc')(doc_pooler(enc_sents))
    d = Dense(int(2 * args.hidden_sz), activation='tanh',
              name='dense_doc')(doc)

    # novelty tracker
    s = Lambda(lambda x: K.zeros_like(x), name='s_tensor')(d)  # [?, 2*h]

    probs = []
    # placeholder
    # T is a column of ones cut from s; it is multiplied by a position index
    # below to produce per-sample embedding lookups.
    T = Lambda(lambda x: (K.ones_like(x[:, 0:1], name='T_tensor')))(s)

    # Unstack the sentence encodings along axis 1 into doc_len tensors.
    split_sentences = Lambda(
        lambda tensor, doc_len: tf.unstack(tensor, doc_len, 1),
        arguments={'doc_len': args.doc_len})(enc_sents)

    # run every sentence through classification layer and store probability
    for pos in range(len(split_sentences)):

        # ``pos`` is passed through ``arguments`` rather than captured by the
        # lambda's closure.
        sent = Lambda(lambda sentences, pos: sentences[pos],
                      arguments={'pos': pos})(split_sentences)

        # run the absolute embedding
        abs_idx = Lambda(lambda T, pos: T * pos, arguments={'pos': pos})(T)
        abs_emb = Reshape((args.pos_embed_dim, ),
                          name='abs_' + str(pos))(abs_embed_layer(abs_idx))
        """
        get relative position and run through relative embedding
        refers to a quantized representation that divides each document into a 
        fixed number of segments and computes the segment ID of a given sentence.
        """
        # NOTE(review): whenever the floor below evaluates to 0, the lookup
        # index ``rel_idx - 1`` becomes -1 -- confirm this cannot happen for
        # the configured doc_len / rel_segments.
        rel_idx = math.floor(((pos + (rel_segments) / 2) / args.doc_len) *
                             10)  # only works for rel_segments = 10
        rel_idx = Lambda(lambda T, rel_idx: T * (rel_idx - 1),
                         arguments={'rel_idx': rel_idx})(T)
        rel_emb = Reshape((args.pos_embed_dim, ),
                          name='rel_' + str(pos))(rel_embed_layer(rel_idx))

        # classifier layer
        # Five 2-unit terms: content (sentence), salience (sentence * doc),
        # novelty (sentence * tanh(running summary)), and both positions.
        content = Dense(2, name='content_' + str(pos))(sent)
        salience = Dense(2, name='salience_' + str(pos))(
            Lambda(lambda x: x[0] * x[1])([sent, d]))
        novelty = Dense(2, name='novelty_' + str(pos))(
            Lambda(lambda x: x[0] * tanh(x[1]))([sent, s]))
        abs_pos = Dense(2, name='abs_pos_' + str(pos))(abs_emb)
        rel_pos = Dense(2, name='rel_pos_' + str(pos))(rel_emb)

        # Sigmoid of the sum of the five terms.
        p = Lambda(lambda x: sigmoid(x[0] + x[1] + x[2] + x[3] + x[4]))(
            [content, salience, novelty, abs_pos, rel_pos])
        probs.append(p)

        # extract just the probability of label = 1
        p1 = Lambda(lambda p: p[:, 1:])(p)

        # weighted summation of all sentence encodings until now
        # weight = probability that the sentence was part of the summary
        s = Lambda(lambda x: x[0] + (x[1] * x[2]))([s, p1, sent])

    # Join the per-sentence outputs and reshape to (doc_len, 2).
    output = Reshape((args.doc_len, 2),
                     name='prob_reshape')(concatenate(probs, -1))

    model = Model(inputs=doc_input, outputs=output)
    model.compile(optimizer='sgd',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model
예제 #26
0
    def call(self, a, states):
        """Run one timestep of the layered recurrent prediction cell.

        ``states`` holds, in order, ``nb_layers`` cell states (c),
        ``nb_layers`` error tensors (e) and ``nb_layers`` representation
        tensors (r). When ``self.extrap_start_time`` is set, the previous
        prediction and a timestep counter follow as the last two entries.

        :param a: the actual input for this timestep
        :param states: previous states, laid out as described above
        :return: ``(output, states)``; ``output`` is the prediction, the
            per-layer mean errors, or both concatenated, depending on
            ``self.output_mode``
        """
        c_tm1 = states[:self.nb_layers]
        e_tm1 = states[self.nb_layers:2 * self.nb_layers]
        r_tm1 = states[2 * self.nb_layers:3 * self.nb_layers]

        if self.extrap_start_time is not None:
            t = states[-1]
            # The previous prediction will be treated as the actual if t between t_extrap_start and t_extrap_end
            a = K.switch(
                tf.logical_and(t >= self.t_extrap_start,
                               t < self.t_extrap_end), states[-2], a)

        # New per-layer states, filled so that index l refers to layer l.
        c = []
        r = []
        e = []

        # Update R units starting from the top
        for l in reversed(range(self.nb_layers)):
            inputs = [r_tm1[l], e_tm1[l]]
            if l < self.nb_layers - 1:
                # ``_r`` is the representation computed for layer l + 1 in
                # the previous iteration of this top-down loop.
                inputs.append(_r)

            inputs = K.concatenate(inputs)
            i = self.layers['i'][l].call(inputs)
            f = self.layers['f'][l].call(inputs)
            o = self.layers['o'][l].call(inputs)
            _c = f * c_tm1[l] + i * self.layers['c'][l].call(inputs)
            # The bottom layer skips the tanh on the cell state.
            if l == 0:
                _r = o * _c
            else:
                _r = o * activations.tanh(_c)
            # Insert at the front because the loop runs top-down.
            c.insert(0, _c)
            r.insert(0, _r)

        # Update feed-forward path starting from the bottom
        for l in range(self.nb_layers):
            ahat = self.layers['ahat'][l].call(r[l])
            if l == 0:
                prediction = ahat

            # compute errors
            # Positive and negative parts of the error are kept separately
            # and concatenated.
            e_up = activations.relu(ahat - a)
            e_down = activations.relu(a - ahat)

            e.append(K.concatenate([e_up, e_down]))

            # The error of this layer becomes the target of the next one.
            if l < self.nb_layers - 1:
                a = self.layers['a'][l].call(e[l])

        if self.output_mode == 'prediction':
            output = prediction
        else:
            # One mean error value per layer, concatenated across layers.
            for l in range(self.nb_layers):
                layer_error = K.mean(K.batch_flatten(e[l]),
                                     axis=-1,
                                     keepdims=True)
                all_error = layer_error if l == 0 else K.concatenate(
                    [all_error, layer_error])
            if self.output_mode == 'error':
                output = all_error
            else:
                output = K.concatenate([prediction, all_error])

        # Same ordering as the incoming states: c, e, r (+ prediction, t).
        states = c + e + r
        if self.extrap_start_time is not None:
            states += [prediction, t + 1]
        return output, states
예제 #27
0
    def call(self, inputs, states, training=None):
        """Run one step of the attention-driven LSTM cell.

        Additive attention scores are computed between the previous hidden
        state and every position of ``self._seq_input``. The resulting
        attention-weighted context vector is what feeds the LSTM gates, in
        both ``implementation`` modes.

        Args:
            inputs: Tensor for the current timestep. It only serves as the
                shape template for the input dropout mask; the gates read
                the attention context instead.
            states: Sequence whose first entry is the previous hidden state
                and whose second entry is the previous carry state.
            training: Forwarded to ``_generate_dropout_mask``. When it is
                ``None`` and any dropout rate is positive, the output is
                flagged with ``_uses_learning_phase``.

        Returns:
            Tuple ``(h, [h, c])``: the new hidden state and the new state
            list (hidden state, carry state).

        Side effects:
            Lazily caches the dropout masks on ``self`` and appends this
            step's attention weights to ``self.attention_weight``.
        """
        if 0 < self.dropout < 1 and self._dropout_mask is None:
            self._dropout_mask = _generate_dropout_mask(
                K.ones_like(inputs),
                self.dropout,
                training=training,
                count=4)
        if (0 < self.recurrent_dropout < 1 and
                self._recurrent_dropout_mask is None):
            self._recurrent_dropout_mask = _generate_dropout_mask(
                K.ones_like(states[0]),
                self.recurrent_dropout,
                training=training,
                count=4)

        # dropout matrices for input units
        dp_mask = self._dropout_mask
        # dropout matrices for recurrent units
        rec_dp_mask = self._recurrent_dropout_mask

        h_tm1 = states[0]  # previous memory state
        c_tm1 = states[1]  # previous carry state

        # repeat the hidden state to the length of the sequence
        _htm1 = K.repeat(h_tm1, self.seq_len)

        _Whtm1 = K.dot(_htm1, self.W_a)
        _Uinpt = K.dot(self._seq_input, self.U_a)

        # calculate the attention probabilities
        et = K.dot(activations.tanh(_Whtm1 + _Uinpt), K.expand_dims(self.V_a))
        # Softmax over the sequence axis. Shifting by the per-sample maximum
        # leaves the result unchanged mathematically but keeps exp() from
        # overflowing to inf (and the division from producing NaN).
        et_max = K.max(et, axis=1, keepdims=True)
        at = K.exp(et - et_max)
        at = at / K.sum(at, axis=1, keepdims=True)  # (batch_size, seq_len, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self._seq_input, axes=1), axis=1)

        if self.implementation == 1:
            if 0 < self.dropout < 1.:
                inputs_i = context * dp_mask[0]
                inputs_f = context * dp_mask[1]
                inputs_c = context * dp_mask[2]
                inputs_o = context * dp_mask[3]
            else:
                inputs_i = context
                inputs_f = context
                inputs_c = context
                inputs_o = context
            x_i = K.dot(inputs_i, self.kernel_i)
            x_f = K.dot(inputs_f, self.kernel_f)
            x_c = K.dot(inputs_c, self.kernel_c)
            x_o = K.dot(inputs_o, self.kernel_o)
            if self.use_bias:
                x_i = K.bias_add(x_i, self.bias_i)
                x_f = K.bias_add(x_f, self.bias_f)
                x_c = K.bias_add(x_c, self.bias_c)
                x_o = K.bias_add(x_o, self.bias_o)

            if 0 < self.recurrent_dropout < 1.:
                h_tm1_i = h_tm1 * rec_dp_mask[0]
                h_tm1_f = h_tm1 * rec_dp_mask[1]
                h_tm1_c = h_tm1 * rec_dp_mask[2]
                h_tm1_o = h_tm1 * rec_dp_mask[3]
            else:
                h_tm1_i = h_tm1
                h_tm1_f = h_tm1
                h_tm1_c = h_tm1
                h_tm1_o = h_tm1
            i = self.recurrent_activation(x_i + K.dot(h_tm1_i,
                                                      self.recurrent_kernel_i))
            f = self.recurrent_activation(x_f + K.dot(h_tm1_f,
                                                      self.recurrent_kernel_f))
            c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1_c,
                                                            self.recurrent_kernel_c))
            o = self.recurrent_activation(x_o + K.dot(h_tm1_o,
                                                      self.recurrent_kernel_o))
        else:
            # Fused-kernel path. It must consume the attention context just
            # like implementation 1; feeding the raw `inputs` here would
            # silently discard the attention computed above.
            gate_input = context
            if 0. < self.dropout < 1.:
                gate_input = gate_input * dp_mask[0]
            z = K.dot(gate_input, self.kernel)

            recurrent_input = h_tm1
            if 0. < self.recurrent_dropout < 1.:
                recurrent_input = recurrent_input * rec_dp_mask[0]
            z += K.dot(recurrent_input, self.recurrent_kernel)
            if self.use_bias:
                z = K.bias_add(z, self.bias)

            # The fused pre-activation holds the i, f, c and o slices
            # back to back along the last axis.
            z0 = z[:, :self.units]
            z1 = z[:, self.units: 2 * self.units]
            z2 = z[:, 2 * self.units: 3 * self.units]
            z3 = z[:, 3 * self.units:]

            i = self.recurrent_activation(z0)
            f = self.recurrent_activation(z1)
            c = f * c_tm1 + i * self.activation(z2)
            o = self.recurrent_activation(z3)

        h = o * self.activation(c)
        if 0 < self.dropout + self.recurrent_dropout:
            if training is None:
                h._uses_learning_phase = True
        # Keep the per-step attention weights around for later inspection.
        self.attention_weight.append(at)

        return h, [h, c]
예제 #28
0
파일: ntm.py 프로젝트: imandr/ntm_keras
    def _split_and_apply_activations(self, controller_output):
        """Split the raw controller output and activate each piece.

        The controller output is cut along axis 1 into three parts: the NTM
        output, the instructions for all read heads and the instructions for
        all write heads. The head parts are then split per head, and each
        head is split into its addressing fields (k, beta, g, shift, gamma).
        Write heads carry two extra fields, an erase vector and an add vector.

        Activations applied:
            * ntm_output: ``self.activation``
            * k and add_vector: tanh
            * beta: softplus
            * g and erase_vector: sigmoid (the original author notes that
              the sigmoid on the erase vector is critical for stable results)
            * shift: softmax
            * gamma: ``1 + softplus``

        Args:
            controller_output: Tensor produced by the controller; it is
                split along axis 1.

        Returns:
            Tuple ``(ntm_output, controller_instructions_read,
            controller_instructions_write)``. ``ntm_output`` is a tensor; the
            other two are lists with one tuple of activated fields per head.
        """
        read_width = self.read_heads * self.controller_read_head_emitting_dim
        write_width = self.write_heads * self.controller_write_head_emitting_dim

        # Top-level cut: output | all read heads | all write heads.
        ntm_output, read_blob, write_blob = tf.split(
            controller_output,
            np.asarray([self.output_dim, read_width, write_width]),
            axis=1)

        # One chunk per head.
        read_chunks = tf.split(read_blob, self.read_heads, axis=1)
        write_chunks = tf.split(write_blob, self.write_heads, axis=1)

        # Field layout inside a head: k, beta, g, shift, gamma
        # (+ erase_vector, add_vector for write heads).
        read_fields = []
        for chunk in read_chunks:
            read_fields.append(
                tf.split(chunk, np.asarray([self.m_depth, 1, 1, 3, 1]), axis=1))

        write_fields = []
        for chunk in write_chunks:
            write_fields.append(
                tf.split(chunk,
                         np.asarray([self.m_depth, 1, 1, 3, 1,
                                     self.m_depth, self.m_depth]),
                         axis=1))

        ntm_output = self.activation(ntm_output)

        # An earlier variant used hard_sigmoid(beta) + 0.5,
        # 1 + 9 * sigmoid(gamma) and hard_sigmoid(erase_vector); the
        # activations below replace it.
        controller_instructions_read = []
        for k, beta, g, shift, gamma in read_fields:
            activated = (
                tanh(k),              # key
                softplus(beta),       # beta, content based similarity
                sigmoid(g),           # interpolation
                softmax(shift),       # shift filter
                1 + softplus(gamma),  # gamma, focus sharpening
            )
            controller_instructions_read.append(activated)

        controller_instructions_write = []
        for k, beta, g, shift, gamma, erase_vector, add_vector in write_fields:
            activated = (
                tanh(k),                # key
                softplus(beta),         # beta
                sigmoid(g),             # interpolation
                softmax(shift),         # shift filter
                1 + softplus(gamma),    # gamma, focus sharpening
                sigmoid(erase_vector),  # erase
                tanh(add_vector),       # add
            )
            controller_instructions_write.append(activated)

        return (ntm_output, controller_instructions_read, controller_instructions_write)
    def step(self, x, states):
        """Advance the attention decoder by one timestep.

        Attention scores over ``self.x_seq`` are computed from the previous
        hidden state, optionally masked so that future positions (and, with
        an attention horizon, positions too far in the past) get a large
        negative score, then normalised with a softmax over axis 1. The
        resulting context vector drives a GRU-style update.

        Args:
            x: Not read by this method.
            states: Triple ``(ytm, stm, t)``: previous output, previous
                hidden state and the current step counter.

        Returns:
            ``(at, [yt, st, t + 1])`` when ``self.return_probabilities`` is
            truthy, otherwise ``(yt, [yt, st, t + 1])``.
        """
        ytm, stm, t = states

        # Tile the hidden state along the sequence and project it.
        tiled_hidden = K.repeat(stm, self.timesteps)
        hidden_proj = K.dot(tiled_hidden, self.W_a)

        # Unnormalised attention scores; self._uxpb is added to the
        # projected hidden state before the tanh.
        scores = K.dot(activations.tanh(hidden_proj + self._uxpb),
                       K.expand_dims(self.V_a))

        if self.causal:
            # Push the score of every position later than t far down.
            penalty = K.cast(K.greater(self._input_t, t), 'float32') * -10e9
            if self.use_attention_horizon:
                # Also suppress positions older than the attention horizon.
                too_old = K.less(self._input_t, t - self.attn_horizon)
                penalty = penalty + K.cast(too_old, 'float32') * -10e9
            scores = scores + K.expand_dims(K.expand_dims(penalty, -1), 0)

        at = K.softmax(scores, axis=1)

        # Attention-weighted summary of the input sequence.
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

        # Reset gate.
        reset_pre = (K.dot(ytm, self.W_r) + K.dot(stm, self.U_r) +
                     K.dot(context, self.C_r) + self.b_r)
        rt = activations.sigmoid(reset_pre)

        # Update gate.
        update_pre = (K.dot(ytm, self.W_z) + K.dot(stm, self.U_z) +
                      K.dot(context, self.C_z) + self.b_z)
        zt = activations.sigmoid(update_pre)

        # Candidate hidden state, using the reset-gated previous state.
        candidate_pre = (K.dot(ytm, self.W_p) + K.dot((rt * stm), self.U_p) +
                         K.dot(context, self.C_p) + self.b_p)
        s_tp = activations.tanh(candidate_pre)

        # Blend the previous and candidate hidden states.
        st = (1 - zt) * stm + zt * s_tp

        # Output for this step (computed from the previous hidden state).
        output_pre = (K.dot(ytm, self.W_o) + K.dot(stm, self.U_o) +
                      K.dot(context, self.C_o) + self.b_o)
        yt = self.activation(output_pre)

        next_states = [yt, st, t + 1]
        emitted = at if self.return_probabilities else yt
        return emitted, next_states
예제 #30
0
    def step(self, x, states):
        """Compute one decoding step of the attention GRU.

        Attention weights over ``self.x_seq`` are derived from the previous
        hidden state, the attention-weighted context vector is formed, and a
        GRU-style update produces the new hidden state and a softmax output.

        Args:
            x: Not read by this method.
            states: Pair ``(ytm, stm)``: previous output and previous hidden
                state.

        Returns:
            ``(at, [yt, st])`` when ``self.return_probabilities`` is truthy,
            otherwise ``(yt, [yt, st])``.
        """
        ytm, stm = states

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)

        # now multiply the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)

        # calculate the attention probabilities
        # this relates how much other timesteps contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))

        # Softmax over the timestep axis. Subtracting the per-sample maximum
        # does not change the result mathematically but prevents exp() from
        # overflowing to inf, which would turn the division into NaN.
        et_max = K.max(et, axis=1, keepdims=True)
        at = K.exp(et - et_max)
        at = at / K.sum(at, axis=1, keepdims=True)  # (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

        # ~~~> calculate new hidden state
        # first calculate the "r" gate:
        rt = activations.sigmoid(
            K.dot(ytm, self.W_r) + K.dot(stm, self.U_r) +
            K.dot(context, self.C_r) + self.b_r)

        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z) + K.dot(stm, self.U_z) +
            K.dot(context, self.C_z) + self.b_z)

        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p) + K.dot((rt * stm), self.U_p) +
            K.dot(context, self.C_p) + self.b_p)

        # new hidden state:
        st = (1 - zt) * stm + zt * s_tp

        # output distribution for this step (uses the previous hidden state)
        yt = activations.softmax(
            K.dot(ytm, self.W_o) + K.dot(stm, self.U_o) +
            K.dot(context, self.C_o) + self.b_o)

        if self.return_probabilities:
            return at, [yt, st]
        return yt, [yt, st]
# Debug script fragment: blends two sequences through per-timestep dense
# projections and turns the result into a softmax output. Every intermediate
# tensor is printed (label first, then the tensor). Earlier experiments that
# reshaped / squeezed dec_seq and blend3 were disabled by the author.
# NOTE(review): dec_seq, en_seq, vt, latent_dim and num_encoder_tokens are
# defined before this fragment - confirm their shapes against the caller.
print("dec_seq")
print(dec_seq)

# Per-timestep dense projection of en_seq
# (author's note: expected shape ?, input_seq_length, latent_dim).
blendW1 = TimeDistributed(Dense(latent_dim))(en_seq)
print('blendW1')
print(blendW1)

# Per-timestep dense projection of dec_seq.
blendW2 = TimeDistributed(Dense(latent_dim))(dec_seq)
print('blendW2')
print(blendW2)

# Combine both projections and squash with tanh.
blend3 = tanh(blendW1 + blendW2)
print("blend3")
print(blend3)

# Contract the blended tensor with vt over axes (0, 1).
U = dot([blend3, vt], (0, 1))
print('U')
print(U)

# Drop axis 0 of the contraction result.
U = K.squeeze(U, 0)
print('U squeezed')
print(U)

# make probability tensor
decoder_dense = Dense(num_encoder_tokens, activation='softmax')
outputs = decoder_dense(U)
예제 #32
0
    def call(self, inputs, states, training=None):
        """Run one step of the LSTM cell with attention over annotations.

        Attention weights over ``self.annotations`` are computed from the
        previous hidden state; the attention-weighted context vector is
        concatenated to ``inputs`` and the result drives a standard LSTM
        update.

        Args:
            inputs: Input tensor for the current timestep.
            states: Sequence whose first entry is the previous hidden state
                and whose second entry is the previous carry state.
            training: Forwarded to ``_generate_dropout_mask``. When it is
                ``None`` and any dropout rate is positive, the output is
                flagged with ``_uses_learning_phase``.

        Returns:
            Tuple ``(h, [h, c])``: the new hidden state and the new state
            list (hidden state, carry state).

        Side effects:
            Lazily caches the input and recurrent dropout masks on ``self``.
        """
        if 0 < self.dropout < 1 and self._dropout_mask is None:
            # The mask width covers the inputs plus the appended context.
            self._dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, K.shape(inputs)[-1] + self.annotation_units),
                self.dropout,
                training=training,
                count=4)

        if (0 < self.recurrent_dropout < 1 and
                self._recurrent_dropout_mask is None):
            self._recurrent_dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, self.units),
                self.recurrent_dropout,
                training=training,
                count=4)

        # dropout matrices for input units
        dp_mask = self._dropout_mask
        # dropout matrices for recurrent units
        rec_dp_mask = self._recurrent_dropout_mask

        h_tm1 = states[0]  # previous memory state
        c_tm1 = states[1]  # previous carry state

        # attention mechanism

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(h_tm1, self.annotation_timesteps)

        # multiply the weight matrix with the repeated (current) hidden state
        _Wxstm = K.dot(_stm, self.kernel_w)

        # calculate the attention probabilities
        et = K.dot(activations.tanh(_Wxstm + self._uh), K.expand_dims(self.kernel_v))
        # Softmax over the annotation axis. Shifting by the per-sample
        # maximum leaves the result unchanged mathematically but keeps exp()
        # from overflowing to inf (and the division from producing NaN).
        et_max = K.max(et, axis=1, keepdims=True)
        at = K.exp(et - et_max)
        at = at / K.sum(at, axis=1, keepdims=True)  # (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.annotations, axes=1), axis=1)

        # append the context vector to the inputs
        inputs = K.concatenate([inputs, context])

        if self.implementation == 1:
            if 0 < self.dropout < 1.:
                inputs_i = inputs * dp_mask[0]
                inputs_f = inputs * dp_mask[1]
                inputs_c = inputs * dp_mask[2]
                inputs_o = inputs * dp_mask[3]
            else:
                inputs_i = inputs
                inputs_f = inputs
                inputs_c = inputs
                inputs_o = inputs

            x_i = K.dot(inputs_i, self.kernel_i)
            x_f = K.dot(inputs_f, self.kernel_f)
            x_c = K.dot(inputs_c, self.kernel_c)
            x_o = K.dot(inputs_o, self.kernel_o)

            if self.use_bias:
                x_i = K.bias_add(x_i, self.bias_i)
                x_f = K.bias_add(x_f, self.bias_f)
                x_c = K.bias_add(x_c, self.bias_c)
                x_o = K.bias_add(x_o, self.bias_o)

            if 0 < self.recurrent_dropout < 1.:
                h_tm1_i = h_tm1 * rec_dp_mask[0]
                h_tm1_f = h_tm1 * rec_dp_mask[1]
                h_tm1_c = h_tm1 * rec_dp_mask[2]
                h_tm1_o = h_tm1 * rec_dp_mask[3]
            else:
                h_tm1_i = h_tm1
                h_tm1_f = h_tm1
                h_tm1_c = h_tm1
                h_tm1_o = h_tm1

            i = self.recurrent_activation(x_i + K.dot(h_tm1_i, self.recurrent_kernel_i))
            f = self.recurrent_activation(x_f + K.dot(h_tm1_f, self.recurrent_kernel_f))
            c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1_c, self.recurrent_kernel_c))
            o = self.recurrent_activation(x_o + K.dot(h_tm1_o, self.recurrent_kernel_o))

        else:
            # Fused-kernel path: one matmul each for inputs and hidden state.
            if 0. < self.dropout < 1.:
                inputs *= dp_mask[0]
            z = K.dot(inputs, self.kernel)
            if 0. < self.recurrent_dropout < 1.:
                h_tm1 *= rec_dp_mask[0]
            z += K.dot(h_tm1, self.recurrent_kernel)
            if self.use_bias:
                z = K.bias_add(z, self.bias)

            # The fused pre-activation holds the i, f, c and o slices
            # back to back along the last axis.
            z0 = z[:, :self.units]
            z1 = z[:, self.units: 2 * self.units]
            z2 = z[:, 2 * self.units: 3 * self.units]
            z3 = z[:, 3 * self.units:]

            i = self.recurrent_activation(z0)
            f = self.recurrent_activation(z1)
            c = f * c_tm1 + i * self.activation(z2)
            o = self.recurrent_activation(z3)

        h = o * self.activation(c)
        if 0 < self.dropout + self.recurrent_dropout:
            if training is None:
                h._uses_learning_phase = True
        return h, [h, c]
예제 #33
0
def rating(Goz, discriminator):
    """Return the tanh of the generator loss for ``Goz``.

    ``Goz`` is passed through ``discriminator``; the result is handed to
    ``AGAN.loss_G`` and that loss is squashed with ``tanh``.

    Args:
        Goz: Value fed to the discriminator.
        discriminator: Callable applied to ``Goz``.

    Returns:
        ``tanh(AGAN.loss_G(discriminator(Goz)))``.
    """
    return tanh(AGAN.loss_G(discriminator(Goz)))