def call(self, V, training):
    # word-level attention (WAN): self-attention over the token embeddings V
    query = tf.matmul(V, self.wan_query) + self.wan_query_b
    query = self.bn1(query, training=training)
    query = activations.elu(query)

    key = tf.matmul(V, self.wan_key) + self.wan_key_b
    key = self.bn2(key, training=training)
    key = tf.transpose(key, [0, 2, 1])
    key = activations.elu(key)

    weights = tf.matmul(query, key) + self.wan_weights_b
    weights = self.bn3(weights, training=training)
    #weights = layers.ELU(weights)
    weights = tf.nn.softmax(weights, axis=1)

    m1 = tf.matmul(weights, V)
    m1 = self.bn4(m1, training=training)
    return activations.elu(m1)
def call(self, inputs, training):
    x = self.embedding(inputs)
    x1 = self.wan(x, training)
    #x1 = self.dp(x1)
    x2 = self.lan(x1, training)
    #x2 = self.dp(x2)
    x = self.can(x1, x2, training)
    #x = self.dp(x)
    for _ in range(self.SETTINGS.LAYERS):  # stacked MANN layers
        x1 = self.wan(x, training)
        x2 = self.lan(x1, training)
        x = self.can(x1, x2, training)
    #x = self.pooling(x)
    #x = self.conv1(x)
    x = self.conv2(x)
    x = self.dp(x)
    x = self.bn1(x, training=training)
    x = activations.elu(x)
    #x = self.bn3(x, training=training)
    x = self.conv3(x)
    x = self.dp(x)
    x = self.bn2(x, training=training)
    x = activations.elu(x)
    #x = self.bn4(x, training=training)
    #x = self.conv4(x)
    #x = self.bn5(x, training=training)
    x = self.pooling(x)
    x = self.flatten(x)
    x = self.dp(x)
    #x = self.Dense1(x)
    #x = self.dp(x)
    #x = self.Dense2(x)
    #x = self.dp(x)
    x = self.Dense3(x)
    x = self.dp(x)
    x = self.bn3(x, training=training)
    x = activations.elu(x)
    #x = self.dp(x)
    x = self.Dense4(x)
    return x
def selu(x):
    """Scaled Exponential Linear Unit. (Klambauer et al., 2017)

    # Arguments
        x: A tensor or variable to compute the activation function for.

    # References
        - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
    """
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale * elu(x, alpha)
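# A quick sanity check (a sketch, assuming TensorFlow 2.x is available): the
# hand-coded alpha/scale constants above should reproduce the built-in
# tf.keras.activations.selu.
import numpy as np
import tensorflow as tf
from tensorflow.keras import activations

x = tf.constant(np.linspace(-3.0, 3.0, 7), dtype=tf.float32)
alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946
manual = scale * activations.elu(x, alpha)
np.testing.assert_allclose(manual.numpy(), activations.selu(x).numpy(), rtol=1e-6)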
def call(self, m1, m2, training):
    # cross attention (CAN) between the two intermediate representations m1 and m2
    query = tf.matmul(m1, self.q) + self.q_b
    query = self.bn1(query, training=training)
    query = activations.elu(query)

    key = tf.matmul(m2, self.k) + self.k_b
    key = self.bn2(key, training=training)
    key = activations.elu(key)

    weights = tf.matmul(query, tf.transpose(key, [0, 2, 1])) + self.weights_b
    weights = self.bn3(weights, training=training)
    #weights = tf.nn.relu(weights)
    weights = tf.nn.softmax(weights, axis=1)

    m3 = tf.matmul(
        weights,
        tf.matmul(m2, self.v2) +
        tf.matmul(weights, tf.matmul(m1, self.v1)) + self.m3_b)
    m3 = self.bn4(m3, training=training)
    return activations.elu(m3)
def call(self, m1, training):
    # build one flattened window of WINDOW_SIZE embeddings around every position
    d = tf.zeros([
        self.SETTINGS.BATCH_SIZE, 1,
        self.SETTINGS.WINDOW_SIZE * self.SETTINGS.EMB_DIM
    ])
    s = math.floor(self.SETTINGS.WINDOW_SIZE / 2)
    pad = tf.zeros([self.SETTINGS.BATCH_SIZE, s, self.SETTINGS.EMB_DIM])
    m1 = tf.concat([pad, m1, pad], axis=1)
    for i in range(s, self.SETTINGS.MAX_LEN + s):
        t = tf.reshape(m1[:, i - s:i + s + 1], [
            self.SETTINGS.BATCH_SIZE, 1,
            self.SETTINGS.WINDOW_SIZE * self.SETTINGS.EMB_DIM
        ])
        d = tf.concat([d, t], axis=1)
    d = d[:, 1:]  # drop the all-zero seed row

    l = tf.matmul(d, self.f) + self.lan_key_b
    l = self.bn1(l, training=training)
    l = activations.elu(l)

    query_2 = tf.matmul(l, self.lan_query) + self.lan_query_b
    query_2 = self.bn2(query_2, training=training)
    query_2 = activations.elu(query_2)

    key_2 = tf.matmul(l, self.lan_key) + self.lan_key_b
    key_2 = self.bn3(key_2, training=training)
    key_2 = activations.elu(key_2)

    weights = tf.matmul(query_2, tf.transpose(
        key_2, [0, 2, 1])) + self.lan_weights_b
    weights = self.bn4(weights, training=training)
    #weights = layers.ELU(weights)
    weights = tf.nn.softmax(weights, axis=1)

    l = tf.matmul(weights, l)
    l = self.bn5(l, training=training)
    return activations.elu(l)
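# The Python loop above grows the window tensor by repeated tf.concat, which is
# slow inside a graph. A possible vectorized alternative (a sketch with
# hypothetical standalone shapes, not the author's code) builds the same
# [BATCH, MAX_LEN, WINDOW_SIZE * EMB_DIM] tensor with tf.signal.frame:
import tensorflow as tf

BATCH, MAX_LEN, EMB_DIM, WINDOW = 2, 10, 8, 3
s = WINDOW // 2
m1 = tf.random.normal([BATCH, MAX_LEN, EMB_DIM])
pad = tf.zeros([BATCH, s, EMB_DIM])
padded = tf.concat([pad, m1, pad], axis=1)

# frames of length WINDOW with stride 1 along the sequence axis:
# shape [BATCH, MAX_LEN, WINDOW, EMB_DIM]
windows = tf.signal.frame(padded, frame_length=WINDOW, frame_step=1, axis=1)
d = tf.reshape(windows, [BATCH, MAX_LEN, WINDOW * EMB_DIM])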
def call(self, inputs):
    mean, var, *adj = inputs
    # assert len(adj) == 2

    mean = activations.elu(mean @ self.kernel_mean)
    var = activations.relu(var @ self.kernel_var)

    attention = tf.math.exp(-self.gamma * var)
    mean = tf.sparse.sparse_dense_matmul(adj[0], mean * attention)
    var = tf.sparse.sparse_dense_matmul(adj[1], var * attention * attention)

    if self.use_bias:
        mean += self.bias_mean
        var += self.bias_var

    return self.activation(mean), self.activation(var)
def forward(inputs, weights):
    # first layer
    x, *adj = inputs
    h = x @ weights[0]
    mean = activations.elu(h)
    var = activations.relu(h)
    attention = tf.exp(-var)
    mean = tf.sparse.sparse_dense_matmul(adj[0], mean * attention)
    var = tf.sparse.sparse_dense_matmul(adj[1], var * attention * attention)
    mean = activations.elu(mean)
    var = activations.elu(var)

    # hidden layers
    i = 1
    while i < len(weights) - 2:
        mean = activations.elu(mean @ weights[i])
        var = activations.relu(var @ weights[i + 1])
        attention = tf.math.exp(-var)
        mean = tf.sparse.sparse_dense_matmul(adj[0], mean * attention)
        var = tf.sparse.sparse_dense_matmul(adj[1], var * attention * attention)
        mean = activations.elu(mean)
        var = activations.elu(var)
        i += 2

    # output layer
    mean = activations.elu(mean @ weights[i])
    var = activations.relu(var @ weights[i + 1])
    attention = tf.math.exp(-var)
    mean = tf.sparse.sparse_dense_matmul(adj[0], mean * attention)
    var = tf.sparse.sparse_dense_matmul(adj[1], var * attention * attention)

    # sampling layer (reparameterization)
    sample = tf.random.normal(tf.shape(var), 0, 1, dtype='float32')
    output = mean + tf.math.sqrt(var + 1e-8) * sample
    return output
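# A minimal usage sketch for forward() above (hypothetical shapes, not the
# author's training setup). The weight list is assumed to hold one shared
# kernel for the first layer followed by (mean, var) kernel pairs for every
# later layer; adj holds two sparse, normalized adjacency matrices.
import tensorflow as tf

num_nodes, in_dim, hid_dim, out_dim = 5, 8, 16, 4
x = tf.random.normal([num_nodes, in_dim])
adj0 = tf.sparse.eye(num_nodes)  # identity adjacency keeps the example self-contained
adj1 = tf.sparse.eye(num_nodes)

weights = [
    tf.random.normal([in_dim, hid_dim]),   # first layer (shared by mean and var)
    tf.random.normal([hid_dim, hid_dim]),  # hidden layer, mean kernel
    tf.random.normal([hid_dim, hid_dim]),  # hidden layer, var kernel
    tf.random.normal([hid_dim, out_dim]),  # output layer, mean kernel
    tf.random.normal([hid_dim, out_dim]),  # output layer, var kernel
]
output = forward([x, adj0, adj1], weights)  # shape: [num_nodes, out_dim]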
def call(self, inputs):
    x, *adj = inputs
    # assert len(adj) == 2
    mean = tf.slice(x, [0, 0], [-1, self.dim])
    var = tf.slice(x, [0, self.dim], [-1, self.dim])

    mean = activations.elu(mean @ self.kernel_mean)
    var = activations.relu(var @ self.kernel_var)

    attention = tf.math.exp(-self.gamma * var)
    mean = tf.sparse.sparse_dense_matmul(adj[0], mean * attention)
    var = tf.sparse.sparse_dense_matmul(adj[1], var * attention * attention)

    if self.use_bias:
        mean += self.bias_mean
        var += self.bias_var

    # reparameterization: sample from N(mean, var)
    sample = tf.random.normal(tf.shape(var), 0, 1, dtype=tf.float32)
    output = mean + tf.math.sqrt(var + 1e-8) * sample
    return self.activation(output)
def call(self, inputs):
    x, *adj = inputs
    # assert len(adj) == 2
    h = x @ self.kernel
    if self.use_bias:
        h += self.bias

    mean = activations.elu(tf.slice(h, [0, 0], [-1, self.dim]))
    var = activations.relu(tf.slice(h, [0, self.dim], [-1, self.dim]))

    # KL( N(mean, var) || N(0, 1) ), averaged over features, summed over nodes
    KL_divergence = 0.5 * tf.reduce_mean(
        tf.math.square(mean) + var - tf.math.log(1e-8 + var) - 1, axis=1)
    KL_divergence = tf.reduce_sum(KL_divergence)

    attention = tf.exp(-self.gamma * var)
    mean = tf.sparse.sparse_dense_matmul(adj[0], mean * attention)
    var = tf.sparse.sparse_dense_matmul(adj[1], var * attention * attention)

    output = tf.concat([mean, var], axis=1)
    return self.activation(output), KL_divergence
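# The KL term above uses the closed form KL( N(mu, var) || N(0, 1) ) =
# 0.5 * (mu^2 + var - log(var) - 1) per dimension. A quick numerical check of
# that formula against a Monte Carlo estimate (a standalone sketch, not part
# of the layer):
import numpy as np

mu, var = 0.7, 1.5
closed_form = 0.5 * (mu ** 2 + var - np.log(var) - 1.0)

rng = np.random.default_rng(0)
z = rng.normal(mu, np.sqrt(var), size=1_000_000)
log_q = -0.5 * (np.log(2 * np.pi * var) + (z - mu) ** 2 / var)  # log N(z; mu, var)
log_p = -0.5 * (np.log(2 * np.pi) + z ** 2)                     # log N(z; 0, 1)
monte_carlo = np.mean(log_q - log_p)
# closed_form and monte_carlo should agree to roughly 1e-3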
### ReLU is faster than sigmoid and tanh; its gradient does not saturate,
### which mitigates the vanishing-gradient problem.
## Drawback: training can be "fragile" -- for negative inputs the derivative
## is zero, so those units stop contributing (the dead-ReLU problem).
x = tf.linspace(-5., 5., 100)  # build a continuous range of inputs
x_ndarray = x.numpy()          # convert to an ndarray
y_relu = activations.relu(x)
plt.plot(x, y_relu, c='red', label='relu')  # plot the curve
plt.ylim((-0.5, 1.2))
plt.legend(loc='best')
plt.show()

## ELU was proposed to address ReLU's problems. Advantages: essentially all of
## ReLU's advantages, no dead-ReLU problem, and outputs with a mean close to
## zero (roughly zero-centered). Disadvantages: slightly more computation, and
## the kink at the origin (the derivative is only continuous there when alpha = 1).
x = tf.linspace(-5., 5., 100)  # build a continuous range of inputs
x_ndarray = x.numpy()          # convert to an ndarray
y_relu = activations.elu(x)
plt.plot(x, y_relu, c='red', label='elu')  # plot the curve
plt.ylim((-2, 5))
plt.legend(loc='best')
plt.show()

### SELU
### SELU is simply ELU multiplied by a lambda, and the key point is that this
### lambda is greater than 1. Earlier activations (ReLU, PReLU, ELU) all have
### a gentle slope on the negative half-axis, so when the activation variance
### is too large they can shrink it and prevent exploding gradients, but the
### positive half-axis slope was simply set to 1. SELU's positive slope is
### greater than 1, so when the variance is too small it can grow it again,
### which also prevents vanishing gradients. The activation therefore has a
### fixed point: in a deep network every layer's output converges to mean 0
### and variance 1.
x = tf.linspace(-5., 5., 100)  # build a continuous range of inputs
x_ndarray = x.numpy()          # convert to an ndarray
y_relu = activations.selu(x)
plt.plot(x, y_relu, c='red', label='selu')  # plot the curve
plt.ylim((-2, 5))
plt.legend(loc='best')
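# The self-normalizing claim above can be illustrated numerically (a sketch,
# assuming lecun_normal-style 1/sqrt(fan_in) weight scaling as in the SELU
# paper): after many dense layers, activations stay near mean 0, std 1.
import numpy as np
import tensorflow as tf
from tensorflow.keras import activations

rng = np.random.default_rng(0)
dim = 512
h = tf.constant(rng.standard_normal((1024, dim)), dtype=tf.float32)
for _ in range(20):  # 20 dense layers, no bias
    w = rng.standard_normal((dim, dim)) / np.sqrt(dim)  # lecun_normal scaling
    h = activations.selu(h @ tf.constant(w, dtype=tf.float32))
print(float(tf.reduce_mean(h)), float(tf.math.reduce_std(h)))  # ~0 and ~1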
#
# $$
# g(z) = \frac{e^{z} - e^{-z}}{e^{z} + e^{-z}}
# $$
#
# The figure below illustrates these functions.

# In[2]:

from tensorflow.keras import activations
import numpy as np
import matplotlib.pylab as plt

z = np.linspace(-7, 7, 500)
h_relu = activations.relu(z).numpy()
h_elu = activations.elu(z).numpy()
h_selu = activations.selu(z).numpy()
h_tanh = activations.tanh(z).numpy()

fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
fig.set_figheight(8)
fig.set_figwidth(10)
axes[0, 0].plot(z, h_relu)
axes[0, 0].set_xlabel(r'$z$')
axes[0, 0].set_ylabel(r'$g(z)$')
axes[0, 0].set_title('ReLU')
axes[0, 1].plot(z, h_elu)
axes[0, 1].set_xlabel(r'$z$')
axes[0, 1].set_ylabel(r'$g(z)$')