def set_output(self, X, train=False):
    input_shape = (self.batch_size, self.num_lstm)
    reduction_axes = list(range(len(input_shape)))
    del reduction_axes[self.axis]
    broadcast_shape = [1] * len(input_shape)
    broadcast_shape[self.axis] = input_shape[self.axis]
    if train:
        m = K.mean(X, axis=reduction_axes)
        broadcast_m = K.reshape(m, broadcast_shape)
        std = K.mean(K.square(X - broadcast_m) + self.epsilon,
                     axis=reduction_axes)
        std = K.sqrt(std)
        broadcast_std = K.reshape(std, broadcast_shape)
        mean_update = self.momentum * self.running_mean + \
            (1 - self.momentum) * m
        std_update = self.momentum * self.running_std + \
            (1 - self.momentum) * std
        self.updates = [(self.running_mean, mean_update),
                        (self.running_std, std_update)]
        X_normed = (X - broadcast_m) / (broadcast_std + self.epsilon)
    else:
        broadcast_m = K.reshape(self.running_mean, broadcast_shape)
        broadcast_std = K.reshape(self.running_std, broadcast_shape)
        X_normed = (X - broadcast_m) / (broadcast_std + self.epsilon)
    out = K.reshape(self.gamma, broadcast_shape) * X_normed + \
        K.reshape(self.beta, broadcast_shape)
    return out
def test_simple_readout():
    g1 = dgl.DGLGraph()
    g1.add_nodes(3)
    g2 = dgl.DGLGraph()
    g2.add_nodes(4)  # no edges
    g1.add_edges([0, 1, 2], [2, 0, 1])

    n1 = F.randn((3, 5))
    n2 = F.randn((4, 5))
    e1 = F.randn((3, 5))
    s1 = F.sum(n1, 0)    # node sums
    s2 = F.sum(n2, 0)
    se1 = F.sum(e1, 0)   # edge sums
    m1 = F.mean(n1, 0)   # node means
    m2 = F.mean(n2, 0)
    me1 = F.mean(e1, 0)  # edge means
    w1 = F.randn((3,))
    w2 = F.randn((4,))
    max1 = F.max(n1, 0)
    max2 = F.max(n2, 0)
    maxe1 = F.max(e1, 0)
    ws1 = F.sum(n1 * F.unsqueeze(w1, 1), 0)
    ws2 = F.sum(n2 * F.unsqueeze(w2, 1), 0)
    wm1 = F.sum(n1 * F.unsqueeze(w1, 1), 0) / F.sum(F.unsqueeze(w1, 1), 0)
    wm2 = F.sum(n2 * F.unsqueeze(w2, 1), 0) / F.sum(F.unsqueeze(w2, 1), 0)

    g1.ndata['x'] = n1
    g2.ndata['x'] = n2
    g1.ndata['w'] = w1
    g2.ndata['w'] = w2
    g1.edata['x'] = e1

    assert F.allclose(dgl.sum_nodes(g1, 'x'), s1)
    assert F.allclose(dgl.sum_nodes(g1, 'x', 'w'), ws1)
    assert F.allclose(dgl.sum_edges(g1, 'x'), se1)
    assert F.allclose(dgl.mean_nodes(g1, 'x'), m1)
    assert F.allclose(dgl.mean_nodes(g1, 'x', 'w'), wm1)
    assert F.allclose(dgl.mean_edges(g1, 'x'), me1)
    assert F.allclose(dgl.max_nodes(g1, 'x'), max1)
    assert F.allclose(dgl.max_edges(g1, 'x'), maxe1)

    g = dgl.batch([g1, g2])
    s = dgl.sum_nodes(g, 'x')
    m = dgl.mean_nodes(g, 'x')
    max_bg = dgl.max_nodes(g, 'x')
    assert F.allclose(s, F.stack([s1, s2], 0))
    assert F.allclose(m, F.stack([m1, m2], 0))
    assert F.allclose(max_bg, F.stack([max1, max2], 0))
    ws = dgl.sum_nodes(g, 'x', 'w')
    wm = dgl.mean_nodes(g, 'x', 'w')
    assert F.allclose(ws, F.stack([ws1, ws2], 0))
    assert F.allclose(wm, F.stack([wm1, wm2], 0))
    s = dgl.sum_edges(g, 'x')
    m = dgl.mean_edges(g, 'x')
    max_bg_e = dgl.max_edges(g, 'x')
    assert F.allclose(s, F.stack([se1, F.zeros(5)], 0))
    assert F.allclose(m, F.stack([me1, F.zeros(5)], 0))
    assert F.allclose(max_bg_e, F.stack([maxe1, F.zeros(5)], 0))
def test_simple_pool():
    ctx = F.ctx()
    g = dgl.DGLGraph(nx.path_graph(15))

    sum_pool = nn.SumPooling()
    avg_pool = nn.AvgPooling()
    max_pool = nn.MaxPooling()
    sort_pool = nn.SortPooling(10)  # k = 10
    print(sum_pool, avg_pool, max_pool, sort_pool)

    # test#1: basic
    h0 = F.randn((g.number_of_nodes(), 5))
    if F.gpu_ctx():
        sum_pool = sum_pool.to(ctx)
        avg_pool = avg_pool.to(ctx)
        max_pool = max_pool.to(ctx)
        sort_pool = sort_pool.to(ctx)
        h0 = h0.to(ctx)
    h1 = sum_pool(g, h0)
    assert F.allclose(h1, F.sum(h0, 0))
    h1 = avg_pool(g, h0)
    assert F.allclose(h1, F.mean(h0, 0))
    h1 = max_pool(g, h0)
    assert F.allclose(h1, F.max(h0, 0))
    h1 = sort_pool(g, h0)
    assert h1.shape[0] == 10 * 5 and h1.dim() == 1

    # test#2: batched graph
    g_ = dgl.DGLGraph(nx.path_graph(5))
    bg = dgl.batch([g, g_, g, g_, g])
    h0 = F.randn((bg.number_of_nodes(), 5))
    if F.gpu_ctx():
        h0 = h0.to(ctx)
    h1 = sum_pool(bg, h0)
    truth = th.stack([F.sum(h0[:15], 0), F.sum(h0[15:20], 0),
                      F.sum(h0[20:35], 0), F.sum(h0[35:40], 0),
                      F.sum(h0[40:55], 0)], 0)
    assert F.allclose(h1, truth)
    h1 = avg_pool(bg, h0)
    truth = th.stack([F.mean(h0[:15], 0), F.mean(h0[15:20], 0),
                      F.mean(h0[20:35], 0), F.mean(h0[35:40], 0),
                      F.mean(h0[40:55], 0)], 0)
    assert F.allclose(h1, truth)
    h1 = max_pool(bg, h0)
    truth = th.stack([F.max(h0[:15], 0), F.max(h0[15:20], 0),
                      F.max(h0[20:35], 0), F.max(h0[35:40], 0),
                      F.max(h0[40:55], 0)], 0)
    assert F.allclose(h1, truth)
    h1 = sort_pool(bg, h0)
    assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.dim() == 2
def batchnorm(X, batch_size, hidden_dim, gamma, beta, running_mean,
              running_std, epsilon=1e-10, axis=1, momentum=0.99, train=False):
    X = K.reshape(X, (batch_size, hidden_dim))
    input_shape = (batch_size, hidden_dim)          # (1, 512)
    reduction_axes = list(range(len(input_shape)))  # [0, 1]
    del reduction_axes[axis]                        # [0]
    broadcast_shape = [1] * len(input_shape)        # [1, 1]
    broadcast_shape[axis] = input_shape[axis]       # [1, 512]
    if train:
        # Note: if the input is 1-d, the mean reduces to a single number
        # even with axis=0.
        m = K.mean(X, axis=reduction_axes)              # (1, 512)
        broadcast_m = K.reshape(m, broadcast_shape)     # (1, 512)
        std = K.mean(K.square(X - broadcast_m) + epsilon,
                     axis=reduction_axes)
        std = K.sqrt(std)                               # (1, 512)
        broadcast_std = K.reshape(std, broadcast_shape) # (1, 512)
        mean_update = momentum * running_mean + (1 - momentum) * m    # (1, 512)
        std_update = momentum * running_std + (1 - momentum) * std    # (1, 512)
        X_normed = (X - broadcast_m) / (broadcast_std + epsilon)      # (1, 512)
    else:
        broadcast_m = K.reshape(running_mean, broadcast_shape)
        broadcast_std = K.reshape(running_std, broadcast_shape)
        X_normed = (X - broadcast_m) / (broadcast_std + epsilon)
        # At inference time the moving averages are left unchanged.
        mean_update = running_mean
        std_update = running_std
    out = K.reshape(gamma, broadcast_shape) * X_normed + \
        K.reshape(beta, broadcast_shape)                # (1, 512)
    return out, mean_update, std_update
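# Illustrative only (not part of the original code): a minimal NumPy sketch of
# the train-mode computation in batchnorm() above, assuming a 2-D input of
# shape (batch_size, hidden_dim) and axis=1.
import numpy as np

def batchnorm_train_np(X, gamma, beta, running_mean, running_std,
                       epsilon=1e-10, momentum=0.99):
    # Batch statistics are taken over axis 0, one value per hidden unit.
    m = X.mean(axis=0, keepdims=True)                                    # (1, hidden_dim)
    std = np.sqrt(((X - m) ** 2 + epsilon).mean(axis=0, keepdims=True))  # (1, hidden_dim)
    # Exponential moving averages of the batch statistics.
    mean_update = momentum * running_mean + (1 - momentum) * m
    std_update = momentum * running_std + (1 - momentum) * std
    X_normed = (X - m) / (std + epsilon)
    return gamma * X_normed + beta, mean_update, std_update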
def test_simple_pool():
    g = dgl.DGLGraph(nx.path_graph(15))

    sum_pool = nn.SumPooling()
    avg_pool = nn.AvgPooling()
    max_pool = nn.MaxPooling()
    sort_pool = nn.SortPooling(10)  # k = 10
    print(sum_pool, avg_pool, max_pool, sort_pool)

    # test#1: basic
    h0 = F.randn((g.number_of_nodes(), 5))
    h1 = sum_pool(g, h0)
    check_close(F.squeeze(h1, 0), F.sum(h0, 0))
    h1 = avg_pool(g, h0)
    check_close(F.squeeze(h1, 0), F.mean(h0, 0))
    h1 = max_pool(g, h0)
    check_close(F.squeeze(h1, 0), F.max(h0, 0))
    h1 = sort_pool(g, h0)
    assert h1.shape[0] == 1 and h1.shape[1] == 10 * 5 and h1.ndim == 2

    # test#2: batched graph
    g_ = dgl.DGLGraph(nx.path_graph(5))
    bg = dgl.batch([g, g_, g, g_, g])
    h0 = F.randn((bg.number_of_nodes(), 5))
    h1 = sum_pool(bg, h0)
    truth = mx.nd.stack(F.sum(h0[:15], 0), F.sum(h0[15:20], 0),
                        F.sum(h0[20:35], 0), F.sum(h0[35:40], 0),
                        F.sum(h0[40:55], 0), axis=0)
    check_close(h1, truth)
    h1 = avg_pool(bg, h0)
    truth = mx.nd.stack(F.mean(h0[:15], 0), F.mean(h0[15:20], 0),
                        F.mean(h0[20:35], 0), F.mean(h0[35:40], 0),
                        F.mean(h0[40:55], 0), axis=0)
    check_close(h1, truth)
    h1 = max_pool(bg, h0)
    truth = mx.nd.stack(F.max(h0[:15], 0), F.max(h0[15:20], 0),
                        F.max(h0[20:35], 0), F.max(h0[35:40], 0),
                        F.max(h0[40:55], 0), axis=0)
    check_close(h1, truth)
    h1 = sort_pool(bg, h0)
    assert h1.shape[0] == 5 and h1.shape[1] == 10 * 5 and h1.ndim == 2
def is_divergence(y_pred, y_gt):
    y_pred = K.clip(y_pred, _EPSILON, np.inf)
    y_gt = K.clip(y_gt, _EPSILON, np.inf)
    is_mat = y_gt / y_pred - K.log(y_gt / y_pred) - 1
    return K.mean(K.sum(is_mat, axis=-1))
def kl_divergence(y_pred, y_gt):
    y_pred = K.clip(y_pred, _EPSILON, np.inf)
    y_gt = K.clip(y_gt, _EPSILON, np.inf)
    kl_mat = y_gt * K.log(y_gt / y_pred) - y_gt + y_pred
    return K.mean(K.sum(kl_mat, axis=-1))
def norm_lp(y_pred, y_gt, norm):
    return K.mean(K.sum(K.power(K.abs(y_pred - y_gt), norm), axis=-1))
def mse(y_pred, y_gt):
    return K.mean(K.sqr(y_pred - y_gt))
def binary_crossentropy(p_y_pred, y_gt):
    p_y_pred = K.clip(p_y_pred, _EPSILON, 1. - _EPSILON)
    return K.mean(K.mean(K.binary_crossentropy(p_y_pred, y_gt), axis=-1))
def udf_mean(nodes):
    return {'r2': F.mean(nodes.mailbox['m'], 1)}
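# Hypothetical usage sketch (not part of the original tests): a reduce UDF of
# this form in DGL message passing, assuming the PyTorch backend and the small
# graph layout used in test_simple_readout above.
import dgl
import torch as th

def udf_mean_th(nodes):
    # torch-backed equivalent of udf_mean; averages incoming messages per node.
    return {'r2': th.mean(nodes.mailbox['m'], 1)}

g = dgl.DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1, 2], [2, 0, 1])
g.ndata['x'] = th.randn(3, 5)
# UDF message function copies the source feature into mailbox field 'm'.
g.update_all(lambda edges: {'m': edges.src['x']}, udf_mean_th)
print(g.ndata['r2'].shape)  # each node averages its single incoming message -> (3, 5)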
def __call__(self, loss):
    output = self.layer.get_output(True)
    loss += self.l1 * K.sum(K.mean(K.abs(output), axis=0))
    loss += self.l2 * K.sum(K.mean(K.square(output), axis=0))
    return loss
def step(self, cell_p, hid_p, mean_p, std_p):
    embed = T.reshape(T.dot(self.attribute[:, 0], self.params['W_ctx_3']),
                      [self.batch_size, 10])
    hidP = T.dot(hid_p, self.params['W_ctx_2'])  # (25, 10)
    embedd = T.repeat(self.params['W_ctx_1'], self.batch_size, 0) * T.tanh(
        embed + hidP + T.repeat(self.params['b_ctx'], self.batch_size, 0))  # (25, 10)
    alpha_base = T.reshape(T.exp(embedd), [self.batch_size, 10, 1])  # (25, 10, 1)
    alpha_base = alpha_base / alpha_base.sum()
    att = T.reshape(self.attribute[:, 0],
                    [self.batch_size, 10, self.att_frame])
    ctx = (alpha_base * att / T.reshape(alpha_base.sum(axis=1),
                                        [self.batch_size, 1, 1])).sum(axis=1)  # (25, 300)
    ctx = T.reshape(ctx, [self.batch_size, self.att_frame])
    # ctx += T.dot(hid_p, self.params['W_att']) + T.repeat(self.params['b_att'], self.batch_size, 0)

    input_to = T.dot(ctx, self.params['W_in']) + T.repeat(
        self.params['b'], self.batch_size, 0)  # (25, 2048)
    # input_to_i = T.dot(ctx, self.params['W_in_i']) + T.repeat(self.params['b_i'], self.batch_size, 0)
    # input_to_f = T.dot(ctx, self.params['W_in_f']) + T.repeat(self.params['b_f'], self.batch_size, 0)
    # input_to_o = T.dot(ctx, self.params['W_in_o']) + T.repeat(self.params['b_o'], self.batch_size, 0)
    # input_to_c = T.dot(ctx, self.params['W_in_c']) + T.repeat(self.params['b_c'], self.batch_size, 0)
    gate = input_to + T.dot(hid_p, self.params['W_hid'])
    # gate_i = input_to_i + T.dot(hid_p, self.params['W_hid_i'])
    # gate_f = input_to_f + T.dot(hid_p, self.params['W_hid_f'])
    # gate_o = input_to_o + T.dot(hid_p, self.params['W_hid_o'])
    # gate_c = input_to_c + T.dot(hid_p, self.params['W_hid_c'])

    # Apply nonlinearities
    ingate = T.nnet.sigmoid(
        self._slice(gate, 0, self.hidden_dim)
        + cell_p * T.repeat(self.params['W_cell'][0], self.batch_size, 0))
    forgetgate = T.nnet.sigmoid(
        self._slice(gate, 1, self.hidden_dim)
        + cell_p * T.repeat(self.params['W_cell'][1], self.batch_size, 0))
    cell_input = T.tanh(self._slice(gate, 2, self.hidden_dim))

    # Compute new cell value
    cell = forgetgate * cell_p + ingate * cell_input

    # BatchNormalization of the cell state
    input_shape = (self.batch_size, self.hidden_dim)           # (1, 512)
    cell = K.reshape(cell, input_shape)
    reduction_axes = list(range(len(input_shape)))              # [0, 1]
    del reduction_axes[self.axis_bn]                            # [0]
    broadcast_shape = [1] * len(input_shape)                    # [1, 1]
    broadcast_shape[self.axis_bn] = input_shape[self.axis_bn]   # [1, 512]
    # m = K.mean(cell, axis=reduction_axes)  # note: a 1-d input reduces to a single number even with axis=0
    m = K.mean(cell, axis=0)
    broadcast_m = K.reshape(m, [1, self.hidden_dim])            # (1, 512)
    # broadcast_m = m
    std = K.mean(K.square(cell - broadcast_m) + self.epsilon,
                 axis=reduction_axes)
    std = K.sqrt(std)                                           # (1, 512)
    broadcast_std = K.reshape(std, broadcast_shape)             # (1, 512)
    mean_update = self.momentum * mean_p + (1 - self.momentum) * m    # (1, 512)
    std_update = self.momentum * std_p + (1 - self.momentum) * std    # (1, 512)
    cell_normed = (cell - broadcast_m) / (broadcast_std + self.epsilon)  # (1, 512)
    cell_bn = K.reshape(self.params['gamma'], broadcast_shape) * cell_normed \
        + K.reshape(self.params['beta'], broadcast_shape)       # (1, 512)
    # cell_bn, mean, std = batchnorm(cell, self.batch_size, self.hidden_dim,
    #     self.params['gamma'], self.params['beta'], mean_p, std_p, train=True)

    outgate = T.nnet.sigmoid(
        self._slice(gate, 3, self.hidden_dim)
        + cell_bn * T.repeat(self.params['W_cell'][2], self.batch_size, 0))

    # Compute new hidden unit activation
    hid = outgate * T.tanh(cell_bn)
    return (T.reshape(cell_bn, [self.batch_size, self.hidden_dim]),
            T.reshape(hid, [self.batch_size, self.hidden_dim]),
            mean_update, std_update)
def mse(y_pred, y_gt):
    return K.mean(K.sum(K.sqr(y_pred - y_gt), axis=-1))
def beta_divergence(y_pred, y_gt, beta):
    y_pred = K.clip(y_pred, _EPSILON, np.inf)
    y_gt = K.clip(y_gt, _EPSILON, np.inf)
    beta_mat = 1. / (beta * (beta - 1)) * (
        K.power(y_gt, beta) + (beta - 1) * K.power(y_pred, beta)
        - beta * y_gt * K.power(y_pred, (beta - 1)))
    return K.mean(K.sum(beta_mat, axis=-1))
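# Quick sanity check (illustrative, not part of the original code): with
# beta = 2 the element-wise beta-divergence term reduces to half the squared
# error, 0.5 * (y_gt - y_pred) ** 2.
import numpy as np

y_pred = np.random.rand(4, 3) + 0.1
y_gt = np.random.rand(4, 3) + 0.1
beta = 2.0
beta_mat = 1. / (beta * (beta - 1)) * (
    y_gt ** beta + (beta - 1) * y_pred ** beta
    - beta * y_gt * y_pred ** (beta - 1))
assert np.allclose(beta_mat, 0.5 * (y_gt - y_pred) ** 2)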
def categorical_error(p_y_pred, y_gt):
    y_pred_sparse = K.argmax(p_y_pred, axis=-1)
    y_gt_sparse = K.argmax(y_gt, axis=-1)
    return K.mean(K.neq(y_pred_sparse, y_gt_sparse))
def categorical_crossentropy(p_y_pred, y_gt):
    p_y_pred = K.clip(p_y_pred, _EPSILON, 1. - _EPSILON)
    return K.mean(K.categorical_crossentropy(p_y_pred, y_gt))
def sparse_categorical_crossentropy(p_y_pred, y_gt):
    p_y_pred = K.clip(p_y_pred, _EPSILON, 1. - _EPSILON)
    y_gt = K.to_one_hot(y_gt, )
    return K.mean(K.categorical_crossentropy(p_y_pred, y_gt))