def test_value_manipulation(self):
    val = np.random.random((4, 2))
    xth = KTH.variable(val)
    xtf = KTF.variable(val)

    # get_value
    valth = KTH.get_value(xth)
    valtf = KTF.get_value(xtf)
    assert valtf.shape == valth.shape
    assert_allclose(valth, valtf, atol=1e-05)

    # set_value
    val = np.random.random((4, 2))
    KTH.set_value(xth, val)
    KTF.set_value(xtf, val)

    valth = KTH.get_value(xth)
    valtf = KTF.get_value(xtf)
    assert valtf.shape == valth.shape
    assert_allclose(valth, valtf, atol=1e-05)

    # count_params
    assert KTH.count_params(xth) == KTF.count_params(xtf)

    # print_tensor
    check_single_tensor_operation('print_tensor', ())
    check_single_tensor_operation('print_tensor', (2,))
    check_single_tensor_operation('print_tensor', (4, 3))
    check_single_tensor_operation('print_tensor', (1, 2, 3))

    # get_variable_shape
    val = np.random.random((3, 2))
    xth = KTH.variable(val)
    xtf = KTF.variable(val)
    assert KTH.get_variable_shape(xth) == KTF.get_variable_shape(xtf)
def denseloss(y_true, y_pred, e=1000):
    # Per-pixel reconstruction error.
    Le = KTF.mean(KTF.square(y_pred - y_true), axis=-1)
    # Multi-scale term accumulated at average-pooling scales 1, 2 and 4
    # (get_avgpoolLoss is assumed to be defined elsewhere in this module).
    Lc = get_avgpoolLoss(y_true, y_pred, 1)
    Lc += get_avgpoolLoss(y_true, y_pred, 2)
    Lc += get_avgpoolLoss(y_true, y_pred, 4)
    # Normalize by the spatial extent of the prediction; assumes a 4D
    # channels-last tensor of shape (batch, height, width, channels).
    shp = KTF.get_variable_shape(y_pred)
    Lc = Lc / (shp[1] * shp[2])
    return Le + e * Lc
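A hedged usage sketch: Keras accepts any `(y_true, y_pred)` callable as a loss, and `functools.partial` lets you tune the weighting factor `e`. The `model` below is hypothetical and assumed to predict a dense spatial map, with `denseloss` and `get_avgpoolLoss` in scope:

from functools import partial

# Hypothetical usage: `model` is assumed to be an already-built Keras model
# whose output is a dense map of shape (batch, height, width, channels).
model.compile(optimizer='adam', loss=denseloss)                  # default e=1000
model.compile(optimizer='adam', loss=partial(denseloss, e=500))  # custom e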
import tensorflow as tf
from keras import backend as K


def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
    ps = K.get_variable_shape(new_V_param)
    norm_axes = [i for i in range(len(ps) - 1)]

    # update W and V_scaler
    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
    new_V_scaler = new_g_param / new_V_norm
    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
    updates.append(K.update(W, new_W))
    updates.append(K.update(V_scaler, new_V_scaler))
def get_weightnorm_params_and_grads(p, g):
    ps = K.get_variable_shape(p)

    # construct weight scaler: V_scaler = g / ||V||
    V_scaler_shape = (ps[-1],)  # assumes we're using tensorflow!
    V_scaler = K.ones(V_scaler_shape)  # init to ones, so effective parameters don't change

    # get V parameters = ||V|| / g * W
    norm_axes = [i for i in range(len(ps) - 1)]
    V = p / tf.reshape(V_scaler, [1] * len(norm_axes) + [-1])

    # split V_scaler into ||V|| and g parameters
    V_norm = tf.sqrt(tf.reduce_sum(tf.square(V), norm_axes))
    g_param = V_scaler * V_norm

    # get grad in V, g parameters
    grad_g = tf.reduce_sum(g * V, norm_axes) / V_norm
    grad_V = tf.reshape(V_scaler, [1] * len(norm_axes) + [-1]) * \
        (g - tf.reshape(grad_g / V_norm, [1] * len(norm_axes) + [-1]) * V)

    return V, V_norm, V_scaler, g_param, grad_g, grad_V
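For intuition, here is a small self-contained NumPy check (a sketch, not part of the original code; the shapes and the linear surrogate loss are made up for illustration) that the gradient expressions above match the chain rule for the weight-normalization reparameterization W = g * V / ||V||, with the norm taken over every axis except the last:

import numpy as np

rng = np.random.RandomState(0)
V = rng.randn(3, 4)        # weight directions; norm taken over axis 0
g = rng.randn(4)           # per-output-unit scale
grad_W = rng.randn(3, 4)   # stand-in for dL/dW

V_norm = np.sqrt((V ** 2).sum(axis=0))
# Same decomposition as above, written without the V_scaler shorthand:
#   grad_g = (grad_W . V) / ||V||
#   grad_V = (g / ||V||) * grad_W - (g * grad_g / ||V||^2) * V
grad_g = (grad_W * V).sum(axis=0) / V_norm
grad_V = (g / V_norm) * grad_W - (g * grad_g / V_norm ** 2) * V

# Surrogate loss that is linear in W, so its gradient w.r.t. W is exactly grad_W.
def loss(g_, V_):
    W = g_ * V_ / np.sqrt((V_ ** 2).sum(axis=0))
    return (grad_W * W).sum()

# Finite-difference checks of one component of grad_g and grad_V.
eps = 1e-6
g_pert = g.copy()
g_pert[0] += eps
assert np.isclose((loss(g_pert, V) - loss(g, V)) / eps, grad_g[0], atol=1e-4)
V_pert = V.copy()
V_pert[0, 0] += eps
assert np.isclose((loss(g, V_pert) - loss(g, V)) / eps, grad_V[0, 0], atol=1e-4)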
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
        1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # if a weight tensor (len > 1) use weight normalized parameterization
        # this is the only part changed w.r.t. keras.optimizers.Adam
        ps = K.get_variable_shape(p)
        if len(ps) > 1:
            # get weight normalization parameters
            V, V_norm, V_scaler, g_param, grad_g, grad_V = \
                get_weightnorm_params_and_grads(p, g)

            # Adam containers for the 'g' parameter
            V_scaler_shape = K.get_variable_shape(V_scaler)
            m_g = K.zeros(V_scaler_shape)
            v_g = K.zeros(V_scaler_shape)

            # update g parameters
            m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
            v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
            new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
            self.updates.append(K.update(m_g, m_g_t))
            self.updates.append(K.update(v_g, v_g_t))

            # update V parameters
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
            new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # if there are constraints we apply them to V, not W
            # if p in constraints:
            #     c = constraints[p]
            #     new_V_param = c(new_V_param)

            # wn param updates --> W updates
            add_weightnorm_param_updates(self.updates, new_V_param,
                                         new_g_param, p, V_scaler)
        else:  # do optimization normally
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t

            # apply constraints
            # if p in constraints:
            #     c = constraints[p]
            #     new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
    return self.updates
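A usage sketch, assuming the method above is attached to an Adam subclass; the class name AdamWithWeightnorm, the architecture, and the hyperparameters below are illustrative assumptions:

from keras.models import Sequential
from keras.layers import Dense

# Hypothetical: AdamWithWeightnorm is assumed to be a keras.optimizers.Adam
# subclass whose get_updates is the method above. Every weight tensor with
# more than one axis then receives the weight-normalized Adam update, while
# biases and other 1-D parameters fall through to plain Adam.
model = Sequential([Dense(64, input_dim=100, activation='relu'),
                    Dense(1)])
model.compile(optimizer=AdamWithWeightnorm(lr=1e-3), loss='mse')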