def compute_att(res, x_):
    context = res[2]
    count = res[3]
    if self.aggregate == "mean":
        context_ = tf.cond(tf.less(count, 1),
                           lambda: context / (count + 1.0),
                           lambda: context / count)
    elif self.aggregate == "sum":
        context_ = context
    if self.cut_gradient:
        context_ = K.stop_gradient(context_)

    uit = dot_product(x_, self.W)
    c = dot_product(context_, self.W_context)
    if self.context_version == "classical":
        uit = K.tanh(tf.add(uit, K.expand_dims(c, 1)) + self.b)
    elif self.context_version == "gate":
        l = K.expand_dims(
            K.sigmoid(
                K.expand_dims(dot_product(context_, self.W_lc) + self.bl, 1)
                + dot_product(x_, self.W_l)),
            -1)
        uit = K.tanh(tf.add(l * uit, (1 - l) * K.expand_dims(c, 1)) + self.b)
    else:
        uit = K.tanh(uit)

    ait = dot_product(uit, self.u)
    a = K.exp(ait)

    # Apply the mask after the exp; will be re-normalized next.
    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in theano.
        a *= K.cast(mask, K.floatx())

    # In some cases, especially in the early stages of training, the sum may
    # be almost zero and this results in NaNs. A workaround is to add a very
    # small positive number ε to the sum.
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    a_ = K.expand_dims(a)
    weighted_input = x_ * a_
    attended = K.sum(weighted_input, axis=1)
    context *= self.discount
    context += attended
    return [attended, a, context, count + 1]

def symbolic_fgs(x, grad, eps=0.3, clipping=True, reverse=False):
    """FGSM attack."""
    # Signed gradient.
    normed_grad = K.sign(grad)
    # Multiply by the constant epsilon.
    scaled_grad = eps * normed_grad
    # Add the perturbation to the original example to obtain the
    # adversarial example.
    if not reverse:
        adv_x = K.stop_gradient(x + scaled_grad)
    else:
        adv_x = K.stop_gradient(x - scaled_grad)
    if clipping:
        adv_x = K.clip(adv_x, 0, 1)
    return adv_x

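# A minimal usage sketch for symbolic_fgs (the names `model`, `x`, and `y`
# below are hypothetical, not taken from the original code): the caller is
# expected to build the symbolic loss gradient w.r.t. the input and pass it
# in as `grad`.
def fgsm_adversarial_example(model, x, y, eps=0.1):
    # Cross-entropy between the true labels and the model's predictions.
    loss = K.categorical_crossentropy(y, model(x))
    # Symbolic gradient of the loss w.r.t. the input tensor.
    grad = K.gradients(loss, [x])[0]
    return symbolic_fgs(x, grad, eps=eps)
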
def get_factor(loss_func, y_test, y_pred):
    if loss_func == 'categorical_crossentropy':
        return K.sum(y_test * y_pred, axis=-1, keepdims=True)
    elif 'hinge' in loss_func:
        y_pred_clipped = K.stop_gradient(K.clip(y_pred, -1., 1.))
        y_pred_processed = .5 * (y_pred_clipped + 1.0)
        return K.sum(y_test * y_pred_processed, axis=-1, keepdims=True)
    elif loss_func == 'mean_squared_error':
        return 1.0 / (K.square(y_test - y_pred) + 1.0)
    else:
        raise Exception('Loss function ' + loss_func + ' not supported')

def optimizer(self):
    """Actor optimization: advantages plus an entropy term to encourage
    exploration (cf. https://arxiv.org/abs/1602.01783).
    """
    weighted_actions = K.sum(self.action_pl * self.model.output, axis=1)
    eligibility = (K.log(weighted_actions + 1e-10)
                   * K.stop_gradient(self.advantages_pl))
    entropy = K.sum(self.model.output * K.log(self.model.output + 1e-10),
                    axis=1)
    loss = 0.001 * entropy - K.sum(eligibility)
    updates = self.rms_optimizer.get_updates(self.model.trainable_weights,
                                             [], loss)
    return K.function([self.model.input, self.action_pl, self.advantages_pl],
                      [], updates=updates)

def critic_var_optimizer(self):
    action = K.stop_gradient(K.placeholder(shape=(None, self.action_size)))
    advantages = K.stop_gradient(K.placeholder(shape=(None,)))
    policy = K.stop_gradient(K.placeholder(shape=(None, self.action_size)))
    action_var = self.critic_var.output

    loss = (K.sum(0.8 * (action - policy) ** 2 - action_var ** 2, axis=1)
            * advantages) / K.sum(advantages)
    entropy = K.sum(action_var * K.log(action_var + 1e-10), axis=1)
    critic_var_loss = loss + 0.01 * entropy

    optimizer = Adam(lr=self.critic_var_lr)
    updates = optimizer.get_updates(self.critic_var.trainable_weights, [],
                                    critic_var_loss)
    train = K.function([self.critic_var.input, action, policy, advantages],
                       [], updates=updates)
    return train

def call(self, x):
    # Flatten the input except for the last dimension.
    flat_inputs = K.reshape(x, (-1, self.embedding_dim))

    # Squared L2 distances of each input to every embedding vector.
    distances = (K.sum(flat_inputs ** 2, axis=1, keepdims=True)
                 - 2 * K.dot(flat_inputs, self.w)
                 + K.sum(self.w ** 2, axis=0, keepdims=True))

    # Retrieve the encoding indices (nearest embedding per input).
    encoding_indices = K.argmax(-distances, axis=1)
    encodings = K.one_hot(encoding_indices, self.num_classes)
    encoding_indices = K.reshape(encoding_indices, K.shape(x)[:-1])
    quantized = self.quantize(encoding_indices)

    # Commitment and codebook losses (VQ-VAE).
    e_latent_loss = K.mean((K.stop_gradient(quantized) - x) ** 2)
    q_latent_loss = K.mean((quantized - K.stop_gradient(x)) ** 2)
    self.add_loss(e_latent_loss + q_latent_loss * self.beta)

    # Straight-through estimator: the forward value is `quantized`, while
    # gradients flow to `x` as if quantization were the identity.
    return K.stop_gradient(quantized - x) + x

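# Illustrative standalone sketch of the straight-through estimator used in
# the return statement above (this mirrors the VQ-VAE trick; it is not part
# of the original layer): the forward value equals `quantized`, while
# gradients bypass the non-differentiable quantization and flow to `x`.
def straight_through(x, quantized):
    # Forward: x + (quantized - x) == quantized.
    # Backward: stop_gradient contributes zero, so dL/dx passes through.
    return x + K.stop_gradient(quantized - x)
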
def call(self, inputs, training=None):
    inputs_expand = K.expand_dims(inputs, 1)
    inputs_tiled = K.tile(inputs_expand, [1, self.num_capsule, 1, 1])
    inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W, [2, 3]),
                          elems=inputs_tiled)
    # Stopped-gradient copy used for all but the final routing iteration.
    inputs_hat_stopped = K.stop_gradient(inputs_hat)
    # Routing logits, initialized to zero and kept out of the gradient path.
    b = K.stop_gradient(K.sum(K.zeros_like(inputs_hat), -1))

    assert self.num_routing > 0, 'The num_routing should be > 0.'
    for i in range(self.num_routing):
        c = tf.nn.softmax(b, dim=1)
        if i == self.num_routing - 1:
            # Only the last iteration propagates gradients through inputs_hat.
            outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))
        else:
            outputs = squash(K.batch_dot(c, inputs_hat_stopped, [2, 2]))
            b += K.batch_dot(outputs, inputs_hat_stopped, [2, 3])
    return outputs

def at_loss(self, eps):
    # Original loss.
    loss_orig = self._loss_func(self.inputs[-1], self.outputs[0])
    # Gradients.
    grads = K.stop_gradient(K.gradients(loss_orig, self.inputs[:-1]))[0]
    # Perturbed samples.
    new_inputs = self.inputs[:-1] + eps * K.sign(grads)
    # Estimation for the perturbed samples.
    outputs_perturb = self.call([new_inputs, self.inputs[-1]])
    # Computing the loss.
    loss = self._loss_func(self.inputs[-1], outputs_perturb)
    return loss

def call(self, inputs, **kwargs):
    outputs = []
    for i, thr in enumerate(self.trans_handler):
        shadow = bk.stop_gradient(inputs) if self.shadow_pinned else inputs
        outputs.append(self.scales[i] * (shadow if thr is None else thr(shadow)))
    if len(outputs) == 1:
        return outputs[0]
    return (bk.concatenate(outputs) if self.merge_type == 'concat'
            else bk.sum(outputs, axis=0))

def style_loss(style, combination, mask_path=None, nb_channels=None):
    assert K.ndim(style) == 3
    assert K.ndim(combination) == 3

    if content_mask_path is not None:
        content_mask = K.variable(load_mask(content_mask_path, nb_channels))
        combination = combination * K.stop_gradient(content_mask)
        del content_mask

    if mask_path is not None:
        style_mask = K.variable(load_mask(mask_path, nb_channels))
        style = style * K.stop_gradient(style_mask)
        if content_mask_path is None:
            combination = combination * K.stop_gradient(style_mask)
        del style_mask

    S = gram_matrix(style)
    C = gram_matrix(combination)
    channels = 3
    size = img_width * img_height
    return K.sum(K.square(S - C)) / (4. * (channels ** 2) * (size ** 2))

def call(self, x):
    x_shape = x.get_shape().as_list()
    num_batches = x_shape[0]
    batch_list = []
    for batch_idx in range(num_batches):
        if batch_idx in self.stop_batch_indices:
            batch = K.stop_gradient(x[batch_idx])
        else:
            batch = x[batch_idx]
        batch_list.append(batch)
    return K.stack(batch_list)

def optimizer(self):
    action = K.placeholder(shape=[None, 5])
    discounted_rewards = K.placeholder(shape=[None, ])
    good_prob = K.sum(action * self.model.output, axis=1)
    # A small constant guards against log(0) early in training.
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(discounted_rewards)
    loss = -K.sum(eligibility)
    optimizer = Adam(lr=self.learning_rate)
    updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
    train = K.function([self.model.input, action, discounted_rewards], [],
                       updates=updates)
    return train

def compute_inputs(self, trainable_params, feats, labels):
    predictions = self.wrapper(
        [self.params, K.transpose(trainable_params), feats])
    loss = K.mean(losses.get(self.wrapper.loss)(labels, predictions))
    gradients = K.stop_gradient(
        K.squeeze(K.gradients(loss, [trainable_params]), 0))
    loss = loss * K.ones_like(trainable_params)
    preprocessed_gradients = self.preprocess(gradients)
    preprocessed_loss = self.preprocess(loss)
    inputs = K.stop_gradient(
        K.concatenate([
            self.param_coordinates,
            preprocessed_gradients[0],
            preprocessed_gradients[1],
            preprocessed_loss[0],
            preprocessed_loss[1],
        ], axis=1))
    return inputs, gradients

def adv_loss(self, eps, xi, ip):
    """
    :param eps: the epsilon (input variation parameter)
    :param xi: the finite difference parameter
    :param ip: the number of iterations
    """
    normal_outputs = [K.stop_gradient(x) for x in _to_list(self.outputs)]
    d_list = [K.random_normal(K.shape(x)) for x in self.inputs]

    # Power iteration to approximate the most sensitive perturbation direction.
    for _ in range(ip):
        d_list = [xi * _normalize_vector(d) for d in d_list]
        new_inputs = [x + d for (x, d) in zip(self.inputs, d_list)]
        new_outputs = _to_list(self.call(new_inputs))
        klds = [K.mean(_kld(normal, new))
                for normal, new in zip(normal_outputs, new_outputs)]
        kld = reduce(lambda t, x: t + x, klds, 0)
        d_list = [K.stop_gradient(d) for d in K.gradients(kld, d_list)]

    new_inputs = [x + eps * _normalize_vector(d)
                  for (x, d) in zip(self.inputs, d_list)]
    y_perturbations = _to_list(self.call(new_inputs))
    klds = [K.mean(_kld(normal, new))
            for normal, new in zip(normal_outputs, y_perturbations)]
    kld = reduce(lambda t, x: t + x, klds, 0)
    return kld

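# Hedged sketches of the two helpers adv_loss relies on (`_normalize_vector`
# and `_kld`); these are plausible VAT-style definitions, not necessarily the
# originals from this codebase, hence the `_sketch` suffixes.
def _normalize_vector_sketch(d):
    # Scale each sample of d to unit L2 norm over all non-batch axes.
    norm = K.sqrt(K.sum(K.square(d), axis=list(range(1, K.ndim(d))),
                        keepdims=True))
    return d / (norm + K.epsilon())

def _kld_sketch(p, q):
    # KL divergence between two categorical distributions, per sample.
    p = K.clip(p, K.epsilon(), 1.0)
    q = K.clip(q, K.epsilon(), 1.0)
    return K.sum(p * (K.log(p) - K.log(q)), axis=-1)
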
def call(self, x, mask=None):
    input_img, feature_map = x
    xout = tf.py_func(binary_map_tensor_func, [input_img, feature_map],
                      'float32', stateful=False, name='binary_map')
    # The py_func output is non-differentiable; exclude it from the gradient
    # computation explicitly.
    xout = K.stop_gradient(xout)
    xout.set_shape([feature_map.shape[0], feature_map.shape[1],
                    feature_map.shape[2], 5])
    return xout

def call(self, inputs):
    _, kernel_b = xnorize(self.kernel, self.H)
    _, inputs_b = xnorize(inputs)
    outputs = K.conv2d(inputs_b, kernel_b,
                       strides=self.strides,
                       padding=self.padding,
                       data_format=self.data_format,
                       dilation_rate=self.dilation_rate)

    # Calculate Wa and xa.

    # kernel_a
    mask = K.reshape(self.kernel,
                     (-1, self.filters))  # self.nb_row * self.nb_col * channels, filters
    kernel_a = K.stop_gradient(K.mean(K.abs(mask), axis=0))  # filters

    # inputs_a
    if self.data_format == 'channels_first':
        channel_axis = 1
    else:
        channel_axis = -1
    mask = K.mean(K.abs(inputs), axis=channel_axis, keepdims=True)
    ones = K.ones(self.kernel_size + (1, 1))
    inputs_a = K.conv2d(mask, ones,
                        strides=self.strides,
                        padding=self.padding,
                        data_format=self.data_format,
                        dilation_rate=self.dilation_rate)  # nb_sample, 1, new_nb_row, new_nb_col
    if self.data_format == 'channels_first':
        outputs = outputs * K.stop_gradient(inputs_a) * \
            K.expand_dims(K.expand_dims(K.expand_dims(kernel_a, 0), -1), -1)
    else:
        outputs = outputs * K.stop_gradient(inputs_a) * \
            K.expand_dims(K.expand_dims(K.expand_dims(kernel_a, 0), 0), 0)

    if self.use_bias:
        outputs = K.bias_add(outputs, self.bias,
                             data_format=self.data_format)
    if self.activation is not None:
        return self.activation(outputs)
    return outputs

def get_updates(self, loss, params):  # pylint:disable=too-many-locals
    """ Get the weight updates

    Parameters
    ----------
    loss: list
        The loss to update
    params: list
        The variables
    """
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    l_r = self.lr
    if self.initial_decay > 0:
        l_r = l_r * (1. / (1. + self.decay * K.cast(self.iterations,
                                                    K.dtype(self.decay))))

    var_t = K.cast(self.iterations, K.floatx()) + 1
    # Bias correction.
    bias_correction1 = 1. - K.pow(self.beta_1, var_t)
    bias_correction2 = 1. - K.pow(self.beta_2, var_t)

    m_s = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    v_s = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + m_s + v_s

    for param, grad, var_m, var_v in zip(params, grads, m_s, v_s):
        if self.weight_decay != 0.:
            # Decoupled weight decay: the decay term is kept out of the
            # gradient graph via stop_gradient.
            grad += self.weight_decay * K.stop_gradient(param)
        m_t = (self.beta_1 * var_m) + (1. - self.beta_1) * grad
        m_corr_t = m_t / bias_correction1
        v_t = (self.beta_2 * var_v) + \
            (1. - self.beta_2) * K.square(grad - m_t) + self.epsilon
        v_corr_t = K.sqrt(v_t / bias_correction2)
        p_t = param - l_r * m_corr_t / (v_corr_t + self.epsilon)

        self.updates.append(K.update(var_m, m_t))
        self.updates.append(K.update(var_v, v_t))
        new_param = p_t

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            new_param = param.constraint(new_param)
        self.updates.append(K.update(param, new_param))
    return self.updates

def loss_prior(y_true, y_pred):
    m = y_true
    log_z_self, log_z_final = y_pred[:, :, :, :1], y_pred[:, :, :, 1:]
    tmp_shape = K.shape(log_z_final)
    d = my_get_shape(log_z_final)
    log_z_final = K.reshape(log_z_final, [tmp_shape[0], d])
    z_final = K.softmax(log_z_final, axis=1)
    z_final = K.stop_gradient(z_final)
    log_z_self = K.reshape(log_z_self, [tmp_shape[0], d])
    loss = -K.sum(z_final * log_z_self, axis=1) / (m + 1.)
    return K.mean(loss)

def call(self, x):
    s, x1 = x
    a = x1[:, :1]
    s_hat = x1[:, 1:2]

    # Rescale the weights, making sure we mostly scale down.
    a_hat = a * K.clip(s_hat / s, self.min_decrease, self.max_increase)

    # Scale again so that the reported loss is comparable to the other ones.
    t = 1
    # sT = K.transpose(s)
    # t = K.dot(sT, a) / K.dot(sT, a_hat)

    return K.stop_gradient([a_hat * t])[0]

def policy_loss_func(args):
    p_t, v_t, act_t, rew_t = args
    log_p_t = tf.nn.log_softmax(p_t)
    oh_t = K.one_hot(act_t, n_actions)
    oh_t = K.squeeze(oh_t, 1)
    p_oh_t = K.sum(log_p_t * oh_t, axis=-1, keepdims=True)
    # The value baseline is held constant for the policy update.
    adv_t = rew_t - K.stop_gradient(v_t)
    tf.summary.scalar("advantage_mean", K.mean(adv_t))
    tf.summary.scalar("advantage_rms", K.sqrt(K.mean(K.square(adv_t))))
    res_t = -adv_t * p_oh_t
    tf.summary.scalar("loss_policy_mean", K.mean(res_t))
    tf.summary.scalar("loss_policy_rms", K.sqrt(K.mean(K.square(res_t))))
    return res_t

def darknet_body(self, inputs, training=None):
    x = DarknetConv2D_BN_Leaky(32, (3, 3), name=None,
                               training=training)(inputs)
    x = self.resblock_body(x, 64, 1, name=None, training=training)
    C2 = x = self.resblock_body(x, 128, 2, name=None, training=training)
    C3 = x = self.resblock_body(x, 256, 8, name=None, training=training)
    C4 = x = self.resblock_body(x, 512, 8, name=None, training=training)
    C5 = x = self.resblock_body(x, 1024, 4, name=None, training=training)
    if self.mode == 'training':
        # Freeze the backbone feature maps during training.
        return KL.Lambda(lambda x: [K.stop_gradient(f) for f in x])(
            [C2, C3, C4, C5])
    return [C2, C3, C4, C5]

def __update_actor__(self):
    action = K.placeholder(shape=(None, self.output_shape_))
    advantages = K.placeholder(shape=(None, ))
    policy = self.actor_.output
    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
    loss = -K.sum(eligibility)
    optimizer = RMSprop()
    updates = optimizer.get_updates(self.actor_.trainable_weights, [], loss)
    train = K.function([self.actor_.input, action, advantages],
                       [self.actor_.output], updates=updates)
    return train

def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantages = K.placeholder(shape=[None, ])
    good_prob = K.sum(action * self.actor.output, axis=1)
    # A small constant guards against log(0).
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
    loss = -K.sum(eligibility)
    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [],
                       updates=updates)
    return train

def _step(prev_time, cur_time, output_ta_t, *states):
    train_phase_indices = input_ta.read(prev_time)
    prob = output_ta_t.read(prev_time)
    # At test time, feed back the argmax of the previous prediction;
    # stop_gradient keeps the sampling step out of backprop.
    test_phase_indices = K.argmax(K.stop_gradient(prob), axis=-1)
    indices = K.in_train_phase(train_phase_indices, test_phase_indices,
                               learning_phase)
    embeddings_vector = K.gather(self.embeddings, indices)
    inputs = K.concatenate([embeddings_vector, self.vg], axis=-1)
    output, new_states = step_function(inputs, tuple(states))
    for state, new_state in zip(states, new_states):
        new_state.set_shape(state.get_shape())
    output_ta_t = output_ta_t.write(cur_time, output)
    return (cur_time, cur_time + 1, output_ta_t) + tuple(new_states)

def adv_acc(y, _):
    # Generate adversarial examples.
    y_race = tf.get_default_graph().get_tensor_by_name(
        "race_output_target_1:0")
    x_adv = fgsm(model, y_race, eps=eps, clip_min=clip_min,
                 clip_max=clip_max)
    # Consider the attack to be constant.
    x_adv = K.stop_gradient(x_adv)
    # Accuracy on the adversarial examples.
    preds_age, preds_race, preds_gender = model(x_adv)
    return categorical_accuracy(y, preds_gender)

def build_network(self):
    hidden_dim = 1024
    reg = lambda: L1L2(l1=1e-9, l2=1e-9)

    x = Input(shape=(self.data_dim,), name="x")
    h = x
    h = Dense(hidden_dim, W_regularizer=reg())(h)
    h = Dropout(0.5)(h)
    # h = BatchNormalization(mode=1)(h)
    h = LeakyReLU(0.2)(h)
    h = Dense(int(hidden_dim / 2), W_regularizer=reg())(h)
    h = Dropout(0.5)(h)
    # h = BatchNormalization(mode=1)(h)
    h = LeakyReLU(0.2)(h)
    h = Dense(int(hidden_dim / 4), W_regularizer=reg())(h)
    h = Dropout(0.5)(h)
    # h = BatchNormalization(mode=1)(h)
    h = LeakyReLU(0.2)(h)
    y = Dense(self.action_space.n, W_regularizer=reg())(h)

    # Q(s, a)
    self.Q = Model(x, y, name="Q")

    action = Input(shape=(1,), dtype='int32', name="action")
    """
    selected_y = merge([y, action],
                       mode=lambda z: K.sum(K.one_hot(K.reshape(z[1], (-1,)),
                                                      K.shape(z[0])[1]) * z[0],
                                            axis=-1, keepdims=True),
                       output_shape=lambda z: z[1])
    """
    selected_y = merge([y, action],
                       mode=lambda z: K.reshape(
                           z[0][K.arange(K.shape(z[0])[0]),
                                K.reshape(z[1], (-1,))],
                           (-1, 1)),
                       output_shape=lambda z: z[1])
    self.Q_s = Model([x, action], selected_y, name="Q_s")

    value = Lambda(lambda z: K.max(z, axis=-1, keepdims=True),
                   output_shape=lambda z: (z[0], 1))(y)
    self.V = Model(x, value, name="V")

    x_prime = Input(shape=(self.data_dim,), name="x_prime")
    done = Input(shape=(1,), name="done", dtype="int32")
    # The target value V(x') is treated as a constant.
    v_prime = Lambda(lambda z: K.stop_gradient(z),
                     output_shape=lambda z: z)(self.V(x_prime))
    # v_prime = self.V(x_prime)
    q = self.Q_s([x, action])
    r_pred = merge([q, v_prime, done],
                   mode=lambda z: z[0] - ((1 - z[2]) * self.discount * z[1]),
                   output_shape=lambda z: z[0])
    self.training_model = Model([x, action, x_prime, done], r_pred,
                                name="training_model")
    self.training_model.compile(self.optimizer, "mean_squared_error")

def domain_prediction(x, hyperpars):
    # x = GradientReversal(hp_lambda=hyperpars['grad_rev_lambda'])(x)
    # Gradient reversal: the forward pass is the identity, while the backward
    # pass multiplies the gradient by -grad_rev_lambda.
    x = Lambda(lambda x, _lambda=hyperpars['grad_rev_lambda']:
               K.stop_gradient(x * K.cast(1 + _lambda, 'float32'))
               - x * K.cast(_lambda, 'float32'))(x)
    for i, layer_size in enumerate(hyperpars['domain_prediction_layers']):
        x = Dense(layer_size, activation='relu',
                  name='domain_pred_layer' + str(i))(x)
        x = Dropout(hyperpars['prediction_dropout'])(x)
    x = Dense(1, activation='sigmoid')(x)
    x = Lambda(lambda x: K.squeeze(x, axis=1))(x)
    return x

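# Standalone sketch of the gradient-reversal identity used in the Lambda
# above (illustrative; `hp_lambda` is a plain float here). Forward pass:
# x * (1 + l) - x * l == x. Backward pass: the stop_gradient term contributes
# nothing, so the incoming gradient is multiplied by -l.
def gradient_reversal(x, hp_lambda):
    scale = K.cast(hp_lambda, 'float32')
    return K.stop_gradient(x * (1.0 + scale)) - x * scale
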
def ternarize(W, H=1):
    '''The weights' ternarization function,

    # References:
    - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902)
    - [Ternary Weight Networks](http://arxiv.org/abs/1605.04711)
    '''
    W /= H
    ones = K.ones_like(W)
    zeros = K.zeros_like(W)
    Wt = select(W > 0.5, ones, select(W <= -0.5, -ones, zeros))
    Wt *= H
    # Straight-through estimator: the forward pass uses Wt, while gradients
    # flow to the real-valued W.
    return W + K.stop_gradient(Wt - W)

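# Hedged sketch of the `select` helper assumed by ternarize (an elementwise
# where); the original definition is not shown here and may differ.
def select(condition, then_expression, else_expression):
    return K.switch(condition, then_expression, else_expression)

# Usage sketch (illustrative names): a dense forward pass that sees ternary
# weights while gradients update the underlying real-valued W.
def ternary_dense(x, W, b):
    return K.dot(x, ternarize(W)) + b
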
def fgsm(x, grad, eps=0.3, clipping=True):
    """ FGSM attack. """
    # Signed gradient.
    normed_grad = K.sign(grad).eval()
    # Multiply by the constant epsilon.
    scaled_grad = eps * normed_grad
    # Add the perturbation to the original example to obtain the
    # adversarial example.
    adv_x = K.stop_gradient(x + scaled_grad)
    if clipping:
        adv_x = K.clip(adv_x, 0, 1)
    return adv_x

def call(self, xin):
    """Tensorflow hook

    Args:
        xin (tensor): 2 image tensor

    Returns:
        optical_flow_vector: a tensor containing the results of the
            optical flow computation
    """
    xout = tf.py_function(compute_optical_flow, [xin], 'float32',
                          name='OpticalFlow')
    xout = K.stop_gradient(xout)  # explicitly set no grad
    # Explicitly set the output shape.
    xout.set_shape([xin.shape[0], 66, 200, xin.shape[-1]])
    return xout

def actor_optimizer(self):
    action = K.placeholder(shape=(None, self.action_size))
    advantages = K.placeholder(shape=(None, ))
    policy = self.actor.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
    loss = -K.sum(eligibility)
    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    actor_loss = loss + 0.01 * entropy

    optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [],
                                    actor_loss)
    train = K.function([self.actor.input, action, advantages], [],
                       updates=updates)
    return train

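# Usage sketch for the actor update (names are hypothetical, not from the
# original code): `states` is a batch of observations, `actions_one_hot` has
# shape (batch, action_size), and `advantage_values` has shape (batch,).
def train_actor(agent, states, actions_one_hot, advantage_values):
    train = agent.actor_optimizer()
    train([states, actions_one_hot, advantage_values])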