def tf_simhash_decompose(matrix, inner_dimension, seed=0):
    """Approximately decompose matrix as a product R S D.

    Args:
      matrix: the matrix to be decomposed, given as a tensorflow matrix with a
        statically known rank-2 shape.
      inner_dimension: the number of rows in S.
      seed: a seed for the pseudorandom matrix R.

    Returns:
      Tensorflow matrices R, S, and D, where:
      R is iid normal distributed with inner_dimension columns,
      S is a +/-1 sign matrix, and
      D is a diagonal matrix.
    """
    rows, _ = matrix.get_shape().as_list()

    # Draw R from a private generator instead of calling np.random.seed(), so
    # that the process-wide NumPy random state is not reset as a side effect.
    # RandomState(seed) yields the same stream the global generator would
    # produce right after np.random.seed(seed).
    rng = np.random.RandomState(seed)
    r = tf.convert_to_tensor(
        value=rng.normal(size=(rows, inner_dimension)), dtype=tf.float32)

    # S = sign(R^T M); exact zeros are mapped to +1 so S is strictly +/-1.
    s_with_zeros = tf.math.sign(tf.linalg.matmul(r, matrix, transpose_a=True))
    s = tf.compat.v1.where(
        tf.math.equal(s_with_zeros, tf.constant(0.)),
        tf.ones(tf.shape(input=s_with_zeros)),
        s_with_zeros)

    # D rescales each column of R S to the norm of the matching column of M.
    rs_column_norms = tf.norm(tensor=tf.matmul(r, s), axis=0)
    matrix_column_norms = tf.norm(tensor=matrix, axis=0)
    d = tf.linalg.tensor_diag(
        tf.math.divide(matrix_column_norms, rs_column_norms))
    return r, s, d
def _build_train_ops(self):
    """Create losses, optimizers, train ops and summaries, then init variables.

    Sets on ``self``: the learning-rate placeholders ``lr_c`` / ``lr_a``, the
    losses ``loss_c`` / ``loss_a``, the Adam optimizers, the (optionally
    norm-clipped) gradient lists ``grads_c`` / ``grads_a``, the train ops,
    the ``ep_reward`` placeholder, the summary list and merged summary, and
    ``train_ops``. Finally runs the global variable initializer in
    ``self.sess``.
    """

    def clip_pairs(grads_and_vars):
        # compute_gradients() yields (None, var) for variables the loss does
        # not depend on; tf.clip_by_norm cannot take None, so pass those
        # pairs through untouched (apply_gradients ignores them).
        return [
            (tf.clip_by_norm(grad, self.clip_norm), var)
            if grad is not None else (grad, var)
            for grad, var in grads_and_vars
        ]

    self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_c')
    self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_a')

    with tf.variable_scope('critic_train'):
        # Critic minimizes the mean squared TD error.
        # NOTE: an L2 penalty over critic_vars was sketched here previously
        # but is not part of the loss.
        self.loss_c = tf.reduce_mean(tf.square(self.td_error))
        self.optim_c = tf.train.AdamOptimizer(self.lr_c)
        self.grads_c = self.optim_c.compute_gradients(
            self.loss_c, self.critic_vars)
        if self.clip_norm:
            self.grads_c = clip_pairs(self.grads_c)
        self.train_op_c = self.optim_c.apply_gradients(self.grads_c)

    with tf.variable_scope('actor_train'):
        # Actor loss: cross entropy of the taken action weighted by the TD
        # error, which is treated as a constant (no gradient flows into it).
        # NOTE: L2 and entropy terms were sketched here previously but are
        # not part of the loss.
        self.loss_a = tf.reduce_mean(
            tf.stop_gradient(self.td_error)
            * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.actor, labels=self.a),
            name='loss_actor')
        self.optim_a = tf.train.AdamOptimizer(self.lr_a)
        self.grads_a = self.optim_a.compute_gradients(
            self.loss_a, self.actor_vars)
        if self.clip_norm:
            self.grads_a = clip_pairs(self.grads_a)
        self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

    with tf.variable_scope('summary'):
        self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
        self.summary = [
            tf.summary.scalar('loss/critic', self.loss_c),
            tf.summary.scalar('loss/actor', self.loss_a),
            tf.summary.scalar('episode_reward', self.ep_reward)
        ]
        self.summary += [
            tf.summary.scalar('grads/a_' + var.name, tf.norm(grad))
            for grad, var in self.grads_a if grad is not None
        ]
        self.summary += [
            tf.summary.scalar('grads/c_' + var.name, tf.norm(grad))
            for grad, var in self.grads_c if grad is not None
        ]
        self.merged_summary = tf.summary.merge_all(
            key=tf.GraphKeys.SUMMARIES)

    self.train_ops = [self.train_op_a, self.train_op_c]
    self.sess.run(tf.global_variables_initializer())
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Build one grouped op applying a momentum step to every parameter.

    Args:
      grads_and_vars: iterable of (gradient, parameter) pairs; pairs with a
        None member are skipped.
      global_step: optional variable incremented by one alongside the update.
      name: optional name for the returned grouped op.

    Returns:
      A `tf.group` of all parameter, momentum and global-step assignments.
    """

    def trust_ratio_for(param_name, weights, direction):
        # 1.0 unless this layer is adapted; otherwise
        # eeta * ||weights|| / ||direction||, falling back to 1.0 when
        # either norm is zero.
        if not self._do_layer_adaptation(param_name):
            return 1.0
        w_norm = tf.norm(weights, ord=2)
        d_norm = tf.norm(direction, ord=2)
        return tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(d_norm, 0),
                     (self.eeta * w_norm / d_norm), 1.0),
            1.0)

    update_ops = []
    for grad, param in grads_and_vars:
        if grad is None or param is None:
            continue

        param_name = param.op.name
        velocity = tf.get_variable(
            name=param_name + "/Momentum",
            shape=param.shape.as_list(),
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer())

        if self._use_weight_decay(param_name):
            grad = grad + self.weight_decay * param

        if self.classic_momentum:
            # The learning rate is scaled before it enters the momentum
            # buffer.
            scaled_lr = self.learning_rate * trust_ratio_for(
                param_name, param, grad)
            next_velocity = (
                tf.multiply(self.momentum, velocity) + scaled_lr * grad)
            if self.use_nesterov:
                step = (tf.multiply(self.momentum, next_velocity)
                        + scaled_lr * grad)
            else:
                step = next_velocity
            next_param = param - step
        else:
            # The raw gradient is accumulated; scaling is applied to the
            # resulting update direction.
            next_velocity = tf.multiply(self.momentum, velocity) + grad
            if self.use_nesterov:
                step = tf.multiply(self.momentum, next_velocity) + grad
            else:
                step = next_velocity
            scaled_lr = trust_ratio_for(
                param_name, param, step) * self.learning_rate
            next_param = param - scaled_lr * step

        update_ops.append(param.assign(next_param))
        update_ops.append(velocity.assign(next_velocity))

    if global_step is not None:
        update_ops.append(global_step.assign(global_step + 1))
    return tf.group(*update_ops, name=name)
def normalize_grad_fn(grads_and_vars):
    """Rescale each gradient to norm(var) * alpha, keeping its direction.

    Relies on ``self`` from the enclosing scope for ``self.config['alpha']``.
    Returns a new list of (rescaled_gradient, variable) pairs.
    """
    return [
        (
            grad / (tf.norm(grad) + tf.constant(1e-10))  # guard zero norm
            * tf.norm(var) * self.config['alpha'],
            var,
        )
        for grad, var in grads_and_vars
    ]
def add_compression_summaries(self):
    """Emit scalar summaries: last alpha update step, alpha, and a/b norms."""
    scope_name = self._spec.name + '_summaries'
    with tf.name_scope(scope_name):
        tf.summary.scalar('last_alpha_update_step',
                          self._last_alpha_update_step)

        alpha_tag = self.alpha.op.name + '/alpha'
        tf.summary.scalar(alpha_tag, self.alpha)

        a_tag = self.a_matrix_tfvar.op.name + '/a_matrix_norm'
        tf.summary.scalar(a_tag, tf.norm(self.a_matrix_tfvar))

        b_tag = self.b_matrix_tfvar.op.name + '/b_matrix_norm'
        tf.summary.scalar(b_tag, tf.norm(self.b_matrix_tfvar))
def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
    """A stack of self attention layers over image features.

    Args:
      image_feat: input tensor fed to the first layer.
      hparams: hyperparameters; reads num_encoder_layers / num_hidden_layers,
        the attention_* and image_* sizes, num_heads, dropout rates,
        self_attention_type and scale_dotproduct.
      name: variable scope wrapping the whole stack.
      save_weights_to: forwarded to the attention layer.
      make_image_summary: forwarded to the attention layer.

    Returns:
      The output of the last layer after a final `layer_preprocess`.
    """
    x = image_feat
    with tf.variable_scope(name):
        for layer in range(hparams.num_encoder_layers or
                           hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        None,
                        hparams.attention_key_channels or
                        hparams.image_hidden_size,
                        hparams.attention_value_channels or
                        hparams.image_hidden_size,
                        hparams.image_hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        save_weights_to=save_weights_to,
                        max_relative_position=None,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=None,
                        max_length=None,
                        vars_3d=False,
                        # Was misspelled `scale_otproduct`, so the hparam
                        # never reached the parameter that question_encoder
                        # sets under this name.
                        scale_dotproduct=hparams.scale_dotproduct)
                    utils.collect_named_outputs(
                        "norms", "image_feat_self_attention",
                        tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "image_feat_self_attention_zero_add",
                        tf.norm(x, axis=-1))
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        hparams.image_filter_size,
                        hparams.image_hidden_size,
                        dropout=hparams.relu_dropout,
                        dropout_broadcast_dims=None)
                    utils.collect_named_outputs(
                        "norms", "image_feat_ffn", tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "image_feat_ffn_zero_add",
                        tf.norm(x, axis=-1))
        # if normalization is done in layer_preprocess, then it should also
        # be done on the output, since the output can grow very large, being
        # the sum of a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
def question_encoder(question,
                     question_self_attention_bias,
                     hparams,
                     name="question_encoder",
                     save_weights_to=None,
                     make_image_summary=True):
    """Run the question through a stack of self-attention + feed-forward layers.

    Per-layer activation norms are recorded in the "norms" collection. The
    stack output is passed through `layer_preprocess` once more before being
    returned.
    """

    def record_norm(tag, tensor):
        # Collect the last-axis norm of `tensor` under `tag`.
        utils.collect_named_outputs("norms", tag, tf.norm(tensor, axis=-1))

    hidden = question
    with tf.variable_scope(name):
        num_layers = hparams.num_encoder_layers or hparams.num_hidden_layers
        for layer in range(num_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    attended = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(hidden, hparams),
                        None,
                        question_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels or
                        hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.question_self_attention_type,
                        block_length=hparams.block_length,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        scale_dotproduct=hparams.scale_dotproduct,
                    )
                    record_norm("query_self_attention_%d" % (layer), attended)
                    hidden = common_layers.layer_postprocess(
                        hidden, attended, hparams)
                    record_norm(
                        "query_self_attention_postprocess_%d" % (layer),
                        hidden)
                with tf.variable_scope("ffn"):
                    transformed = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(hidden, hparams),
                        hparams.filter_size,
                        hparams.hidden_size,
                        dropout=hparams.relu_dropout,
                    )
                    record_norm("query_ffn_%d" % (layer), transformed)
                    hidden = common_layers.layer_postprocess(
                        hidden, transformed, hparams)
                    record_norm("query_ffn_postprocess_%d" % (layer), hidden)
        # The stack output is a sum of unnormalized layer outputs and can
        # grow large, so apply the preprocess step (where normalization may
        # live) to it as well.
        return common_layers.layer_preprocess(hidden, hparams)
def pca_error(self, y, z):
    """Mean per-row reconstruction error between y and z projected by A^T.

    The error measure is chosen by ``self.norm_type``:
      * 'MSE' / 'mse' / 'Frob' / 'F': mean of squared row-wise 2-norms,
      * 'L1' / 'l1': mean of row-wise 1-norms,
      * 'LAD' / 'lad' / 'L21' / 'l21' / 'L2' / 'l2': mean of row-wise 2-norms.

    Raises:
      Exception: if ``self.norm_type`` is none of the above.
    """
    kind = self.norm_type
    projected = tf.matmul(z, tf.transpose(self.A))

    if kind in ('MSE', 'mse', 'Frob', 'F'):
        row_norms = tf.norm(y - projected, ord=2, axis=1)
        return tf.reduce_mean(tf.square(row_norms))
    if kind in ('L1', 'l1'):
        row_norms = tf.norm(y - projected, ord=1, axis=1)
        return tf.reduce_mean(row_norms)
    if kind in ('LAD', 'lad', 'L21', 'l21', 'L2', 'l2'):
        row_norms = tf.norm(y - projected, ord=2, axis=1)
        return tf.reduce_mean(row_norms)
    raise Exception("Norm type error!")
def build_network(self):
    """Build placeholders, embeddings, the dense tower, loss and optimizer.

    Sets ``self.user_id``, ``self.item_id``, ``self.y``, ``self.pred_rating``,
    ``self.loss`` and ``self.optimizer``.
    """
    print("num_factor_1=%d, num_factor_2=%d, hidden_dimension=%d" % (
        self.num_factor_1, self.num_factor_2, self.hidden_dimension))

    # Inputs.
    self.user_id = tf.placeholder(
        dtype=tf.int32, shape=[None], name='user_id')
    self.item_id = tf.placeholder(
        dtype=tf.int32, shape=[None], name='item_id')
    self.y = tf.placeholder("float", [None], 'rating')

    # Embedding tables: P/Q are concatenated, U/V are multiplied elementwise.
    P = tf.Variable(
        tf.random_normal([self.n_users, self.num_factor_1], stddev=0.01))
    Q = tf.Variable(
        tf.random_normal([self.n_items, self.num_factor_1], stddev=0.01))
    U = tf.Variable(
        tf.random_normal([self.n_users, self.num_factor_2], stddev=0.01))
    V = tf.Variable(
        tf.random_normal([self.n_items, self.num_factor_2], stddev=0.01))

    user_p = tf.nn.embedding_lookup(P, self.user_id)
    item_q = tf.nn.embedding_lookup(Q, self.item_id)
    interaction = tf.multiply(tf.nn.embedding_lookup(U, self.user_id),
                              tf.nn.embedding_lookup(V, self.item_id))
    features = tf.concat(values=[user_p, item_q, interaction], axis=1)

    # Keras L2 regularizer stands in for tf.contrib.layers.l2_regularizer
    # (tf1 -> tf2 migration).
    regularizer = tf.keras.regularizers.l2(self.reg_rate)

    def dense(inputs, units, activation):
        # Every layer of the tower shares initializers and regularizer.
        return tf.layers.dense(
            inputs=inputs,
            units=units,
            activation=activation,
            bias_initializer=tf.random_normal_initializer,
            kernel_initializer=tf.random_normal_initializer,
            kernel_regularizer=regularizer)

    hidden = dense(
        features, 2 * self.num_factor_1 + self.num_factor_2, tf.sigmoid)
    for _ in range(3):
        hidden = dense(hidden, self.hidden_dimension, tf.sigmoid)
    output = dense(hidden, 1, None)
    self.pred_rating = tf.reshape(output, [-1])

    # Squared error + layer regularization + norm penalty on the embeddings.
    squared_error = tf.reduce_sum(tf.square(self.y - self.pred_rating))
    embedding_penalty = self.reg_rate * (
        tf.norm(U) + tf.norm(V) + tf.norm(P) + tf.norm(Q))
    self.loss = (squared_error
                 + tf.losses.get_regularization_loss()
                 + embedding_penalty)
    self.optimizer = tf.train.RMSPropOptimizer(
        learning_rate=self.learning_rate).minimize(self.loss)
def add_compression_summaries(self):
    """Emit scalar summaries for alpha and for the a, b and c matrices."""
    scope_name = self._spec.name + '_summaries'
    with tf.name_scope(scope_name):
        logging.info('add_compression_summaries scope name is %s',
                     self._spec.name)

        alpha_tag = self.alpha.op.name + '/alpha'
        tf.summary.scalar(alpha_tag, self.alpha)

        a_tag = self.a_matrix_tfvar.op.name + '/a_matrix_norm'
        tf.summary.scalar(a_tag, tf.norm(self.a_matrix_tfvar))

        # b is summarized by the 1-norm of its flattened entries.
        b_tag = self.b_matrix_tfvar.op.name + '/d_matrix_norm'
        b_flat = tf.reshape(self.b_matrix_tfvar, [-1])
        tf.summary.scalar(b_tag, tf.norm(b_flat, ord=1))

        # c is summarized by the plain sum of its entries.
        c_tag = self.c_matrix_tfvar.op.name + '/c_matrix_norm'
        tf.summary.scalar(c_tag, tf.reduce_sum(self.c_matrix_tfvar))
def compute_Fl(flow_gt, flow_est, mask):
    """Fraction of masked positions whose flow error counts as an outlier.

    A position is an outlier when its masked error norm exceeds 3 and also
    exceeds 5% of the ground-truth flow norm. The count of outliers where
    ``mask > 0`` is divided by ``sum(mask) + 1e-6``.
    """
    masked_err = tf.multiply(flow_gt - flow_est, mask)
    err_mag = tf.norm(masked_err, axis=-1)
    # Floor the ground-truth magnitude to avoid dividing by zero.
    gt_mag = tf.maximum(tf.norm(flow_gt, axis=-1), 1e-12)

    exceeds_abs = err_mag > 3
    exceeds_rel = tf.divide(err_mag, gt_mag) > 0.05
    outlier = tf.logical_and(exceeds_abs, exceeds_rel)

    counted = tf.logical_and(tf.expand_dims(outlier, -1), mask > 0)
    outlier_total = tf.reduce_sum(tf.cast(counted, tf.float32))
    return outlier_total / (tf.reduce_sum(mask) + 1e-6)
def reconstruction_loss(self, x, x_tilde):
    """Per-example reconstruction error between x and x_tilde.

    Both inputs are flattened to (first_dim, -1). ``self.loss_norm_type``
    selects the measure:
      * 'MSE' / 'mse' / 'Frob' / 'F': squared row-wise 2-norm,
      * 'L1' / 'l1': row-wise 1-norm,
      * 'LAD' / 'lad' / 'L21' / 'l21' / 'L2' / 'l2': row-wise 2-norm.

    Raises:
      Exception: if ``self.loss_norm_type`` is none of the above.
    """
    kind = self.loss_norm_type
    flat_x = tf.reshape(x, (tf.shape(x)[0], -1))
    flat_x_tilde = tf.reshape(x_tilde, (tf.shape(x_tilde)[0], -1))

    if kind in ('MSE', 'mse', 'Frob', 'F'):
        return tf.square(tf.norm(flat_x - flat_x_tilde, ord=2, axis=1))
    if kind in ('L1', 'l1'):
        return tf.norm(flat_x - flat_x_tilde, ord=1, axis=1)
    if kind in ('LAD', 'lad', 'L21', 'l21', 'L2', 'l2'):
        return tf.norm(flat_x - flat_x_tilde, ord=2, axis=1)
    raise Exception("Norm type error!")
def body(self, features):
    """Encode image + question jointly and decode a single query vector.

    Reads ``features["inputs"]`` and ``features["question"]``. Activation
    norms along the way are collected under "norms" and summarized. Returns
    the decoder output with an extra axis inserted at position 1.
    """
    hp = self.hparams

    def record_norm(tag, tensor):
        # Collect the last-axis norm of `tensor` under `tag`.
        utils.collect_named_outputs("norms", tag, tf.norm(tensor, axis=-1))

    # Image features: either embed raw images or take them as given.
    # NOTE(review): the model function is resolved with eval() on an hparams
    # string; keep that value out of reach of untrusted input.
    # pylint: disable=eval-used
    if hp.image_input_type == "image":
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
        image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)
    image_feat = common_layers.dense(image_feat, hp.hidden_size)
    record_norm("image_feat_after_proj", image_feat)

    question = common_layers.flatten4d3d(features["question"])
    record_norm("question_embedding", question)

    prepared = prepare_image_question_encoder(image_feat, question, hp)
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = prepared

    keep_prob = 1. - hp.layer_prepostprocess_dropout
    encoder_input = tf.nn.dropout(encoder_input, keep_prob=keep_prob)
    encoder_output = image_question_encoder(
        encoder_input, encoder_self_attention_bias, hp)
    record_norm("encoder_output", encoder_output)

    # Learned query, scaled by sqrt(hidden_size) and tiled over the batch.
    query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size**0.5
    query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0)
    batch_size = common_layers.shape_list(encoder_input)[0]
    query = tf.tile(query, [batch_size, 1, 1])
    query = tf.nn.dropout(
        query, keep_prob=1. - hp.layer_prepostprocess_dropout)

    decoder_output = decoder(
        query, encoder_output, None, encoder_decoder_attention_bias, hp)
    record_norm("decoder_output", decoder_output)

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Insert an axis at position 1 before returning.
    return tf.expand_dims(decoder_output, axis=1)
def build_graph(hub_module_url, target_image_path):
    """Build a graph scoring input images by cosine similarity to a target.

    Args:
      hub_module_url: URL/handle of the TF-Hub module that provides the
        "image_feature_vector" signature.
      target_image_path: path of the reference image, read via `tf.gfile`.

    Returns:
      A tuple ``(input_byte, similarity)``: a string placeholder of shape
      [None] taking encoded image bytes, and a flattened tensor with the
      cosine similarity of each input image to the target image.
    """
    # Step 1) Prepare pre-trained model for extracting image features.
    module = hub.Module(hub_module_url)
    height, width = hub.get_expected_image_size(module)

    # Copied a method of https://github.com/GoogleCloudPlatform/cloudml-samples/blob/bf0680726/flowers/trainer/model.py#L181
    # and fixed for all type images (not only jpeg)
    def decode_and_resize(image_str_tensor):
        """Decodes an encoded image string, resizes it, returns uint8."""
        image = tf.image.decode_image(image_str_tensor, channels=CHANNELS)
        # Note resize expects a batch_size, but tf_map supresses that index,
        # thus we have to expand then squeeze.  Resize returns float32 in the
        # range [0, uint8_max]
        image = tf.expand_dims(image, 0)
        image = tf.image.resize_bilinear(
            image, [height, width], align_corners=False)
        image = tf.squeeze(image, squeeze_dims=[0])
        image = tf.cast(image, dtype=tf.uint8)
        return image

    def to_img_feature(images):
        """Extract the feature of image vectors"""
        outputs = module(dict(images=images),
                         signature="image_feature_vector",
                         as_dict=True)
        return outputs['default']

    # Step 2) Extract image features of the target image.
    # Read through a context manager so the file handle is always closed.
    with tf.gfile.GFile(target_image_path, 'rb') as target_file:
        target_image_bytes = target_file.read()
    target_image = tf.constant(target_image_bytes, dtype=tf.string)
    target_image = decode_and_resize(target_image)
    target_image = tf.image.convert_image_dtype(target_image,
                                                dtype=tf.float32)
    target_image = tf.expand_dims(target_image, 0)
    target_image = to_img_feature(target_image)

    # Step 3) Extract image features of input images.
    input_byte = tf.placeholder(tf.string, shape=[None])
    input_image = tf.map_fn(decode_and_resize, input_byte,
                            back_prop=False, dtype=tf.uint8)
    input_image = tf.image.convert_image_dtype(input_image, dtype=tf.float32)
    input_image = to_img_feature(input_image)

    # Step 4) Compare cosine_similarities of the target image and the input
    # images.
    dot = tf.tensordot(target_image, tf.transpose(input_image), 1)
    similarity = dot / (tf.norm(target_image, axis=1)
                        * tf.norm(input_image, axis=1))
    similarity = tf.reshape(similarity, [-1])
    return input_byte, similarity
def gradient_panalty(self, real, fake, scope="discriminator_A"):
    """Gradient penalties for the discriminator's logit and CAM-logit heads.

    For gan types containing 'dragan', ``fake`` is replaced by ``real``
    perturbed with uniform noise scaled by half the std of ``real``. The
    penalty is evaluated on a random interpolation between real and fake.

    Returns:
      ``(sum of logit penalties, sum of cam-logit penalties)`` over the first
      two discriminator outputs of each head.
    """
    if 'dragan' in self.gan_type:
        noise = tf.random_uniform(shape=tf.shape(real), minval=0., maxval=1.)
        _, real_var = tf.nn.moments(real, axes=[0, 1, 2, 3])
        # The noise magnitude decides the size of the local region.
        real_std = tf.sqrt(real_var)
        fake = real + 0.5 * real_std * noise

    mix = tf.random_uniform(
        shape=[self.batch_size, 1, 1, 1], minval=0., maxval=1.)
    interpolated = real + mix * (fake - real)

    logit, cam_logit, _, _ = self.discriminator(
        interpolated, reuse=True, scope=scope)

    def penalties(head_logits):
        # One penalty term per discriminator output (two per head).
        terms = []
        for idx in range(2):
            # Gradient of D(interpolated) w.r.t. its input, then its l2 norm.
            grad = tf.gradients(head_logits[idx], interpolated)[0]
            grad_norm = tf.norm(flatten(grad), axis=1)

            if self.gan_type == 'wgan-lp':
                # WGAN-LP: only norms above 1 are penalized.
                excess = tf.maximum(0.0, grad_norm - 1.)
                terms.append(self.ld * tf.reduce_mean(tf.square(excess)))
            elif self.gan_type == 'wgan-gp' or self.gan_type == 'dragan':
                terms.append(
                    self.ld * tf.reduce_mean(tf.square(grad_norm - 1.)))
        return terms

    GP = penalties(logit)
    cam_GP = penalties(cam_logit)
    return sum(GP), sum(cam_GP)
def create(self):
    """Build a row basis from a trainable matrix, one row at a time.

    The first row is normalized; every further row has its projection onto
    the rows accumulated so far subtracted and is then normalized and
    appended. Returns the stacked rows.
    """
    vectors = tf.get_variable(
        'unorthogonal_rotation', [self.dim, self.dim], dtype=tf.float32)

    # Leading axis of size 1 so each row can go through matmul.
    first_row = vectors[0, :]
    basis = tf.expand_dims(first_row / tf.norm(first_row), 0)

    row_count = vectors.get_shape()[0].value
    for row_idx in range(1, row_count):
        row = tf.expand_dims(vectors[row_idx, :], 0)
        # Projection as if `basis` were orthonormal, i.e. assuming the
        # matrix is already close to orthogonal.
        coeffs = tf.matmul(row, basis, transpose_b=True)
        residual = row - tf.matmul(coeffs, basis)
        basis = tf.concat([basis, residual / tf.norm(residual)], axis=0)
    return basis
def _resource_apply_dense(self, grad, var):
    """Apply one dense update with bias-corrected moments to `var`.

    Updates the "m" and "v" slots, forms the bias-corrected Adam-style
    update, optionally adds decoupled weight decay, optionally rescales by
    ||w|| / ||update||, and assigns the new value to `var`.

    Returns:
      The op of the final assignment to `var`.
    """
    dtype = var.dtype.base_dtype
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = tf.cast(beta1_power, dtype)
    beta2_power = tf.cast(beta2_power, dtype)
    lr_t = tf.cast(self._lr_t, dtype)
    beta1_t = tf.cast(self._beta1_t, dtype)
    beta2_t = tf.cast(self._beta2_t, dtype)
    epsilon_t = tf.cast(self._epsilon_t, dtype)
    weight_decay_rate_t = tf.cast(self._weight_decay_rate_t, dtype)

    # First moment: m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_increment = grad * (1 - beta1_t)
    m_t = tf.assign(m, m * beta1_t + m_increment,
                    use_locking=self._use_locking)

    # Second moment: v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_increment = (grad * grad) * (1 - beta2_t)
    v_t = tf.assign(v, v * beta2_t + v_increment,
                    use_locking=self._use_locking)

    # Bias-corrected moments. (The variant without correction would use m_t
    # directly in the numerator.)
    m_t_hat = m_t / (1. - beta1_power)
    v_t_hat = v_t / (1. - beta2_power)
    update = m_t_hat / (tf.sqrt(v_t_hat) + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    ratio = 1.0
    if self._do_layer_adaptation(var_name):
        if var.shape.ndims > 1 and var.shape[0] == 24:
            # Variables whose leading dimension is 24 get one norm per
            # leading index (reduced over all other axes, dims kept).
            # NOTE(review): 24 looks like a model-specific size - confirm.
            reduce_axes = range(1, var.shape.ndims)
            w_norm = tf.norm(var, 2, reduce_axes, True)
            g_norm = tf.norm(update, 2, range(1, var.shape.ndims), True)
        else:
            w_norm = tf.norm(var, ord=2)
            g_norm = tf.norm(update, ord=2)
        # Fall back to 1.0 whenever either norm is zero.
        ratio = tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(g_norm, 0), (w_norm / g_norm), 1.0),
            1.0)

    new_value = var - ratio * lr_t * update
    return tf.assign(var, new_value, use_locking=self._use_locking).op
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Apply one sparse-gradient update to `var` and its "m"/"v" slots.

    Args:
      grad: gradient values for the rows selected by `indices`.
      var: the variable being updated.
      indices: indices passed to `scatter_add` together with the scaled
        gradient values.
      scatter_add: callable `(slot, indices, values)` used to add the scaled
        gradient values into a slot.

    Returns:
      A grouped op covering the variable update and both slot updates.
    """
    # Cast all hyperparameters to the variable's base dtype.
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = tf.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = tf.cast(beta2_power, var.dtype.base_dtype)
    lr_t = tf.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = tf.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = tf.cast(self._epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = tf.cast(self._weight_decay_rate_t, var.dtype.base_dtype)
    # m_t = beta1 * m + (1 - beta1) * g_t
    # Done in two steps: decay the whole slot, then (only after the decay has
    # run) scatter-add the scaled gradient values.
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = tf.assign(m, m * beta1_t, use_locking=self._use_locking)
    with tf.control_dependencies([m_t]):
        m_t = scatter_add(m, indices, m_scaled_g_values)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    # Same decay-then-scatter sequence as for m.
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = tf.assign(v, v * beta2_t, use_locking=self._use_locking)
    with tf.control_dependencies([v_t]):
        v_t = scatter_add(v, indices, v_scaled_g_values)
    # ==== The following is with m_t_hat and v_t_hat
    # (bias-corrected first and second moments)
    m_t_hat = m_t / (1. - beta1_power)
    v_t_hat = v_t / (1. - beta2_power)
    v_sqrt = tf.sqrt(v_t_hat)
    update = m_t_hat / (v_sqrt + epsilon_t)
    # ==== The following is the original LAMBOptimizer implementation
    # v_sqrt = tf.sqrt(v_t_hat)
    # update = m_t / (v_sqrt + epsilon_t)
    var_name = self._get_variable_name(var.name)
    # Weight decay is added to the update itself, for selected variables only.
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var
    # Layer-wise ratio ||w|| / ||update||; stays 1.0 when the layer is not
    # adapted or when either norm is zero.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
        w_norm = tf.norm(var, ord=2)
        g_norm = tf.norm(update, ord=2)
        ratio = tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(g_norm, 0), (w_norm / g_norm), 1.0),
            1.0)
    var_update = tf.assign_sub(var, ratio * lr_t * update, use_locking=self._use_locking)
    return tf.group(*[var_update, m_t, v_t])
def pgd(model_fn, inputs, optimizer=None, layer_name='word_embeddings', epsilon=0.05, n_loop=2):
    """Build multi-step adversarial training ops that perturb one layer.

    Args:
      model_fn: callable invoked as `model_fn(inputs, True)`; its result is
        indexed with 'loss'.
      inputs: forwarded unchanged to `model_fn`.
      optimizer: forwarded to `utils.compute_gradients`.
      layer_name: passed to `utils.find_grad_and_var` to select the variable
        that is perturbed (and its gradient).
      epsilon: upper bound on the norm of the accumulated perturbation.
      n_loop: number of perturbation steps; each step contributes a
        normalized gradient scaled by `1 / n_loop`.

    Returns:
      `AdversarialOutput(model_outputs, grads_and_vars)` where
      `model_outputs` comes from the unperturbed pass and `grads_and_vars`
      is `utils.average_grads_and_vars` over the clean gradients and the
      gradients of the last perturbed pass.
    """
    # Clean forward/backward pass.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        model_outputs = model_fn(inputs, True)
        grads_and_vars = utils.compute_gradients(model_outputs['loss'], optimizer)
    # acc_r tracks the total perturbation applied so far; attack_op chains
    # the steps so each pass is built under a dependency on the previous
    # assign.
    acc_r = 0.0
    attack_op = tf.no_op()
    for k in range(n_loop):
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE), tf.control_dependencies(
                [attack_op]):
            adv_outputs = model_fn(inputs, True)
            attack_grad_and_vars = utils.compute_gradients(
                adv_outputs['loss'], optimizer)
            embedding_gradients, embeddings = utils.find_grad_and_var(
                attack_grad_and_vars, layer_name)
            # Step direction: normalized gradient (1e-9 guards a zero norm).
            tmp_r = tf.multiply(
                1 / n_loop,
                embedding_gradients / (tf.norm(embedding_gradients) + 1e-9))
            # Rescale the accumulated perturbation to norm epsilon if the
            # new total would exceed it.
            norm = tf.norm(acc_r + tmp_r)
            cur_r = tf.cond(norm > epsilon,
                            lambda: (acc_r + tmp_r) * tf.divide(epsilon, norm),
                            lambda: (acc_r + tmp_r))
            r = cur_r - acc_r  # calculate current step
            attack_op = embeddings.assign(embeddings + r)
            acc_r = cur_r
    # restore
    # One more pass under the final perturbation, then subtract the whole
    # accumulated perturbation from the variable again.
    # NOTE(review): with n_loop == 0, acc_r is still the float 0.0 here.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE), tf.control_dependencies(
            [attack_op]):
        attack_outputs = model_fn(inputs, True)
        attack_grad_and_vars = utils.compute_gradients(attack_outputs['loss'],
                                                       optimizer)
        embedding_gradients, embeddings = utils.find_grad_and_var(
            attack_grad_and_vars, layer_name)
        restore_op = embeddings.assign(embeddings - acc_r)
    # sum up
    # The combined gradients are built under a dependency on the restore.
    with tf.control_dependencies([restore_op]):
        grads_and_vars = utils.average_grads_and_vars(
            [grads_and_vars, attack_grad_and_vars])
    return AdversarialOutput(model_outputs, grads_and_vars)
def learning(ACS,target_input,Rx, Ry,sess, ACS_dim_X, ACS_dim_Y, ACS_dim_Z, target_dim_X,target_dim_Y,target_dim_Z, target, kernel_x_1, kernel_x_2, kernel_y_1, kernel_y_2, layer1_channels, layer2_channels, kernel_last_x, kernel_last_y, LearningRate, MaxIteration):
    """Fit a three-layer dilated-convolution network mapping ACS to target.

    Builds the graph, trains with Adam under an exponentially decayed
    learning rate for ``MaxIteration + 1`` steps (printing the loss every 100
    steps) and returns the three trained kernels as evaluated by ``sess``.
    ``target_input`` and ``target_dim_Z`` are accepted but not used.
    """
    # `target` must be 4-D; only its last dimension is needed.
    _, _, _, target_dim3 = np.shape(target)

    input_ACS = tf.placeholder(
        tf.float32, [1, ACS_dim_X, ACS_dim_Y, ACS_dim_Z])
    input_Target = tf.placeholder(
        tf.float32, [1, target_dim_X, target_dim_Y, target_dim3])
    Input = tf.reshape(input_ACS, [1, ACS_dim_X, ACS_dim_Y, ACS_dim_Z])

    # Layer 1 has no nonlinearity, layer 2 uses ReLU, layer 3 is linear.
    W_conv1 = weight_variable(
        [kernel_x_1, kernel_y_1, ACS_dim_Z, layer1_channels], 'W1')
    h_conv1 = conv2d_dilate(Input, W_conv1, Rx, Ry)

    W_conv2 = weight_variable(
        [kernel_x_2, kernel_y_2, layer1_channels, layer2_channels], 'W2')
    h_conv2 = tf.nn.relu(conv2d_dilate(h_conv1, W_conv2, Rx, Ry))

    W_conv3 = weight_variable(
        [kernel_last_x, kernel_last_y, layer2_channels, target_dim3], 'W3')
    h_conv3 = conv2d_dilate(h_conv2, W_conv3, Rx, Ry)

    # Loss: average of the 2-norm and 1-norm of the residual plus a weighted
    # L2 penalty on the three kernels.
    residual = input_Target - h_conv3
    data_term = (tf.norm(residual, ord=2) + tf.norm(residual, ord=1))*0.5
    weight_term = 0.2*(tf.nn.l2_loss(W_conv1)
                       + 0.9*tf.nn.l2_loss(W_conv2)
                       + 0.8*tf.nn.l2_loss(W_conv3))
    error_norm = data_term + weight_term

    # The step counter is not advanced by the optimizer; its value is fed
    # explicitly on every iteration below.
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(
        LearningRate, global_step=global_step, decay_steps=50,
        decay_rate=0.95)
    train_step = tf.train.AdamOptimizer(lr).minimize(error_norm)

    # Pick the initializer API based on the TensorFlow version.
    version_parts = (tf.__version__).split('.')
    if int(version_parts[1]) < 12 and int(version_parts[0]) < 1:
        init = tf.initialize_all_variables()
    else:
        init = tf.global_variables_initializer()
    sess.run(init)

    for i in range(MaxIteration+1):
        sess.run(train_step,
                 feed_dict={input_ACS: ACS, input_Target: target,
                            global_step: i})
        if i % 100 == 0:
            error_now = sess.run(
                error_norm,
                feed_dict={input_ACS: ACS, input_Target: target})
            print('The',i,'th iteration gives an error',error_now)

    return sess.run([W_conv1,W_conv2,W_conv3])
def compute_x(self, param_name, param, m, prev_w_norm, prev_eta, prev_beta):
    """Reconstruct the previous x value instead of storing it in a slot.

    Keeping x as a slot would double the memory taken by the parameters, so
    it is recomputed from the current state.

    Args:
      param_name: Name of the parameter; decides whether layer adaptation
        applies.
      param: The parameter `Tensor`.
      m: Accumulated momentum `Tensor`, same shape as param.
      prev_w_norm: Scalar norm of the param tensor at the previous iteration.
      prev_eta: Scalar learning rate applied at the previous iteration.
      prev_beta: Scalar momentum applied at the previous iteration.

    Returns:
      x: An intermediate `Tensor` shaped like param, used for the final
        update.
    """
    if self._do_layer_adaptation(param_name):
        prev_g_norm = tf.norm(m, ord=2)
        # gamma * ||w|| / ||m||, with the quotient replaced by 1.0 when
        # either norm is zero.
        prev_ratio = self.gamma * tf.where(
            tf.math.greater(prev_w_norm, 0),
            tf.where(tf.math.greater(prev_g_norm, 0),
                     (prev_w_norm / prev_g_norm), 1.0),
            1.0)
    else:
        prev_ratio = 1.0

    scaled_m = prev_ratio * prev_eta * m
    correction = tf.divide(tf.multiply(prev_beta, scaled_m), prev_beta - 1.0)
    return param - correction
def _apply_dense(self, grad, var):
    """Record per-variable statistics; the real update happens in _finish.

    Stores, keyed by `var`: the gradient, the dot product of the betting
    fraction with the gradient, the resulting wealth delta, and the
    gradient's 1-norm. `_finish` later combines these into global statistics
    (e.g. the L1 norm of the full gradient). Returns a no-op.
    """
    self.grads[var] = grad

    fraction = self.get_slot(var, OUTER_BETTING_FRACTION)
    self.betting_fraction_dot_product_deltas[var] = tf.reduce_sum(
        fraction * grad)

    # Wealth grows by -g . w, and since w = Wealth * v for betting fraction
    # v, the increment equals -(g . v) * Wealth.
    # TODO(cutkosky): at one point there was a bug in which epsilon
    # was not added here. It seemed performance may have degraded
    # somewhat after fixing this. Find out why this would be.
    dot_delta = self.betting_fraction_dot_product_deltas[var]
    self.wealth_deltas[var] = -dot_delta * self._get_non_slot(OUTER_WEALTH)

    self.grad_norms[var] = tf.norm(grad, 1)
    return tf.no_op()
def dense_weightnorm(name, x, n_out, x_mask, init_scale, init, dtype=tf.float32):
    """Weight-normalized dense layer: y = x (g * v / ||v||) + b.

    The direction `v` is column-normalized. Gain `g` and bias `b` are
    created through `get_variable_ddi`, which receives initial values
    derived from the masked moments of `x v`.
    """
    n_in = common_layers.shape_list(x)[2]
    eps = tf.keras.backend.epsilon()

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        direction = tf.get_variable(
            "v", [n_in, n_out], dtype,
            initializer=tf.random_normal_initializer(0, 0.05),
            trainable=True)
        direction = direction / tf.norm(direction, axis=0, keepdims=True)

        # Pre-activation statistics with unit gain.
        projected = tf.matmul(x, direction)  # [B, L, n_out]
        mean, var = moments_over_bl(projected, x_mask)
        g_init = init_scale / (tf.sqrt(var) + eps)

        g = get_variable_ddi(
            "g", [n_out], g_init, init,
            initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
        b = get_variable_ddi(
            "b", [n_out], -mean * g_init, init,
            initializer=tf.zeros_initializer, dtype=dtype, trainable=True)

        y = tf.matmul(x, g * direction) + b
        tf.summary.histogram("_g", g)
        return y
def sobel_edges(images):
    """Return the per-pixel Sobel gradient magnitude of the grayscale images."""
    batch_size, h, w, _ = images.shape.as_list()
    gray = tf.image.rgb_to_grayscale(images)
    gradients = tf.image.sobel_edges(gray)
    # Collapse to the two Sobel components per pixel.
    gradients = tf.reshape(gradients, (batch_size, h, w, 2))
    return tf.norm(gradients, ord="euclidean", axis=-1)
def compute_lr(self, grad, var):
    """Return the layer-wise scaled learning rate and the decayed gradient.

    Variables whose name contains any entry of ``self._skip_list`` are left
    untouched: the base learning rate and the gradient are returned as is.
    For all others the learning rate is multiplied by a trust ratio and the
    weight-decay term is added to the gradient.
    """
    skip_list = self._skip_list
    skipped = skip_list is not None and any(
        pattern in var.name for pattern in skip_list)
    if skipped:
        return self._learning_rate, grad

    w_norm = tf.norm(var, ord=2)
    g_norm = tf.norm(grad, ord=2)
    # eeta * ||w|| / (||g|| + wd * ||w|| + eps); 1.0 if either norm is zero.
    trust_ratio = tf.where(
        tf.math.greater(w_norm, 0),
        tf.where(tf.math.greater(g_norm, 0),
                 (self._eeta * w_norm /
                  (g_norm + self._weight_decay * w_norm + self._epsilon)),
                 1.0),
        1.0)
    scaled_lr = self._learning_rate * trust_ratio
    # Add the weight regularization gradient
    grad = grad + self._weight_decay * var
    return scaled_lr, grad
def __call__(self, codes):
    """Snap every code to its nearest codebook entry.

    Args:
      codes: A `float`-like `Tensor` of latent vectors, rank-3 with shape
        `[batch_size, latent_size, code_size]`.

    Returns:
      nearest_codebook_entries: For each code, the codebook entry at the
        smallest Euclidean distance.
      one_hot_assignments: One-hot vectors marking which codebook entry was
        selected for each code.
    """
    broadcast_shape = [1, 1, self.num_codes, self.code_size]

    # Distance from every code to every codebook entry.
    offsets = (tf.expand_dims(codes, 2)
               - tf.reshape(self.codebook, broadcast_shape))
    distances = tf.norm(tensor=offsets, axis=3)

    assignments = tf.argmin(input=distances, axis=2)
    one_hot_assignments = tf.one_hot(assignments, depth=self.num_codes)

    # Select the chosen entry by masking the codebook with the one-hot code.
    selected = (tf.expand_dims(one_hot_assignments, -1)
                * tf.reshape(self.codebook, broadcast_shape))
    nearest_codebook_entries = tf.reduce_sum(input_tensor=selected, axis=2)
    return nearest_codebook_entries, one_hot_assignments
def test_use_resolution(self, is_training, use_resolution):
    """Gradients reach only the input resolutions flagged in use_resolution.

    Builds a GlimpseNetwork with three resolutions, differentiates the first
    output column w.r.t. each entry of ``endpoints["model_input_list"]`` and
    checks that the gradient norm is positive for used resolutions and
    exactly zero for unused ones.
    """
    config = dram_config.get_config()
    image_shape = (28, 28, 1)
    batch_size = 5
    output_dims = 10
    # Only the values that actually reach the model are set here; earlier
    # assignments that were immediately overwritten, an unused `locations`
    # placeholder and a discarded first model instance have been dropped.
    config.glimpse_model_config.output_dims = output_dims
    config.glimpse_model_config.glimpse_shape = (8, 8)
    config.glimpse_model_config.num_resolutions = 3

    images = tf.random_uniform(
        minval=-1, maxval=1,
        shape=(batch_size,) + image_shape, dtype=tf.float32)
    locations = tf.zeros(shape=(batch_size, 2), dtype=tf.float32)
    model = glimpse_model.GlimpseNetwork(config.glimpse_model_config)
    g, endpoints = model(images, locations, is_training=is_training,
                         use_resolution=use_resolution)

    gnorms = [
        tf.norm(grad)
        for grad in tf.gradients(g[:, 0], endpoints["model_input_list"])
    ]
    self.evaluate(tf.global_variables_initializer())
    gnorms = self.evaluate(gnorms)

    for use, gnorm in zip(use_resolution, gnorms):
        if use:
            self.assertGreater(gnorm, 0.)
        else:
            self.assertEqual(gnorm, 0.)
def rodrigues(r):
    """Convert batched axis-angle vectors to rotation matrices (Rodrigues).

    Parameter:
    ----------
    r: Axis-angle rotation tensor of shape [batch_size, 1, 3].

    Return:
    -------
    Rotation matrix of shape [batch_size, 3, 3].
    """
    # Tiny random jitter keeps the angle away from exactly zero so the
    # division below is safe.
    jitter = tf.random_normal(r.shape, 0, 1e-8, dtype=tf.float64)
    theta = tf.norm(r + jitter, axis=(1, 2), keepdims=True)
    r_hat = r / theta
    cos = tf.cos(theta)

    batch = theta.get_shape().as_list()[0]
    z_stick = tf.zeros(batch, dtype=tf.float64)

    # Skew-symmetric cross-product matrix of the unit axis, row-major.
    axis_x = r_hat[:, 0, 0]
    axis_y = r_hat[:, 0, 1]
    axis_z = r_hat[:, 0, 2]
    m = tf.stack(
        (z_stick, -axis_z, axis_y,
         axis_z, z_stick, -axis_x,
         -axis_y, axis_x, z_stick),
        axis=1)
    m = tf.reshape(m, (-1, 3, 3))

    # Identity broadcast over the batch.
    i_cube = tf.expand_dims(tf.eye(3, dtype=tf.float64), axis=0) + tf.zeros(
        (batch, 3, 3), dtype=tf.float64)

    # Outer product of the unit axis with itself.
    outer = tf.matmul(tf.transpose(r_hat, (0, 2, 1)), r_hat)

    R = cos * i_cube + (1 - cos) * outer + tf.sin(theta) * m
    return R
def obs_cost_fn(obs):
    '''
    Weighted cost built from altitude, distance and angle terms.

    state layout
    0:2 relative_angle
    3:5 angular velocity
    6:8 relative_position
    9:11 velocity
    12:14 acceleration
    '''
    # The altitude term currently carries zero weight.
    w_alt, w_dist, w_ang = 0.0, 0.8, 0.1

    # Altitude: squashed absolute value of state column 8.
    alt_cost = tf.math.tanh(0.05 * tf.abs(obs[:, 8]), name=None)

    # Distance: squashed Euclidean norm of the relative position.
    position_norm = tf.norm(obs[:, 6:9], ord='euclidean', axis=1, name=None)
    dist_cost = tf.math.tanh(0.05 * position_norm, name=None)

    # Angle: squashed mean absolute relative angle.
    mean_abs_angle = tf.math.reduce_mean(tf.abs(obs[:, 0:3]), axis=1)
    ang_cost = tf.math.tanh(mean_abs_angle, name=None)

    return w_alt * alt_cost + w_dist * dist_cost + w_ang * ang_cost
def loss_fn(flo_preds, flo_gt):
    """Multi-scale flow loss plus L2 weight regularization.

    Each prediction is compared against the ground truth resized to the
    prediction's resolution, with flow values divided by the height ratio.
    Per-scale losses are weighted by ``FLAGS.losses_weight``; the L2 term
    covers all trainable variables, scaled by ``FLAGS.gamma``.
    """
    # Multi-scale loss, as described in Sec. 3 of the original paper.
    weighted_scale_losses = 0.
    for flo_pred, weight in zip(flo_preds, FLAGS.losses_weight):
        gt_height = tf.unstack(tf.shape(flo_gt))[1]
        pred_height = tf.unstack(tf.shape(flo_pred))[1]

        target_size = tf.shape(flo_pred)[1:3]
        scaled_flow_gt = tf.image.resize(
            flo_gt, target_size, method=tf.image.ResizeMethod.BILINEAR)
        # Flow vectors shrink with the image, so rescale their magnitude.
        scale = tf.cast(gt_height / pred_height, dtype=tf.float32)
        scaled_flow_gt /= scale

        per_pixel = tf.norm(flo_pred - scaled_flow_gt, ord=2, axis=3)
        per_image = tf.reduce_sum(per_pixel, axis=(1, 2))
        weighted_scale_losses += tf.reduce_mean(per_image) * weight

    # L2 regularization over every trainable variable.
    penalties = [
        FLAGS.gamma * tf.nn.l2_loss(v) for v in tf.trainable_variables()
    ]
    regularization = tf.reduce_sum(penalties)

    return weighted_scale_losses + regularization