def rothk_penalty(self, d_real, d_fake):
    """Build the gradient-norm penalty term ("rothk") for the discriminator.

    The penalty is the squared norm of the discriminator gradient with
    respect to its input, weighted by ``(1 - sigmoid(d_real))**2`` for real
    inputs and by ``sigmoid(d_fake)**2`` for generated samples. The result
    is scaled by ``config.rothk_lambda`` (1 when unset or falsy) and, when
    ``config.rothk_decay`` is truthy, passed through a decay schedule.

    Args:
        d_real: discriminator output for ``self.gan.inputs.x``.
        d_fake: discriminator output for ``self.gan.uniform_sample``.

    Returns:
        The penalty tensor.
    """
    cfg = self.config
    generated = self.gan.uniform_sample
    real_inputs = self.gan.inputs.x

    grad_real = tf.gradients(d_real, [real_inputs])[0]
    grad_fake = tf.gradients(d_fake, [generated])[0]

    # Flatten everything but the leading axis so one norm is taken per row.
    grad_real = tf.reshape(grad_real, [self.ops.shape(grad_real)[0], -1])
    grad_fake = tf.reshape(grad_fake, [self.ops.shape(grad_fake)[0], -1])
    norm_real = tf.norm(grad_real, axis=1, keep_dims=True)
    norm_fake = tf.norm(grad_fake, axis=1, keep_dims=True)

    # When the gradient rows do not line up with the discriminator output
    # rows, fall back to a single norm averaged over the leading axis.
    rows_mismatch = int(norm_real.get_shape()[0]) != int(d_real.get_shape()[0])
    if rows_mismatch:
        print("Condensing along batch for rothk")
        norm_real = tf.reduce_mean(norm_real, axis=0)
        norm_fake = tf.reduce_mean(norm_fake, axis=0)

    real_term = tf.square(norm_real) * tf.square(1-tf.nn.sigmoid(d_real))
    fake_term = tf.square(norm_fake) * tf.square(tf.nn.sigmoid(d_fake))
    loss = real_term + fake_term
    loss = loss * (cfg.rothk_lambda or 1)

    if not cfg.rothk_decay:
        return loss

    # Every schedule setting falls back to a default when unset or falsy.
    decay_function = cfg.decay_function or tf.train.exponential_decay
    decay_steps = cfg.decay_steps or 50000
    decay_rate = cfg.decay_rate or 0.9
    decay_staircase = cfg.decay_staircase or False
    global_step = tf.train.get_global_step()
    return decay_function(loss, global_step, decay_steps, decay_rate, decay_staircase)
def p_norm(tensor,order):
    """Return the p-norm(s) of `tensor` as computed by `tf.norm`.

    `order` is either a single `int`/`float` (one norm tensor is returned)
    or a `list`/`tuple` of orders (a list of norm tensors is returned, one
    per entry and in the same sequence). Any other type raises `ValueError`.
    """
    order_type = type(order)
    if order_type in (int, float):
        return tf.norm(tensor, ord=order)
    if order_type in (list, tuple):
        norms = []
        for order_item in order:
            norms.append(tf.norm(tensor, ord=order_item))
        return norms
    raise ValueError('Unrecognized order of p_norm: %s'%str(order))
def cosineface_losses(embedding, labels, out_num, w_init=None, s=30., m=0.4):
    '''
    Build the additive-cosine-margin (CosFace) logits.

    :param embedding: the input embedding vectors; normalised along axis 1
    :param labels: the input class labels, fed to tf.one_hot. The original
        documentation gives the shape as (batch_size, 1).
        NOTE(review): tf.one_hot appends an axis, so confirm that the shape
        callers pass broadcasts against the (batch, out_num) cosine matrix.
    :param out_num: output class num
    :param w_init: initializer for the class weight matrix
    :param s: scalar scale value, default is 30
    :param m: the margin value, default is 0.4
    :return: the final calculated output; this output is sent into
        tf.nn.softmax directly
    '''
    with tf.variable_scope('cosineface_loss'):
        # Normalise the inputs so their inner product with the (also
        # normalised) class weights is a cosine.
        row_norms = tf.norm(embedding, axis=1, keep_dims=True)
        embedding = tf.div(embedding, row_norms, name='norm_embedding')

        feature_dim = embedding.get_shape().as_list()[-1]
        weights = tf.get_variable(name='embedding_weights', shape=(feature_dim, out_num),
                                  initializer=w_init, dtype=tf.float32)
        column_norms = tf.norm(weights, axis=0, keep_dims=True)
        weights = tf.div(weights, column_norms, name='norm_weights')

        # cos_theta and cos_theta - m
        cos_t = tf.matmul(embedding, weights, name='cos_t')
        cos_t_m = tf.subtract(cos_t, m, name='cos_t_m')

        # The margin is applied only at the labelled class position.
        target_mask = tf.one_hot(labels, depth=out_num, name='one_hot_mask')
        other_mask = tf.subtract(1., target_mask, name='inverse_mask')
        other_logits = s * tf.multiply(cos_t, other_mask)
        target_logits = s * tf.multiply(cos_t_m, target_mask)
        output = tf.add(other_logits, target_logits, name='cosineface_loss_output')
    return output
def find_best_k(X, Z):
    """Pick the k (from 1, 3, 5, 50) with the lowest validation loss.

    For each candidate k the prediction is
    ``calculate_responsibilities(dist, k) @ casted_train_target`` and the
    loss is the norm of ``target - prediction``. Train, validation and test
    losses are printed; only the validation loss drives the selection.

    Reads the module-level names ``trainData``/``validData``/``testData``,
    ``trainTarget``/``validTarget``/``testTarget`` and
    ``casted_train_target``.

    Args:
        X: placeholder fed with the training data.
        Z: placeholder fed with the query points being predicted.

    Returns:
        ``(best_k, best_valid_loss)``.
    """
    best_k = 1
    best_valid_loss = float("inf")

    # One session for the whole search, always closed. Previously a new
    # InteractiveSession was opened per k and never released.
    sess = tf.InteractiveSession()
    try:
        # The pairwise distances do not depend on k, so build them once.
        dist = calculate_euclidean_distance(X, Z)
        for k in [1, 3, 5, 50]:
            r = calculate_responsibilities(dist, k=k)
            prediction = tf.matmul(r, casted_train_target)

            train_loss_op = tf.norm(trainTarget - prediction)
            valid_loss_op = tf.norm(validTarget - prediction)
            test_loss_op = tf.norm(testTarget - prediction)

            train_loss = sess.run(train_loss_op, feed_dict={X: trainData, Z: trainData})
            valid_loss = sess.run(valid_loss_op, feed_dict={X: trainData, Z: validData})
            test_loss = sess.run(test_loss_op, feed_dict={X: trainData, Z: testData})

            print("Training/Validation/Testing loss for k={:d} is {:f}/{:f}/{:f}"
                  .format(k, train_loss, valid_loss, test_loss))

            if valid_loss < best_valid_loss:
                best_k = k
                best_valid_loss = valid_loss
    finally:
        sess.close()
    return best_k, best_valid_loss
def tf_summary(self):
    """Register scalar summaries for the cost and parameter norms, then merge
    all summaries and open a FileWriter on ``self.result_dir``.

    Sets ``self.summary`` (the merged summary op) and ``self.sw`` (the
    writer, bound to ``self.sess.graph``).
    """
    tf.summary.scalar('cost', self.cost)

    # Euclidean norm over the last two axes, i.e. the Frobenius norm of W.
    weight_fnorm = tf.norm(self.W, ord='euclidean', axis=[-2,-1])
    tf.summary.scalar('w_fnorm', weight_fnorm)

    for tag, p in (('b_1norm', 1), ('b_2norm', 2)):
        tf.summary.scalar(tag, tf.norm(self.b, ord=p))

    self.summary = tf.summary.merge_all()
    # for saving in the epoch/iteration
    self.sw = tf.summary.FileWriter(self.result_dir, self.sess.graph)
def _l1_loss(self, hparams):
    """Sum the weighted L1 norms of the embedding and layer parameters.

    Each tensor in ``self.embed_params`` contributes
    ``hparams.embed_l1 * ||param||_1`` and each tensor in
    ``self.layer_params`` contributes ``hparams.layer_l1 * ||param||_1``.

    Returns:
        A float32 tensor of shape [1] holding the accumulated penalty.
    """
    total = tf.zeros([1], dtype=tf.float32)

    # embedding layer L1 penalty
    for weight in self.embed_params:
        penalty = tf.multiply(hparams.embed_l1, tf.norm(weight, ord=1))
        total = tf.add(total, penalty)

    # hidden layer L1 penalty
    for weight in self.layer_params:
        penalty = tf.multiply(hparams.layer_l1, tf.norm(weight, ord=1))
        total = tf.add(total, penalty)

    return total
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply one combined update built from two wrapped optimizers.

    The update runs as a chain of control-dependent ops:

    1. copy the current variables (and wrapped-optimizer variables) to tmp
    2. step ``self.optimizer`` from the current weights
    3. remember the result as ``xt``
    4. reset the weights to the stored ``zt`` slot values
    5. step ``self.optimizer2`` from ``zt``
    6. store the new weights as ``zt``
    7. set the weights to ``xt + St1 * (zt_new - xt)`` where
       ``St1 = min(1, ||zt_new - zt|| / ||zt_new - xt||)`` per variable

    Args:
        grads_and_vars: list of (gradient, variable) pairs. It is iterated
            several times and ``.copy()`` is called on it, so it must be a
            list rather than a one-shot iterator.
        global_step: forwarded to both wrapped optimizers.
            NOTE(review): both wrapped ``apply_gradients`` calls receive it,
            so it is presumably incremented twice per step -- confirm.
        name: forwarded to both wrapped optimizers. Slot-name loop variables
            used to overwrite this argument, so the wrapped optimizers got
            the last slot name instead; they now get the caller's value.

    Returns:
        A no-op that depends on the whole update chain.
    """
    var_list = [v for _, v in grads_and_vars]
    with ops.init_scope():
        # Make sure a "zt" slot exists for every trained variable ...
        for _, v in grads_and_vars:
            self._get_or_make_slot(v, v, "zt", self._name)
        # ... and for every variable owned by the wrapped optimizer.
        for _slot_name in self.optimizer.get_slot_names():
            for var in self.optimizer.variables():
                self._get_or_make_slot(var, var, "zt", "zt")
    self._prepare()

    def _name(post, s):
        # "scope/var:0" -> "scope/var_<post>_dontsave"
        ss = s.split(":")
        return ss[0] + "_" + post + "_dontsave"

    zt = [self.get_slot(v, "zt") for _, v in grads_and_vars]
    xt = [tf.Variable(v, name=_name("gigaxt", v.name)) for _, v in grads_and_vars]
    tmp = [tf.Variable(v, name=_name("gigatmp", v.name)) for _, v in grads_and_vars]

    xslots_list = []
    zslots_list = []
    tmpslots_list = []
    slots_vars = []
    # As before, the wrapped optimizer's variables are visited once per
    # slot name (the slot name itself is not used inside the loop).
    for _slot_name in self.optimizer.get_slot_names():
        for var in self.optimizer.variables():
            slots_vars += [var]
            xslots_list.append(tf.Variable(var))
            zslots_list.append(self._get_or_make_slot(var, var, "zt", "zt"))
            tmpslots_list.append(tf.Variable(var, name=_name("gigaslottmp", var.name)))

    restored_vars = var_list + slots_vars
    zt_vars = zt + zslots_list
    xt_vars = xt + xslots_list
    tmp_vars = tmp + tmpslots_list

    graph = tf.get_default_graph()

    # store variables for resetting
    op1 = tf.group(*[tf.assign(w, v) for w, v in zip(tmp_vars, restored_vars)])  # store tmp_vars
    with graph.control_dependencies([op1]):
        op2 = self.optimizer.apply_gradients(grads_and_vars.copy(), global_step=global_step, name=name)
        with graph.control_dependencies([op2]):
            op3 = tf.group(*[tf.assign(w, v) for w, v in zip(xt_vars, restored_vars)])  # store xt^+1 in xt_vars
            with graph.control_dependencies([op3]):
                op4 = tf.group(*[tf.assign(w, v) for w, v in zip(restored_vars, zt_vars)])  # restore vars to zt (different weights)
                with graph.control_dependencies([op4]):
                    op5 = self.optimizer2.apply_gradients(grads_and_vars.copy(), global_step=global_step, name=name)  # zt+1
                    with graph.control_dependencies([op5]):
                        zt1_xt1 = [_restored_vars - _xt1_vars
                                   for _restored_vars, _xt1_vars in zip(restored_vars, xt_vars)]
                        # NOTE(review): the denominator is zero whenever both
                        # optimizers land on the same point -- confirm this
                        # cannot happen or add an epsilon.
                        St1 = [tf.minimum(1.0, tf.norm(_zt1_vars-_zt_vars) / tf.norm(_zt1_xt1))
                               for _zt1_vars, _zt_vars, _zt1_xt1 in zip(restored_vars, zt_vars, zt1_xt1)]
                        self.gan.add_metric('st1', tf.reduce_mean(tf.add_n(St1)/len(St1)))
                        nextw = [_xt_t1 + _St1 * _zt1_xt1
                                 for _xt_t1, _St1, _zt1_xt1 in zip(xt_vars, St1, zt1_xt1)]
                        op6 = tf.group(*[tf.assign(w, v) for w, v in zip(zt_vars, restored_vars)])  # set zt+1
                        with graph.control_dependencies([op6]):
                            op7 = tf.group(*[tf.assign(w, v) for w, v in zip(restored_vars, nextw)])  # set xt+1
                            with graph.control_dependencies([op7]):
                                return tf.no_op()
def s_norm(tensor,order):
    """Return the singular values of `tensor` and their norm(s).

    The singular values come from `tf.svd(tensor, full_matrices=False)`.
    `order` is either a single `int`/`float` (one norm tensor) or a
    `list`/`tuple` of orders (a list of norm tensors). Any other type raises
    `ValueError`, after the SVD op has already been created.

    Returns a `(singular_values, norms)` pair.
    """
    singular_values = tf.svd(tensor, full_matrices=False)[0]
    order_type = type(order)
    if order_type in (int, float):
        norms = tf.norm(singular_values, ord=order)
    elif order_type in (list, tuple):
        norms = []
        for order_item in order:
            norms.append(tf.norm(singular_values, ord=order_item))
    else:
        raise ValueError('Unrecognized order of s_norm: %s'%str(order))
    return singular_values, norms
def __tensor_norm__(self,tensor,order):
    """Compute a norm of `tensor` selected by the string `order`.

    * ``'Si'``  -- infinity norm of the singular values (Schatten inf norm)
    * ``'S<p>'`` -- p-norm of the singular values (Schatten norm), with
      ``<p>`` parsed by ``int``
    * anything else -- ``int(order)`` is used as the order of a plain
      `tf.norm` over `tensor`
    """
    is_schatten_inf = order in ('Si',)
    if is_schatten_inf or order[0] == 'S':
        singular_values = tf.svd(tensor, full_matrices=False)[0]
        sub_order = np.inf if is_schatten_inf else int(order[1:])
        return tf.norm(singular_values, ord=sub_order)
    return tf.norm(tensor, ord=int(order))
def image_encoder(image_feat, hparams, name="image_encoder", save_weights_to=None, make_image_summary=True):
    """A stack of self attention layers.

    Each layer runs multi-head self-attention and then a dense-relu-dense
    feed-forward block, both wrapped in layer_preprocess / layer_postprocess.
    The last-axis norm of every intermediate tensor is recorded in the
    "norms" collection.

    Args:
        image_feat: input image features.
        hparams: hyper-parameters. `image_hidden_size`, `image_filter_size`
            and `num_encoder_layers` fall back to `hidden_size`,
            `filter_size` and `num_hidden_layers` when falsy.
        name: variable scope wrapping the whole stack.
        save_weights_to: forwarded to multihead_attention.
        make_image_summary: forwarded to multihead_attention.

    Returns:
        layer_preprocess applied to the output of the final layer.
    """
    x = image_feat
    image_hidden_size = hparams.image_hidden_size or hparams.hidden_size
    image_filter_size = hparams.image_filter_size or hparams.filter_size
    with tf.variable_scope(name):
        for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # The two None arguments are passed positionally after
                    # the query; presumably memory and bias -- TODO confirm
                    # against vqa_layers.multihead_attention.
                    y = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        None,
                        hparams.attention_key_channels or image_hidden_size,
                        hparams.attention_value_channels or image_hidden_size,
                        image_hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.image_self_attention_type,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        scale_dotproduct=hparams.scale_dotproduct,
                    )
                    utils.collect_named_outputs(
                        "norms", "image_feat_self_attention_%d"%(layer),
                        tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "image_feat_self_attention_postprocess_%d"%(layer),
                        tf.norm(x, axis=-1))
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        image_filter_size,
                        image_hidden_size,
                        dropout=hparams.relu_dropout,
                    )
                    utils.collect_named_outputs(
                        "norms", "image_feat_ffn_%d"%(layer),
                        tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "image_feat_ffn_postprocess_%d"%(layer),
                        tf.norm(x, axis=-1))
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
def body(self, features): hp = self.hparams # pylint: disable=eval-used if hp.image_input_type == "image": image_feat = vqa_layers.image_embedding( features["inputs"], model_fn=eval(hp.image_model_fn), trainable=hp.train_resnet, is_training=hp.mode == tf.estimator.ModeKeys.TRAIN) else: image_feat = features["inputs"] image_feat = common_layers.flatten4d3d(image_feat) image_feat = common_layers.dense(image_feat, hp.hidden_size) utils.collect_named_outputs("norms", "image_feat_after_proj", tf.norm(image_feat, axis=-1)) question = common_layers.flatten4d3d(features["question"]) utils.collect_named_outputs("norms", "question_embedding", tf.norm(question, axis=-1)) (encoder_input, encoder_self_attention_bias, encoder_decoder_attention_bias) = prepare_image_question_encoder( image_feat, question, hp) encoder_input = tf.nn.dropout( encoder_input, keep_prob=1.-hp.layer_prepostprocess_dropout) encoder_output, _ = recurrent_transformer_decoder( encoder_input, None, encoder_self_attention_bias, None, hp, name="encoder") utils.collect_named_outputs( "norms", "encoder_output", tf.norm(encoder_output, axis=-1)) # scale query by sqrt(hidden_size) query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size **0.5 query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0) batch_size = common_layers.shape_list(encoder_input)[0] query = tf.tile(query, [batch_size, 1, 1]) query = tf.nn.dropout( query, keep_prob=1.-hp.layer_prepostprocess_dropout) decoder_output, _ = recurrent_transformer_decoder( query, encoder_output, None, encoder_decoder_attention_bias, hp, name="decoder") utils.collect_named_outputs("norms", "decoder_output", tf.norm(decoder_output, axis=-1)) norm_tensors = utils.convert_collection_to_dict("norms") vqa_layers.summarize_tensors(norm_tensors, tag="norms/") # Expand dimension 1 and 2 return tf.expand_dims(decoder_output, axis=1)
def project_gradient_layer(gs):
    """Rescale the gradient tensor `gs` according to `self.config.norm`.

    * ``'softmax'``   -- softmax of `gs`
    * ``'euclidean'`` -- divide by sqrt(sum(gs**2))
    * ``'inf'``       -- divide by the infinity norm
    * ``'max'``       -- divide by max(|gs|)
    * ``False``       -- return `gs` untouched
    * anything else   -- used as `ord` for `tf.norm`, then divide

    Every divisor has 1e-8 added to avoid division by zero.
    """
    norm = self.config.norm
    if norm == 'softmax':
        return tf.nn.softmax(gs)

    if norm == 'euclidean':
        magnitude = tf.sqrt(tf.reduce_sum(tf.square(gs)))
    elif norm == 'inf':
        magnitude = tf.norm(gs, ord=np.inf)
    elif norm == 'max':
        magnitude = tf.reduce_max(tf.abs(gs))
    elif norm == False:  # equality (not identity) so that 0 also disables scaling
        return gs
    else:
        magnitude = tf.norm(gs, ord=norm)
    return gs / (magnitude+1e-8)
def _cross_l_loss(self):
    """Build the L1-norm and L2-norm penalty on the cross network parameters.

    Every tensor in ``self.cross_params`` contributes
    ``cross_l1 * ||param||_1 + cross_l2 * ||param||_2`` with the
    coefficients read from ``self.hparams``.

    Returns:
        obj: Regularisation loss on the cross network parameters, a float32
        tensor of shape [1].
    """
    total = tf.zeros([1], dtype=tf.float32)
    for weight in self.cross_params:
        l1_term = tf.multiply(self.hparams.cross_l1, tf.norm(weight, ord=1))
        total = tf.add(total, l1_term)
        l2_term = tf.multiply(self.hparams.cross_l2, tf.norm(weight, ord=2))
        total = tf.add(total, l2_term)
    return total
def nearest(x, means, hparams):
    """Find the nearest means to elements in x.

    Distances are squared Euclidean, expanded as
    ``||x||^2 + ||m||^2 - 2 * x.m``. The norm terms must be squared for this
    identity to hold; with plain norms (the previous code) the ranking of
    the means is wrong whenever they have different lengths.

    Args:
        x: tensor whose last axis has size `hparams.hidden_size`.
        means: tensor of candidate vectors, one per row.
            NOTE(review): presumably [hparams.v_size, hparams.hidden_size]
            -- confirm against the caller.
        hparams: provides `hidden_size` and `v_size`.

    Returns:
        One-hot encoding of the closest mean for every element of `x`, with
        the shape of `x` except that the last axis has size
        `hparams.v_size`. No gradient flows through it.
    """
    x, means = tf.stop_gradient(x), tf.stop_gradient(means)
    x_flat = tf.reshape(x, [-1, hparams.hidden_size])
    x_norm_sq = tf.reduce_sum(tf.square(x_flat), axis=-1, keep_dims=True)
    means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keep_dims=True)
    dist = x_norm_sq + tf.transpose(means_norm_sq) - 2 * tf.matmul(x_flat, means, transpose_b=True)
    # top_k on the negated distances picks the smallest distance.
    _, nearest_idx = tf.nn.top_k(- dist, k=1)
    nearest_hot = tf.one_hot(tf.squeeze(nearest_idx, axis=1), hparams.v_size)
    shape = common_layers.shape_list(x)
    shape[-1] = hparams.v_size
    nearest_hot = tf.reshape(nearest_hot, shape=shape)
    return tf.stop_gradient(nearest_hot)
def _make_activity_op(self, input_tensor):
    """ Creates the op for calculating the activity of a SOM

    :param input_tensor: A tensor to calculate the activity of. Must be of shape `[batch_size, dim]`
        where `dim` is the dimensionality of the SOM's weights.
    :return: A handle to the newly created activity op.
    """
    with self._graph.as_default():
        with tf.name_scope("Activity"):
            # This constant controls the width of the gaussian.
            # The closer to 0 it is, the wider it is.
            c = tf.constant(self._c, dtype="float32")

            # Get the euclidean distance between each neuron and the input vectors.
            # The norm is reduced over the last (feature) axis only. Without an
            # explicit axis tf.norm collapses the whole tensor to one scalar,
            # which contradicts the [batch_size, neurons] shape that the
            # softmax below relies on.
            # NOTE(review): assumes self._weights is [neurons, dim] -- confirm.
            difference = tf.subtract(
                tf.expand_dims(self._weights, axis=0),
                tf.expand_dims(input_tensor, axis=1))
            dist = tf.norm(difference, axis=-1, name="Distance")  # [batch_size, neurons]

            # Calculate the Gaussian of the activity. Units with distances closer to 0 will have activities
            # closer to 1.
            activity = tf.exp(tf.multiply(tf.pow(dist, 2), c), name="Gaussian")

            # Convert the activity into a softmax probability distribution
            if self._softmax_activity:
                activity = tf.divide(tf.exp(activity),
                                     tf.expand_dims(tf.reduce_sum(tf.exp(activity), axis=1), axis=-1),
                                     name="Softmax")

            return tf.identity(activity, name="Output")
def dia(model, config, scope, connectsegment, connectfeature):
    """Add a block to `model` that scores pairwise similarity of selected features.

    All entries are stored in the `model` dict under keys prefixed with
    `scope`. The block reads the `*_out?length`, `*_maxout2length` and
    `*_outputs` entries of two earlier blocks (`connectsegment` and
    `connectfeature`), picks the top `speaker_size` positions of the segment
    output for every batch element, gathers the feature rows at those
    positions, L2-normalises them and stores `0.5 + 0.5 * score . score^T`
    as `<scope>_outputs`.

    Args:
        model: dict of tensors and lengths; updated in place and returned.
        config: config object; `getint('global', 'speaker_size')` is read.
        scope: name used for the variable/name scope and as key prefix.
        connectsegment: key prefix of the block providing segment scores.
        connectfeature: key prefix of the block providing features.

    Returns:
        The same `model` dict.
    """
    with tf.variable_scope(scope), tf.name_scope(scope):
        with tf.variable_scope('inputs'), tf.name_scope('inputs'):
            model['%s_in0length_segment' %scope] = model['%s_out0length' %connectsegment]
            model['%s_in1length_segment' %scope] = model['%s_out1length' %connectsegment]
            model['%s_in2length_segment' %scope] = model['%s_out2length' %connectsegment]
            model['%s_maxin2length_segment' %scope] = model['%s_maxout2length' %connectsegment]
            model['%s_in0length_feature' %scope] = model['%s_out0length' %connectfeature]
            model['%s_in1length_feature' %scope] = model['%s_out1length' %connectfeature]
            model['%s_in2length_feature' %scope] = model['%s_out2length' %connectfeature]
            model['%s_maxin2length_feature' %scope] = model['%s_maxout2length' %connectfeature]
            # Drop axis 2 of the segment output.
            model['%s_inputs_segment' %scope] = tf.squeeze(model['%s_outputs' %connectsegment], 2, '%s_inputs_segment' %scope)
            # Swap the first two axes, then split into a Python list along the new leading axis.
            model['%s_inputs_feature' %scope] = tf.unstack(tf.transpose(model['%s_outputs' %connectfeature], [1, 0, 2]), name = '%s_inputs_feature' %scope)
            model['%s_out0length' %scope] = model['%s_in0length_feature' %scope]
            model['%s_out1length' %scope] = config.getint('global', 'speaker_size')
            # xrange: this block targets Python 2; out0length must be a plain int here.
            model['%s_out2length' %scope] = tf.stack([config.getint('global', 'speaker_size') for _ in xrange(model['%s_out0length' %scope])])
            model['%s_maxout2length' %scope] = config.getint('global', 'speaker_size')

        with tf.variable_scope('outputs'), tf.name_scope('outputs'):
            # Top `speaker_size` values and their positions along the last axis of the transposed segment input.
            model['%s_topsegmentvalues' %scope], model['%s_topsegmentindices' %scope] = tf.nn.top_k(tf.transpose(model['%s_inputs_segment' %scope], [1, 0]), config.getint('global', 'speaker_size'))
            # Per list element: pick the feature rows at the selected positions.
            model['%s_scores' %scope] = [tf.gather(feature, index) for feature, index in zip(model['%s_inputs_feature' %scope], tf.unstack(model['%s_topsegmentindices' %scope]))]
            # Row-wise L2 normalisation (ord=2, axis=1, keep dims).
            # NOTE(review): an all-zero row divides by zero -- confirm inputs cannot be zero.
            model['%s_normalizedscores' %scope] = [tf.divide(score, tf.norm(score, 2, 1, True)) for score in model['%s_scores' %scope]]
            # Map the pairwise inner products of normalised rows from [-1, 1] to [0, 1].
            model['%s_outputs' %scope] = tf.add(0.5, tf.multiply(0.5, tf.stack([tf.matmul(score, score, transpose_b = True) for score in model['%s_normalizedscores' %scope]], name = '%s_outputs' %scope)))

    return model
def build_arch(input, is_train, num_classes):
    """Build the capsule network: conv -> primary capsules -> class capsules.

    Args:
        input: image batch; axis 1 is read as the (square) spatial size and
            the leading axis must equal `cfg.batch_size` for the shape
            assertions to hold.
        is_train: passed as `trainable` to every `slim.conv2d`.
        num_classes: number of output capsules.

    Returns:
        `(output, output_len)`: the class capsules with shape
        `[cfg.batch_size, num_classes, 16]` and their lengths (norm over the
        last axis).
    """
    spatial = int(input.get_shape()[1])

    # Initializers and regularizers are left at the slim defaults.
    with slim.arg_scope([slim.conv2d], trainable=is_train):
        with tf.variable_scope('conv1') as scope:
            output = slim.conv2d(input, num_outputs=256, kernel_size=[9, 9],
                                 stride=1, padding='VALID', scope=scope)
            # A 9x9 VALID convolution with stride 1 trims 8 pixels per side length.
            spatial = spatial - 8
            assert output.get_shape() == [cfg.batch_size, spatial, spatial, 256]
            tf.logging.info('conv1 output shape: {}'.format(output.get_shape()))

        with tf.variable_scope('primary_caps_layer') as scope:
            output = slim.conv2d(output, num_outputs=32*8, kernel_size=[9, 9],
                                 stride=2, padding='VALID', scope=scope)
            # Regroup the 32*8 channels into 8-dimensional capsules.
            output = tf.reshape(output, [cfg.batch_size, -1, 8])
            output = squash(output)
            spatial = (spatial - 8) // 2
            assert output.get_shape() == [cfg.batch_size, spatial*spatial*32, 8]
            tf.logging.info('primary capsule output shape: {}'.format(output.get_shape()))

        with tf.variable_scope('digit_caps_layer') as scope:
            with tf.variable_scope('u') as scope:
                u_hats = vec_transform(output, num_classes, 16)
                assert u_hats.get_shape() == [cfg.batch_size, num_classes, spatial*spatial*32, 16]
                tf.logging.info('digit_caps_layer u_hats shape: {}'.format(u_hats.get_shape()))

            with tf.variable_scope('routing') as scope:
                output = dynamic_routing(u_hats)
                assert output.get_shape() == [cfg.batch_size, num_classes, 16]
                tf.logging.info('the output capsule has shape: {}'.format(output.get_shape()))

        output_len = tf.norm(output, axis=-1)

    return output, output_len
def build_graph(self, left, right, gt_flow):
    """Build the prediction graph and the endpoint-error metric.

    Runs preprocess -> graph_structure -> postprocess on the two inputs and
    exposes the result as the named tensor "prediction". The endpoint error
    is the mean over all positions of the norm of `prediction - gt_flow`
    along axis 1, exposed as the named tensor "epe".

    Nothing is returned; both results are reachable by name in the graph.
    """
    net_input = self.preprocess(left, right)
    raw_flow = self.graph_structure(net_input)
    prediction = self.postprocess(raw_flow)
    tf.identity(prediction, name="prediction")

    # endpoint error
    # NOTE(review): axis=1 presumably is the flow-component axis (NCHW layout) -- confirm.
    error_magnitude = tf.norm(prediction - gt_flow, axis=1)
    tf.reduce_mean(error_magnitude, name='epe')
def step(self, inputs, states):
    """Advance every memory block by one time step.

    `states[0]` is split along axis 1 into `self._num_blocks` blocks; the
    parameters U, V, W and U_bias are shared across blocks. Each block is
    updated with its gated candidate and then rescaled to unit length.

    Returns:
        `(state_next, [state_next])` where `state_next` is the updated
        blocks concatenated back along axis 1.
    """
    blocks = tf.split(states[0], self._num_blocks, axis=1)
    print('state after split', blocks)

    updated_blocks = []
    for j, block in enumerate(blocks):
        key_j = tf.expand_dims(self._keys[j], axis=0)
        gate_j = self.get_gate(block, key_j, inputs)
        candidate_j = self.get_candidate(block, key_j, inputs, self.U, self.V, self.W, self.U_bias)

        # Equation 4: h_j <- h_j + g_j * h_j^~
        updated = block + tf.expand_dims(gate_j, -1) * candidate_j

        # Equation 5: h_j <- h_j / \norm{h_j}
        # Normalising is what lets old memories fade. Rows whose norm is
        # zero are divided by one instead, so they stay zero.
        length = tf.norm(tensor=updated, ord='euclidean', axis=-1, keep_dims=True)
        safe_length = tf.where(tf.greater(length, 0.0), length, tf.ones_like(length))
        updated_blocks.append(updated / safe_length)

    state_next = tf.concat(updated_blocks, axis=1)
    return state_next, [state_next]
def __call__(self, codes):
    """Map every latent code to its closest codebook entry.

    Args:
      codes: A `float`-like `Tensor` containing the latent vectors to be
        compared to the codebook. These are rank-3 with shape
        `[batch_size, latent_size, code_size]`.

    Returns:
      nearest_codebook_entries: The 1-nearest neighbor in Euclidean distance
        for each code in the batch.
      one_hot_assignments: The one-hot vectors corresponding to the matched
        codebook entry for each code in the batch.
    """
    # Shape that lets the codebook broadcast against the expanded codes.
    broadcast_shape = [1, 1, self.num_codes, self.code_size]

    differences = tf.expand_dims(codes, 2) - tf.reshape(self.codebook, broadcast_shape)
    distances = tf.norm(differences, axis=3)
    assignments = tf.argmin(distances, 2)
    one_hot_assignments = tf.one_hot(assignments, depth=self.num_codes)

    # Multiplying by the one-hot mask and summing over the codebook axis
    # selects the assigned entry.
    selected = tf.expand_dims(one_hot_assignments, -1) * tf.reshape(self.codebook, broadcast_shape)
    nearest_codebook_entries = tf.reduce_sum(selected, axis=2)
    return nearest_codebook_entries, one_hot_assignments
def _PerCentroidNormalization(self, unnormalized_vector):
    """L2-normalize the aggregated residual of every centroid separately.

    Args:
      unnormalized_vector: [KxD] float tensor.

    Returns:
      per_centroid_normalized_vector: [KxD] float tensor, with normalized
        aggregated residuals. Some residuals may be all-zero.
      visual_words: Int tensor containing indices of visual words which are
        present for the set of features.
    """
    residuals = tf.reshape(
        unnormalized_vector,
        [self._codebook_size, self._feature_dimensionality])
    norms = tf.norm(residuals, axis=1)

    # A visual word counts as present when its residual norm exceeds the
    # tolerance (stored squared, hence the sqrt).
    is_present = tf.greater(norms, tf.sqrt(_NORM_SQUARED_TOLERANCE))
    visual_words = tf.reshape(tf.where(is_present), [-1])

    normalized = tf.math.l2_normalize(
        residuals, axis=1, epsilon=_NORM_SQUARED_TOLERANCE)
    return normalized, visual_words
def arcface_loss(embedding, labels, out_num, w_init=None, s=64., m=0.5):
    '''
    Build the additive-angular-margin (ArcFace) logits.

    :param embedding: the input embedding vectors; normalised along axis 1
    :param labels: the input class labels, fed to tf.one_hot. The original
        documentation gives the shape as (batch_size, 1).
        NOTE(review): tf.one_hot appends an axis, so confirm that the shape
        callers pass broadcasts against the (batch, out_num) cosine matrix.
    :param out_num: output class num
    :param w_init: initializer for the class weight matrix
    :param s: scalar scale value, default is 64
    :param m: the margin value, default is 0.5
    :return: the final calculated output; this output is sent into
        tf.nn.softmax directly
    '''
    margin_cos = math.cos(m)
    margin_sin = math.sin(m)
    fallback_shift = margin_sin * m  # issue 1
    threshold = math.cos(math.pi - m)

    with tf.variable_scope('arcface_loss'):
        # inputs and weights norm
        row_norms = tf.norm(embedding, axis=1, keep_dims=True)
        embedding = tf.div(embedding, row_norms, name='norm_embedding')
        feature_dim = embedding.get_shape().as_list()[-1]
        weights = tf.get_variable(name='embedding_weights', shape=(feature_dim, out_num),
                                  initializer=w_init, dtype=tf.float32)
        column_norms = tf.norm(weights, axis=0, keep_dims=True)
        weights = tf.div(weights, column_norms, name='norm_weights')

        # cos(theta+m) = cos(theta)cos(m) - sin(theta)sin(m)
        cos_t = tf.matmul(embedding, weights, name='cos_t')
        cos_t2 = tf.square(cos_t, name='cos_2')
        sin_t2 = tf.subtract(1., cos_t2, name='sin_2')
        sin_t = tf.sqrt(sin_t2, name='sin_t')
        cos_mt = s * tf.subtract(tf.multiply(cos_t, margin_cos),
                                 tf.multiply(sin_t, margin_sin), name='cos_mt')

        # theta+m must stay within [0, pi], i.e. -m <= theta <= pi-m.
        # Where cos(theta) <= cos(pi - m) the linear fallback is used instead.
        distance_to_threshold = cos_t - threshold
        within_range = tf.cast(tf.nn.relu(distance_to_threshold, name='if_else'), dtype=tf.bool)
        fallback = s*(cos_t - fallback_shift)
        margin_logits = tf.where(within_range, cos_mt, fallback)

        # The margin is applied only at the labelled class position.
        target_mask = tf.one_hot(labels, depth=out_num, name='one_hot_mask')
        other_mask = tf.subtract(1., target_mask, name='inverse_mask')
        plain_logits = tf.multiply(s, cos_t, name='scalar_cos_t')

        output = tf.add(tf.multiply(plain_logits, other_mask),
                        tf.multiply(margin_logits, target_mask),
                        name='arcface_loss_output')
    return output
def modulus(x):
    """Return the norm of `x` over its last axis, paired with a zero channel.

    The last axis of `x` is reduced to its norm; the result keeps a trailing
    axis of size 2 whose first entry is that norm and whose second entry is
    zero. The rank of `x` must be statically known.
    """
    last_axis = len(x.get_shape().as_list()) - 1
    magnitude = tf.expand_dims(tf.norm(x, axis=last_axis), axis=-1)
    return tf.concat([magnitude, tf.zeros_like(magnitude)], axis=-1)
def build_model(self):
    """Build the VQA graph: placeholders, image/text features, classifier, loss.

    Image features come from resnet_v2_152 and are L2-normalised along the
    channel axis; the question is embedded and encoded with an LSTM whose
    final cell state is the text feature. Both are combined by
    `self.compute_attention` and classified by two fully connected layers.
    Also creates the global step, its increment op, an exponentially
    decaying learning rate and an Adam optimizer (no train op is built
    here).
    """
    print('\nBuilding Model')
    # Creating placeholders for the question and the answer
    self.questions = tf.placeholder(tf.int64, shape=[None, 15], name="question_vector")
    self.answers = tf.placeholder(tf.float32, shape=[None, self.most_freq_limit], name="answer_vector")
    self.images = tf.placeholder(tf.float32, shape=[None, 448, 448, 3], name="images_matrix")

    arg_scope = resnet_arg_scope()
    with tf.contrib.slim.arg_scope(arg_scope):
        resnet_features, _ = resnet_v2_152(self.images, reuse=tf.AUTO_REUSE)
    # Normalise every spatial position along the channel axis (axis 3); the
    # epsilon guards against division by zero.
    depth_norm = tf.norm(resnet_features, ord='euclidean', keepdims=True, axis=3) + 1e-8
    self.image_features = resnet_features/depth_norm

    with tf.variable_scope("text_features") as scope:
        if self.reuse:
            scope.reuse_variables()
        self.word_embeddings = tf.get_variable('word_embeddings', [self.vocabulary_size, self.embedding_size], initializer=tf.contrib.layers.xavier_initializer())
        word_vectors = tf.nn.embedding_lookup(self.word_embeddings, self.questions)
        len_word = self._len_seq(word_vectors)
        embedded_sentence = tf.nn.dropout(tf.nn.tanh(word_vectors, name="embedded_sentence"), keep_prob=self.dropout_prob)
        lstm = tf.nn.rnn_cell.LSTMCell(self.state_size, initializer=tf.contrib.layers.xavier_initializer())
        _, final_state = tf.nn.dynamic_rnn(lstm, embedded_sentence, sequence_length=len_word, dtype=tf.float32)
        # The LSTM cell state (not the hidden output) is the question encoding.
        self.text_features = final_state.c

    self.attention_features = self.compute_attention(self.image_features, self.text_features)

    with tf.variable_scope("fully_connected") as scope:
        if self.reuse:
            scope.reuse_variables()
        self.fc1 = tf.nn.dropout(tf.nn.relu(self.fc_layer(self.attention_features, 1024, name="fc1")), keep_prob=self.dropout_prob)
        # NOTE(review): the output width is hard-coded to 3000 while the
        # answers placeholder uses self.most_freq_limit -- confirm they match.
        self.fc2 = self.fc_layer(self.fc1, 3000, name="fc2")

    self.answer_prob = tf.nn.softmax(self.fc2)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.answers, logits=self.fc2))

    self.global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int32)
    self.inc = tf.assign_add(self.global_step, 1, name='increment')
    # Halve the learning rate every 10000 steps (staircase).
    self.lr = tf.train.exponential_decay(learning_rate=self.init_lr, global_step=self.global_step, decay_steps=10000, decay_rate=0.5, staircase=True)
    self.optimizer = tf.train.AdamOptimizer(self.lr, beta1=0.9, beta2=0.999, name="optim")
def _uniform_unit_norm(dimension, shape, dtype, seed):
    """Returns a batch of points chosen uniformly from the unit hypersphere."""
    # A vector of independent standard normals has a spherically symmetric
    # distribution, so scaling it to unit length gives a uniform direction.
    to_numpy = dtype.as_numpy_dtype
    standard_normal = normal.Normal(loc=to_numpy(0.), scale=to_numpy(1.))
    # raw shape: shape + [dimension]
    sample_shape = tf.concat([shape, [dimension]], axis=0)
    raw = standard_normal.sample(sample_shape, seed=seed())
    lengths = tf.norm(raw, ord=2, axis=-1)
    return raw / lengths[..., tf.newaxis]
def monte_carlo_hypersphere_volume(dist, num_samples, radius, center):
    """Estimate the volume of a hypersphere by importance sampling from `dist`.

    Every sample is weighted by the inverse of its density under `dist`;
    samples farther than `radius` from `center` get weight zero. The mean
    weight over the sample axis is the estimate. `seed` is taken from the
    enclosing scope.
    """
    # https://en.wikipedia.org/wiki/Importance_sampling
    samples = dist.sample(num_samples, seed=seed)
    samples = tf.identity(samples)  # Invalidate bijector cacheing.
    inverse_density = tf.exp(-dist.log_prob(samples))
    inside_sphere = tf.norm(samples - center, axis=-1) <= radius
    importance_weights = tf.where(
        inside_sphere, inverse_density, tf.zeros_like(inverse_density))
    return tf.reduce_mean(importance_weights, axis=0)
def conv_block(inputs,
               num_units=None,
               size=5,
               rate=1,
               padding="SAME",
               dropout_rate=0,
               training=False,
               scope="conv_block",
               reuse=None):
    '''Weight-normalised, gated 1-D convolution block.

    Args:
      inputs: A 3-D tensor with shape of [batch, time, depth].
      num_units: An int. Number of output units; defaults to the input depth.
        The convolution itself produces `num_units*2` channels, which are
        passed through `glu`.
      size: An int. Filter size.
      rate: An int. Dilation rate.
      padding: Either `same` or `valid` or `causal` (case-insensitive).
      dropout_rate: A float. Dropout applied to the inputs.
      training: A boolean. Whether or not the layer is in training mode.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous
        layer by the same name.

    Returns:
      The output of `glu` applied to the convolution result.
      NOTE(review): presumably [batch, time, num_units] -- confirm against glu.
    '''
    in_dim = inputs.get_shape().as_list()[-1]
    if num_units is None:
        num_units = in_dim
    out_channels = num_units*2

    with tf.variable_scope(scope, reuse=reuse):
        inputs = tf.layers.dropout(inputs, rate=dropout_rate, training=training)

        if padding.lower() == "causal":
            # Left-pad the time axis so that no output position sees the
            # future, then convolve without further padding.
            pad_len = (size - 1) * rate
            inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]])
            padding = "VALID"

        v_initializer = tf.contrib.layers.variance_scaling_initializer(
            factor=(4.*(1.-dropout_rate)))
        V = tf.get_variable('V',
                            shape=[size, in_dim, out_channels],
                            dtype=tf.float32,
                            initializer=v_initializer)  # (width, in_dim, out_dim)
        # g starts as the per-output-channel norm of V's initial value, so
        # the effective weight initially equals V.
        g_initial = tf.norm(V.initialized_value(), axis=(0, 1), keep_dims=True)
        g = tf.get_variable('g', dtype=tf.float32, initializer=g_initial)
        b = tf.get_variable('b',
                            shape=(out_channels,),
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer)

        direction = tf.nn.l2_normalize(V, [0, 1])  # (width, in_dim, out_dim)
        W = direction * g

        outputs = tf.nn.convolution(inputs, W, padding, dilation_rate=[rate]) + b
        outputs = glu(outputs)
    return outputs
def _test_model_fn(image, normalized_image, reuse):
    """Stand-in model function for tests.

    Returns the squeezed norm of `image` over axis 3 as the attention map,
    and a feature map made of `image` tiled 341 times along axis 3 followed
    by one all-zero channel.
    """
    del normalized_image, reuse  # Unused variables in the test.
    image_shape = tf.shape(image)
    attention = tf.squeeze(tf.norm(image, axis=3))
    tiled_image = tf.tile(image, [1, 1, 1, 341])
    zero_channel = tf.zeros([1, image_shape[1], image_shape[2], 1])
    feature_map = tf.concat([tiled_image, zero_channel], axis=3)
    return attention, feature_map
def linear_mapping_weightnorm(inputs, out_dim, in_dim=None, dropout=1.0, var_scope_name="linear_mapping"):
    """Linear projection of the last axis with weight normalization.

    Implements w = g * v / ||v||_2 (Salimans & Kingma, 2016) as
    `(x @ V) * (g / ||V||) + b`.

    Args:
        inputs: rank-3 tensor; the size of the last axis must be static.
        out_dim: size of the projected last axis.
        in_dim: unused.
        dropout: only scales the stddev of V's initializer.
        var_scope_name: variable scope holding V, g and b.

    Returns:
        Tensor of shape [inputs.shape[0], -1, out_dim].
    """
    with tf.variable_scope(var_scope_name):
        static_shape = inputs.get_shape().as_list()  # static shape. may has None
        dynamic_shape = tf.shape(inputs)
        depth = int(static_shape[-1])

        v_initializer = tf.random_normal_initializer(
            mean=0, stddev=tf.sqrt(dropout*1.0/int(static_shape[-1])))
        V = tf.get_variable('V', shape=[depth, out_dim], dtype=tf.float32,
                            initializer=v_initializer, trainable=True)
        # V shape is M*N, V_norm shape is N
        V_norm = tf.norm(V.initialized_value(), axis=0)
        g = tf.get_variable('g', dtype=tf.float32, initializer=V_norm, trainable=True)
        # weightnorm bias is init zero
        b = tf.get_variable('b', shape=[out_dim], dtype=tf.float32,
                            initializer=tf.zeros_initializer(), trainable=True)

        assert len(static_shape) == 3
        # Collapse the two leading axes for the matmul, then restore them.
        flat_inputs = tf.reshape(inputs, [-1, static_shape[-1]])
        projected = tf.matmul(flat_inputs, V)  # x*v
        projected = tf.reshape(projected, [dynamic_shape[0], -1, out_dim])

        scaler = tf.div(g, tf.norm(V, axis=0))  # g/2-norm(v)
        # x*v g/2-norm(v) + b
        outputs = tf.reshape(scaler, [1, out_dim])*projected + tf.reshape(b, [1, out_dim])
        return outputs
def mlp(feature, hparams, name="mlp"):
    """Multi layer perceptron with layer norm, relu activation and dropout.

    Applies `hparams.num_mlp_layers` layers of dense (`hparams.mlp_size`
    units, no activation) -> layer_norm -> relu -> dropout
    (`hparams.dropout`). The last-axis norm of every dense output is
    recorded in the "norms" collection under "mlp_feature".
    """
    with tf.variable_scope(name, "mlp", values=[feature]):
        layer_count = hparams.num_mlp_layers
        width = hparams.mlp_size
        hidden = feature
        for _ in range(layer_count):
            hidden = common_layers.dense(hidden, width, activation=None)
            utils.collect_named_outputs("norms", "mlp_feature", tf.norm(hidden, axis=-1))
            hidden = common_layers.layer_norm(hidden)
            hidden = tf.nn.relu(hidden)
            hidden = tf.nn.dropout(hidden, keep_prob=1.-hparams.dropout)
        return hidden
# compute loss # out:[b, 10] # y:[b] => [b ,10] y_onehot = tf.one_hot(y, depth=10) # mse = mean(sum(y - out)^2) # [b, 10] loss = tf.square(y_onehot - out) # mean: scalar loss = tf.reduce_mean(loss) # compute gradients grads = tape.gradient(loss, [w1, b1, w2, b2, w3, b3]) print("==before==") for g in grads: print(tf.norm(g)) grads, _ = tf.clip_by_global_norm(grads, 15) #限制梯度向量的范数不能超过15,超过的话会将其等比例缩小 print("==after==") for g in grads: print(tf.norm(g)) # w1 = w1 - learning_rate * w1_grad 必须使用assign进行原地更新 # 否则会从variable包装变成原tensor w1.assign_sub(lr * grads[0]) b1.assign_sub(lr * grads[1]) w2.assign_sub(lr * grads[2]) b2.assign_sub(lr * grads[3]) w3.assign_sub(lr * grads[4])
# Hyper-parameters for the arc-softmax lambda schedule and feature scaling.
params.dict["arcsoftmax_lambda_min"] = 10
params.dict["arcsoftmax_lambda_base"] = 1000
params.dict["arcsoftmax_lambda_gamma"] = 1
params.dict["arcsoftmax_lambda_power"] = 4
params.dict["feature_norm"] = True
params.dict["feature_scaling_factor"] = 20

from model.common import l2_scaling
outputs, endpoints = tdnn(features, params, is_training=True, reuse_variables=False)
outputs = l2_scaling(outputs, params.feature_scaling_factor)
outputs_norm = tf.norm(outputs, axis=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    [outputs_val, outputs_norm_val] = sess.run([outputs, outputs_norm], feed_dict={features: features_val})
    # After l2_scaling every row must have norm equal to the scaling factor,
    # checked once with numpy and once with the TensorFlow norm.
    assert np.allclose(np.sqrt(np.sum(outputs_val**2, axis=1)), params.feature_scaling_factor)
    assert np.allclose(outputs_norm_val, params.feature_scaling_factor)

# Test loss functions
# It only works on debug mode, since the loss is asked to output weights for our numpy computation.
from model.loss import asoftmax, additive_margin_softmax, additive_angular_margin_softmax
from model.test_utils import compute_asoftmax, compute_amsoftmax, compute_arcsoftmax
params.dict["global_step"] = 1
def vertex_normals(vertices, faces, name=None):
    """Computes vertex normals for the given meshes.

    This function takes a batch of meshes with common topology, and calculates vertex normals for each.

    Args:
        vertices: a `Tensor` of shape [*, vertex count, 3] or [*, vertex count, 4], where * represents arbitrarily
            many leading (batch) dimensions. Only ranks 2 and 3 (i.e. zero or one batch dimension) are
            currently supported; other ranks fail the assertion below.
        faces: an int32 `Tensor` of shape [face count, 3]; each value is an index into the vertex dimension of
            `vertices`, and each row defines one triangle.
        name: an optional name for the operation

    Returns:
        a `Tensor` of shape [*, vertex count, 3], which for each vertex, gives the (normalised) average of the
        normals of all faces that include that vertex
    """

    # This computes vertex normals, as the average of the normals of the faces each vertex is part of
    # vertices is indexed by *, vertex-index, x/y/z[/w]
    # faces is indexed by face-index, vertex-in-face
    # result is indexed by *, vertex-index, x/y/z

    with ops.name_scope(name, 'VertexNormals', [vertices, faces]):

        vertices, faces = _prepare_vertices_and_faces(vertices, faces)
        vertices = vertices[..., :3]  # drop the w-coordinate if present

        vertices_ndim = vertices.get_shape().ndims
        # normals_by_face is indexed by face-index, *, x/y/z
        normals_by_face, vertices_by_index = _get_face_normals(vertices, faces)

        face_count = tf.shape(faces)[0]
        vbi_shape = tf.shape(vertices_by_index)
        # this is the number of 'elements' in the * dimensions
        N_extra = tf.reduce_prod(vbi_shape[1:-1])

        # ** keep it simple for now; in the general case we need a flattened outer product of ranges
        assert vertices_ndim in {2, 3}
        if vertices_ndim == 2:
            extra_indices = []
        else:
            extra_indices = [
                tf.tile(_repeat_1d(tf.range(N_extra), 3), [face_count * 3])
            ]

        # Scatter each face normal to the three vertices of that face, as a sparse tensor
        # indexed by face-index, vertex-index, *, x/y/z
        normals_by_face_and_vertex = tf.SparseTensor(
            indices=tf.cast(
                tf.stack([
                    # each element of this stack is repeated a number of times matching the things after,
                    # then tiled a number of times matching the things before, so that each has the same length
                    _repeat_1d(tf.range(face_count, dtype=tf.int32), N_extra * 9),
                    _repeat_1d(tf.reshape(faces, [-1]), N_extra * 3)
                ] + extra_indices + [
                    tf.tile(
                        tf.constant([0, 1, 2], dtype=tf.int32),
                        tf.convert_to_tensor([face_count * N_extra * 3]))
                ], axis=1),
                tf.int64
            ),
            values=tf.reshape(
                tf.tile(normals_by_face[:, tf.newaxis, ...], [1, 3] + [1] * (vertices_ndim - 1)),
                [-1]),
            dense_shape=tf.cast(tf.concat([[face_count], vbi_shape], axis=0), tf.int64)
        )

        # Summing over the face axis accumulates, per vertex, the normals of all incident faces;
        # indexed by vertex-index, *, x/y/z
        summed_normals_by_vertex = tf.sparse_reduce_sum(normals_by_face_and_vertex, axis=0)
        # The small epsilon avoids division by zero for vertices with a zero summed normal
        renormalised_normals_by_vertex = summed_normals_by_vertex / \
            (tf.norm(summed_normals_by_vertex, axis=-1, keepdims=True) + 1.e-12)  # ditto

        # Move the vertex axis back behind the batch axes. `range` must be materialised as a list
        # before concatenation: on Python 3 `range(...) + [...]` raises TypeError.
        result = tf.transpose(
            renormalised_normals_by_vertex,
            list(range(1, vertices_ndim - 1)) + [0, vertices_ndim - 1])
        result.set_shape(vertices.get_shape())
        return result
def __init__(self, sess: tf.Session,
             predict: Union[Callable, tf.keras.Model, 'keras.Model'],
             shape: tuple,
             kappa: float = 0.,
             beta: float = .1,
             feature_range: tuple = (-1e10, 1e10),
             gamma: float = 0.,
             ae_model: Union[tf.keras.Model, 'keras.Model'] = None,
             enc_model: Union[tf.keras.Model, 'keras.Model'] = None,
             theta: float = 0.,
             use_kdtree: bool = False,
             learning_rate_init: float = 1e-2,
             max_iterations: int = 1000,
             c_init: float = 10.,
             c_steps: int = 10,
             eps: tuple = (1e-3, 1e-3),
             clip: tuple = (-1000., 1000.),
             update_num_grad: int = 1,
             write_dir: str = None) -> None:
    """
    Initialize prototypical counterfactual method.

    Builds the full TensorFlow graph used by the explainer: variables and
    placeholders for the instances, the shrinkage-thresholding update, the
    individual loss terms, the optimizer ops and the variable initializer.

    Parameters
    ----------
    sess
        TensorFlow session
    predict
        Keras or TensorFlow model or any other model's prediction function returning class probabilities
    shape
        Shape of input data starting with batch size
    kappa
        Confidence parameter for the attack loss term
    beta
        Regularization constant for L1 loss term
    feature_range
        Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
        numpy arrays with dimension (1x nb of features) for feature-wise ranges
    gamma
        Regularization constant for optional auto-encoder loss term
    ae_model
        Optional auto-encoder model used for loss regularization
    enc_model
        Optional encoder model used to guide instance perturbations towards a class prototype
    theta
        Constant for the prototype search loss term
    use_kdtree
        Whether to use k-d trees for the prototype loss term if no encoder is available
    learning_rate_init
        Initial learning rate of optimizer
    max_iterations
        Maximum number of iterations for finding a counterfactual
    c_init
        Initial value to scale the attack loss term
    c_steps
        Number of iterations to adjust the constant scaling the attack loss term
    eps
        If numerical gradients are used to compute dL/dx = (dL/dp) * (dp/dx), then eps[0] is used to
        calculate dL/dp and eps[1] is used for dp/dx. eps[0] and eps[1] can be a combination of float values and
        numpy arrays. For eps[0], the array dimension should be (1x nb of prediction categories) and for
        eps[1] it should be (1x nb of features)
    clip
        Tuple with min and max clip ranges for both the numerical gradients and the gradients
        obtained from the TensorFlow graph
    update_num_grad
        If numerical gradients are used, they will be updated every update_num_grad iterations
    write_dir
        Directory to write tensorboard files to
    """
    self.sess = sess
    self.predict = predict

    # check whether the model, encoder and auto-encoder are Keras or TF models
    try:
        import keras  # noqa
        is_model = isinstance(predict, (tf.keras.Model, keras.Model))
        is_ae = isinstance(ae_model, (tf.keras.Model, keras.Model))
        is_enc = isinstance(enc_model, (tf.keras.Model, keras.Model))
    except ImportError:
        # standalone keras is not installed: only tf.keras models are recognised
        is_model = isinstance(predict, (tf.keras.Model))
        is_ae = isinstance(ae_model, (tf.keras.Model))
        is_enc = isinstance(enc_model, (tf.keras.Model))

    # the number of classes is read from the second dimension of a prediction on an all-zero batch
    if is_model:
        self.model = True
        self.classes = self.sess.run(self.predict(tf.convert_to_tensor(np.zeros(shape), dtype=tf.float32))).shape[1]
    else:
        self.model = False
        self.classes = self.predict(np.zeros(shape)).shape[1]

    if is_enc:
        self.enc_model = True
    else:
        self.enc_model = False

    if is_ae:
        self.ae_model = True
    else:
        self.ae_model = False

    if use_kdtree and self.enc_model:
        logger.warning('Both an encoder and k-d trees enabled. Using the encoder for the prototype loss term.')

    if use_kdtree or self.enc_model:
        self.enc_or_kdtree = True
    else:
        self.enc_or_kdtree = False

    self.shape = shape
    self.kappa = kappa
    self.beta = beta
    self.gamma = gamma
    self.theta = theta
    self.ae = ae_model
    self.enc = enc_model
    self.use_kdtree = use_kdtree
    self.batch_size = shape[0]
    self.max_iterations = max_iterations
    self.c_init = c_init
    self.c_steps = c_steps
    self.update_num_grad = update_num_grad
    self.eps = eps
    self.clip = clip
    self.write_dir = write_dir

    # define tf variables for original and perturbed instances, and target labels
    self.orig = tf.Variable(np.zeros(shape), dtype=tf.float32, name='orig')
    self.adv = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv')
    self.adv_s = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv_s')
    self.target = tf.Variable(np.zeros((self.batch_size, self.classes)), dtype=tf.float32, name='target')

    # variable for target class proto
    # (encoder output shape when an encoder is used, otherwise the input shape)
    if self.enc_model:
        self.shape_enc = self.enc.predict(np.zeros(shape)).shape
    else:
        self.shape_enc = shape

    self.target_proto = tf.Variable(np.zeros(self.shape_enc), dtype=tf.float32, name='target_proto')

    # define tf variable for constant used in FISTA optimization
    self.const = tf.Variable(np.zeros(self.batch_size), dtype=tf.float32, name='const')
    self.global_step = tf.Variable(0.0, trainable=False, name='global_step')

    # define placeholders that will be assigned to relevant variables
    self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig')
    self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv')
    self.assign_adv_s = tf.placeholder(tf.float32, shape, name='assign_adv_s')
    self.assign_target = tf.placeholder(tf.float32, (self.batch_size, self.classes), name='assign_target')
    self.assign_const = tf.placeholder(tf.float32, [self.batch_size], name='assign_const')
    self.assign_target_proto = tf.placeholder(tf.float32, self.shape_enc, name='assign_target_proto')

    # define conditions and values for element-wise shrinkage thresholding
    with tf.name_scope('shrinkage_thresholding') as scope:
        # cond[0]: adv_s - orig > beta; cond[1]: |adv_s - orig| <= beta; cond[2]: adv_s - orig < -beta
        cond = [
            tf.cast(tf.greater(tf.subtract(self.adv_s, self.orig), self.beta), tf.float32),
            tf.cast(tf.less_equal(tf.abs(tf.subtract(self.adv_s, self.orig)), self.beta), tf.float32),
            tf.cast(tf.less(tf.subtract(self.adv_s, self.orig), tf.negative(self.beta)), tf.float32)
        ]
        upper = tf.minimum(tf.subtract(self.adv_s, self.beta), tf.cast(feature_range[1], tf.float32))
        lower = tf.maximum(tf.add(self.adv_s, self.beta), tf.cast(feature_range[0], tf.float32))
        # NOTE: this rebinds self.assign_adv from the placeholder created above to a computed tensor
        self.assign_adv = tf.multiply(cond[0], upper) + tf.multiply(cond[1], self.orig) + tf.multiply(cond[2], lower)

    # perturbation update and vector projection on correct feature range set
    with tf.name_scope('perturbation_y') as scope:
        # momentum coefficient step / (step + 3)
        self.zt = tf.divide(self.global_step, self.global_step + tf.cast(3, tf.float32))
        # NOTE: this rebinds self.assign_adv_s from the placeholder created above to a computed tensor
        self.assign_adv_s = self.assign_adv + tf.multiply(self.zt, self.assign_adv - self.adv)
        # map to feature space
        self.assign_adv_s = tf.minimum(self.assign_adv_s, tf.cast(feature_range[1], tf.float32))
        self.assign_adv_s = tf.maximum(self.assign_adv_s, tf.cast(feature_range[0], tf.float32))

    # assign counterfactual of step k+1 to k
    with tf.name_scope('update_adv') as scope:
        self.adv_updater = tf.assign(self.adv, self.assign_adv)
        self.adv_updater_s = tf.assign(self.adv_s, self.assign_adv_s)

    # from perturbed instance, derive deviation delta
    with tf.name_scope('update_delta') as scope:
        self.delta = self.orig - self.adv
        self.delta_s = self.orig - self.adv_s

    # define L1 and L2 loss terms; L1+L2 is later used as an optimization constraint for FISTA
    ax_sum = list(np.arange(1, len(shape)))  # every axis except the batch axis
    with tf.name_scope('loss_l1_l2') as scope:
        self.l2 = tf.reduce_sum(tf.square(self.delta), axis=ax_sum)
        self.l2_s = tf.reduce_sum(tf.square(self.delta_s), axis=ax_sum)
        self.l1 = tf.reduce_sum(tf.abs(self.delta), axis=ax_sum)
        self.l1_s = tf.reduce_sum(tf.abs(self.delta_s), axis=ax_sum)
        self.l1_l2 = self.l2 + tf.multiply(self.l1, self.beta)
        self.l1_l2_s = self.l2_s + tf.multiply(self.l1_s, self.beta)

        # sum losses
        self.loss_l1 = tf.reduce_sum(self.l1)
        self.loss_l1_s = tf.reduce_sum(self.l1_s)
        self.loss_l2 = tf.reduce_sum(self.l2)
        self.loss_l2_s = tf.reduce_sum(self.l2_s)

    with tf.name_scope('loss_ae') as scope:
        # gamma * AE loss
        if self.ae_model:
            self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.adv) - self.adv))
            self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.adv_s) - self.adv_s))
        else:  # no auto-encoder available
            self.loss_ae = tf.constant(0.)
            self.loss_ae_s = tf.constant(0.)

    with tf.name_scope('loss_attack') as scope:
        if not self.model:
            # the attack loss is fed in from outside the graph; no loss_attack_s is defined in this
            # branch (loss_opt below does not use it when self.model is False)
            self.loss_attack = tf.placeholder(tf.float32)
        elif self.c_init == 0. and self.c_steps == 1:  # prediction loss term not used
            # make predictions on perturbed instance
            self.pred_proba = self.predict(self.adv)
            self.pred_proba_s = self.predict(self.adv_s)

            self.loss_attack = tf.constant(0.)
            self.loss_attack_s = tf.constant(0.)
        else:
            # make predictions on perturbed instance
            self.pred_proba = self.predict(self.adv)
            self.pred_proba_s = self.predict(self.adv_s)

            # probability of target label prediction
            self.target_proba = tf.reduce_sum(self.target * self.pred_proba, 1)
            target_proba_s = tf.reduce_sum(self.target * self.pred_proba_s, 1)

            # max probability of non target label prediction
            # (subtracting target * 10000 keeps the target entries out of the max)
            self.nontarget_proba_max = tf.reduce_max((1 - self.target) * self.pred_proba - (self.target * 10000), 1)
            nontarget_proba_max_s = tf.reduce_max((1 - self.target) * self.pred_proba_s - (self.target * 10000), 1)

            # loss term f(x,d)
            loss_attack = tf.maximum(0.0, -self.nontarget_proba_max + self.target_proba + self.kappa)
            loss_attack_s = tf.maximum(0.0, -nontarget_proba_max_s + target_proba_s + self.kappa)

            # c * f(x,d)
            self.loss_attack = tf.reduce_sum(self.const * loss_attack)
            self.loss_attack_s = tf.reduce_sum(self.const * loss_attack_s)

    with tf.name_scope('loss_prototype') as scope:
        if self.enc_model:
            self.loss_proto = self.theta * tf.square(tf.norm(self.enc(self.adv) - self.target_proto))
            self.loss_proto_s = self.theta * tf.square(tf.norm(self.enc(self.adv_s) - self.target_proto))
        elif self.use_kdtree:
            self.loss_proto = self.theta * tf.square(tf.norm(self.adv - self.target_proto))
            self.loss_proto_s = self.theta * tf.square(tf.norm(self.adv_s - self.target_proto))
        else:  # no encoder available and no k-d trees used
            self.loss_proto = tf.constant(0.)
            self.loss_proto_s = tf.constant(0.)

    with tf.name_scope('loss_combined') as scope:
        # no need for L1 term in loss to optimize when using FISTA
        if self.model:
            self.loss_opt = self.loss_attack_s + self.loss_l2_s + self.loss_ae_s + self.loss_proto_s
        else:  # separate numerical computation of loss attack gradient
            self.loss_opt = self.loss_l2_s + self.loss_ae_s + self.loss_proto_s

        # add L1 term to overall loss; this is not the loss that will be directly optimized
        self.loss_total = (self.loss_attack + self.loss_l2 + self.loss_ae +
                           tf.multiply(self.beta, self.loss_l1) + self.loss_proto)

    with tf.name_scope('training') as scope:
        self.learning_rate = tf.train.polynomial_decay(learning_rate_init, self.global_step,
                                                       self.max_iterations, 0, power=0.5)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        # snapshot the existing global variables so the ones created below can be identified
        start_vars = set(x.name for x in tf.global_variables())

        # first compute, then apply grads
        self.compute_grads = optimizer.compute_gradients(self.loss_opt, var_list=[self.adv_s])
        self.grad_ph = tf.placeholder(tf.float32, name='grad_adv_s')
        var = [tvar for tvar in tf.trainable_variables() if tvar.name.startswith('adv_s')][-1]  # get the last in
        # case explainer is re-initialized and a new graph is created
        grad_and_var = [(self.grad_ph, var)]
        self.apply_grads = optimizer.apply_gradients(grad_and_var, global_step=self.global_step)
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

    # variables to initialize
    self.setup = []  # type: list
    self.setup.append(self.orig.assign(self.assign_orig))
    self.setup.append(self.target.assign(self.assign_target))
    self.setup.append(self.const.assign(self.assign_const))
    # at this point assign_adv / assign_adv_s are the computed tensors bound in the
    # 'shrinkage_thresholding' and 'perturbation_y' scopes, not the placeholders
    self.setup.append(self.adv.assign(self.assign_adv))
    self.setup.append(self.adv_s.assign(self.assign_adv_s))
    self.setup.append(self.target_proto.assign(self.assign_target_proto))

    self.init = tf.variables_initializer(var_list=[self.global_step] + [self.adv_s] + [self.adv] + new_vars)

    if self.write_dir is not None:
        self.writer = tf.summary.FileWriter(write_dir, tf.get_default_graph())
        self.writer.add_graph(tf.get_default_graph())
    else:
        self.writer = None
def block_Lanczos(Sigma_, B_):
    """
    block Lanczos method to approx Sigma^1/2 * B, with B matrix of N(0,1)'s.
    Used to generate multiple approximate large normal draws.

    Each column of `B_` is processed as a separate Lanczos run of
    k = n // 500 + 3 steps (n = number of rows of `B_`); the square root of
    the resulting tridiagonal matrix is taken through an eigendecomposition.

    Args:
        Sigma_: matrix multiplied against the Lanczos vectors
            (presumably n x n and symmetric - TODO confirm).
        B_: matrix whose first dimension is n and second dimension is s.

    Returns:
        Tensor approximating Sigma^1/2 * B (presumably n x s - TODO confirm).
    """
    n = tf.shape(B_)[0]
    s = tf.shape(B_)[1]
    k = tf.div(n, 500) + 3

    # betas starts with one zero row and D with one zero slice, so that the
    # `j - 1` lookups in the first loop iteration (j == 1) are well defined
    betas = tf.zeros([1, s])
    alphas = tf.zeros([0, s])
    D = tf.zeros([s, n, 1])
    B_norms = tf.norm(B_, axis=0)
    # slice 1 of D holds the column-normalised B_
    D = tf.concat([D, tf.expand_dims(tf.transpose(B_ / B_norms), 2)], 2)

    def cond(j, alphas, betas, D):
        return j < k + 1

    #TODO: use block-CG in place of Sigma
    def body(j, alphas, betas, D):
        d_j = tf.squeeze(tf.slice(D, [0, 0, j], [-1, -1, 1]))
        d = tf.matmul(Sigma_, tf.transpose(d_j)) - (
            tf.slice(betas, [j - 1, 0], [1, -1]) *
            tf.transpose(tf.squeeze(tf.slice(D, [0, 0, j - 1], [-1, -1, 1]))))
        alphas = tf.concat([alphas, [tf.diag_part(tf.matmul(d_j, d))]], 0)
        d = d - tf.slice(alphas, [j - 1, 0], [1, -1]) * tf.transpose(d_j)
        betas = tf.concat([betas, [tf.norm(d, axis=0)]], 0)
        # append the normalised residual as the next slice of D
        D = tf.concat([
            D,
            tf.expand_dims(tf.transpose(d / tf.slice(betas, [j, 0], [1, -1])), 2)
        ], 2)
        return j + 1, alphas, betas, D

    j = tf.constant(1)
    # alphas, betas and D grow on every iteration, hence the relaxed shape invariants
    j, alphas, betas, D = tf.while_loop(cond,
                                        body,
                                        loop_vars=[j, alphas, betas, D],
                                        shape_invariants=[
                                            j.get_shape(),
                                            tf.TensorShape([None, None]),
                                            tf.TensorShape([None, None]),
                                            tf.TensorShape([None, None, None])
                                        ])

    # drop the zero padding slice at index 0 and keep k Lanczos vectors
    D_ = tf.slice(D, [0, 0, 1], [-1, -1, k])

    ##TODO replace loop
    # NOTE(review): `s` and `k` come from tf.shape / tf.div, i.e. they are tensors;
    # `range(s)` and `tf.zeros([0, k, k])` rely on them being usable as Python
    # integers - confirm the execution mode this is meant to run under.
    H = tf.zeros([0, k, k])

    for ss in range(s):
        this_beta = tf.diag(tf.squeeze(tf.slice(betas, [1, ss], [k - 1, 1])))
        #build out tridiagonal H: alphas_1:k on main, betas_2:k on off
        this_H = (tf.diag(tf.squeeze(tf.slice(alphas, [0, ss], [-1, 1]))) +
                  tf.pad(this_beta, [[1, 0], [0, 1]]) +
                  tf.pad(this_beta, [[0, 1], [1, 0]]))
        H = tf.concat([H, tf.expand_dims(this_H, 0)], 0)

    E, V = tf.self_adjoint_eig(H)
    E_sqrt = tf.zeros([0, k, k])
    #TODO: loop
    for ss in range(s):
        # eigenvalues are floored at 1e-6 before the square root
        E_sqrt = tf.concat([
            E_sqrt,
            tf.expand_dims(
                tf.diag(
                    tf.squeeze(
                        tf.sqrt(tf.maximum(tf.slice(E, [ss, 0], [1, -1]), 1e-6)))), 0)
        ], 0)
    # H^(1/2) = V * diag(sqrt(E)) * V^T for every column's tridiagonal matrix
    sq_H = tf.matmul(V, tf.matmul(E_sqrt, tf.transpose(V, perm=[0, 2, 1])))

    # first unit vector, replicated once per column of B_
    e1 = tf.expand_dims(
        tf.transpose(tf.tile(tf.slice(tf.eye(k), [0, 0], [-1, 1]), [1, s])), 2)
    out = B_norms * tf.transpose(tf.squeeze(tf.matmul(D_, tf.matmul(sq_H, e1))))
    return out
def build_model(self):
    """Build the graph: placeholders, augmentation generator, style loss and optimizer.

    Creates the input placeholders, runs the augmentation generator on the
    synthetic images, extracts activations from `net_synth` / `net_real`,
    defines `self.style_loss` as the summed squared norms of the Gram-matrix
    differences over ten conv layers, and sets up an Adam train step with an
    exponentially decaying learning rate.
    """
    min_queue_examples = 256
    self.g_zbatch = tf.placeholder(tf.float32, [self.batch_size, self.z_dim], 'zbatch')
    image_dims_real = [self.images_height_real, self.images_width_real, 3]
    image_dims_synth = [self.images_height_synth, self.images_width_synth, 3]
    #lbl_dims = [self.images_height, self.images_width, 1]
    self.Discr_inputs_real = tf.placeholder(tf.float32, [self.batch_size] + image_dims_real, name='D_images_real')
    self.Discr_inputs_synth = tf.placeholder(tf.float32, [self.batch_size] + image_dims_synth, name='D_images_synth')
    self.Gen_inputs_imgs = tf.placeholder(tf.float32, [self.batch_size] + image_dims_synth, name='G_images')
    #
    with tf.variable_scope('model') as scope:
        #
        ## sensor transformer augmentation generator
        img_train_aug, window_h, sigmas, scale_val, tx_Rval, ty_Rval, tx_Gval, ty_Gval, tx_Bval, ty_Bval, delta_S, A, Ra_sd, Rb_si, Ga_sd, Gb_si, Ba_sd, Bb_si, a_transl, b_transl = self.augmentation_generator(
            self.Gen_inputs_imgs, self.g_zbatch)
        #self.aug_img, window_h, sigmas, scale_val, tx_Rval, ty_Rval, tx_Gval, ty_Gval, tx_Bval, ty_Bval, delta_S, A, a_transl, b_transl = self.augmentation_generator(img_train_synth, self.Gen_zbatch)
        # the sampler is called with reuse=True on the same inputs as the generator above
        self.aug_img, self.blurSTparams, self.expSTparams, self.colorSTparams, self.noiseSTparams, self.chromabSTparams = self.augmentation_generator_sampler(
            self.Gen_inputs_imgs, self.g_zbatch, reuse=True)
        #img_train_aug, self.blurSTparams, self.expSTparams, self.colorSTparams, self.noiseSTparams, self.chromabSTparams = self.augmentation_generator(img_train_synth, self.g_zbatch)
        #
        ## get style loss
        #scope.reuse_variables()
        conv1_1activ_aug, conv1_2activ_aug, conv2_1activ_aug, conv2_2activ_aug, conv3_1activ_aug, conv3_2activ_aug, conv3_3activ_aug, conv4_1activ_aug, conv4_2activ_aug, conv4_3activ_aug = self.net_synth(
            img_train_aug, None, get_activ=True)
        # from here on, variables in this scope are reused rather than created
        scope.reuse_variables()
        conv1_1activ_real, conv1_2activ_real, conv2_1activ_real, conv2_2activ_real, conv3_1activ_real, conv3_2activ_real, conv3_3activ_real, conv4_1activ_real, conv4_2activ_real, conv4_3activ_real = self.net_real(
            self.Discr_inputs_real, None, get_activ=True)
        ## calculate style loss on the early layers
        # one squared Gram-matrix difference norm per layer, conv1_1 through conv4_3
        self.style_loss = tf.reduce_sum(
            tf.square(tf.norm(self.gram_matrix(conv1_1activ_aug) - self.gram_matrix(conv1_1activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv1_2activ_aug) - self.gram_matrix(conv1_2activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv2_1activ_aug) - self.gram_matrix(conv2_1activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv2_2activ_aug) - self.gram_matrix(conv2_2activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv3_1activ_aug) - self.gram_matrix(conv3_1activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv3_2activ_aug) - self.gram_matrix(conv3_2activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv3_3activ_aug) - self.gram_matrix(conv3_3activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv4_1activ_aug) - self.gram_matrix(conv4_1activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv4_2activ_aug) - self.gram_matrix(conv4_2activ_real))) + \
            tf.square(tf.norm(self.gram_matrix(conv4_3activ_aug) - self.gram_matrix(conv4_3activ_real)))
            )
        ## blur constraints to prevent it from going to 0 (which will give nans)
        #self.sigmas_loss = -tf.minimum(tf.reduce_min(sigmas),0)*100000
        ## calc total loss
        self.loss_train = self.style_loss / 1e6
        #self.loss_train = self.style_loss/1e6 + self.sigmas_loss
    ##
    with tf.variable_scope('optimizer'):
        self.step = tf.placeholder(tf.float32, [], 'step')
        # learning rate decays as exp(-step / tau); `step` is fed at run time
        lr = self.learning_rate * tf.exp(-self.step / self.tau)
        self.train_step = tf.train.AdamOptimizer(lr).minimize(self.loss_train)
    #
    tf.summary.scalar('learning rate', lr)
    if self.log_weights:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.name, var)
def main():
    """Train an embedding model with per-branch and fusion losses.

    Parses the command-line arguments (persisting them to `args.json` in the
    experiment root, or reconciling them with the stored ones on `--resume`),
    builds the PK-batch input pipeline, the network, the summed losses and the
    optimizer, then runs the training loop with periodic checkpoints.
    """
    args = parser.parse_args()

    # We store all arguments in a json file. This has two advantages:
    # 1. We can always get back and see what exactly that experiment was
    # 2. We can resume an experiment as-is without needing to remember all flags.
    args_file = os.path.join(args.experiment_root, 'args.json')
    if args.resume:
        if not os.path.isfile(args_file):
            raise IOError('`args.json` not found in {}'.format(args_file))

        print('Loading args from {}.'.format(args_file))
        with open(args_file, 'r') as f:
            args_resumed = json.load(f)
        args_resumed['resume'] = True  # This would be overwritten.

        # When resuming, we not only want to populate the args object with the
        # values from the file, but we also want to check for some possible
        # conflicts between loaded and given arguments.
        for key, value in args.__dict__.items():
            if key in args_resumed:
                resumed_value = args_resumed[key]
                if resumed_value != value:
                    print('Warning: For the argument `{}` we are using the'
                          ' loaded value `{}`. The provided value was `{}`'
                          '.'.format(key, resumed_value, value))
                    # the user decides interactively which of the two values wins
                    comand = input('Would you like to restore it?(yes/no)')
                    if comand == 'yes':
                        args.__dict__[key] = resumed_value
                        print('For the argument `{}` we are using the loaded value `{}`.'.format(key, args.__dict__[key]))
                    else:
                        print('For the argument `{}` we are using the provided value `{}`.'.format(key, args.__dict__[key]))
            else:
                print('Warning: A new argument was added since the last run:'
                      ' `{}`. Using the new value: `{}`.'.format(key, value))
        # rewrite args.json with the reconciled values
        os.remove(args_file)
        with open(args_file, 'w') as f:
            json.dump(vars(args), f, ensure_ascii=False, indent=2, sort_keys=True)

    else:
        # If the experiment directory exists already, we bail in fear.
        if os.path.exists(args.experiment_root):
            if os.listdir(args.experiment_root):
                print('The directory {} already exists and is not empty.'
                      ' If you want to resume training, append --resume to'
                      ' your call.'.format(args.experiment_root))
                exit(1)
        else:
            os.makedirs(args.experiment_root)

        # Store the passed arguments for later resuming and grepping in a nice
        # and readable format.
        with open(args_file, 'w') as f:
            json.dump(vars(args), f, ensure_ascii=False, indent=2, sort_keys=True)

    log_file = os.path.join(args.experiment_root, "train")
    logging.config.dictConfig(common.get_logging_dict(log_file))
    log = logging.getLogger('train')

    # Also show all parameter values at the start, for ease of reading logs.
    log.info('Training using the following parameters:')
    for key, value in sorted(vars(args).items()):
        log.info('{}: {}'.format(key, value))

    # Check them here, so they are not required when --resume-ing.
    if not args.train_set:
        parser.print_help()
        log.error("You did not specify the `train_set` argument!")
        sys.exit(1)
    if not args.image_root:
        parser.print_help()
        log.error("You did not specify the required `image_root` argument!")
        sys.exit(1)

    # Load the data from the TxT file. see Common.load_dataset function for details
    pids, fids = common.load_dataset(args.train_set, args.image_root)
    max_fid_len = max(map(len, fids))  # We'll need this later for logfiles.

    # Setup a tf.Dataset where one "epoch" loops over all PIDS.
    # PIDS are shuffled after every epoch and continue indefinitely.
    unique_pids = np.unique(pids)
    dataset = tf.data.Dataset.from_tensor_slices(unique_pids)
    dataset = dataset.shuffle(len(unique_pids))

    # Constrain the dataset size to a multiple of the batch-size, so that
    # we don't get overlap at the end of each epoch.
    dataset = dataset.take((len(unique_pids) // args.batch_p) * args.batch_p)
    dataset = dataset.repeat(None)  # Repeat forever. Funny way of stating it.

    # For every PID, get K images.
    dataset = dataset.map(lambda pid: sample_k_fids_for_pid(
        pid, all_fids=fids, all_pids=pids, batch_k=args.batch_k
    ))  # now the dataset has been modified as [selected_fids
    # , pid] due to the return of the function 'sample_k_fids_for_pid'

    # Ungroup/flatten the batches for easy loading of the files.
    dataset = dataset.apply(tf.contrib.data.unbatch())

    # Convert filenames to actual image tensors.
    net_input_size = (args.net_input_height, args.net_input_width)
    pre_crop_size = (args.pre_crop_height, args.pre_crop_width)
    dataset = dataset.map(
        lambda fid, pid: common.fid_to_image(fid,
                                             pid,
                                             image_root=args.image_root,
                                             image_size=pre_crop_size if args.crop_augment else net_input_size),
        num_parallel_calls=args.loading_threads
    )  # now the dataset has been modified as [selected_images
    # , fid, pid] due to the return of the function 'fid_to_image'

    # Augment the data if specified by the arguments.
    if args.flip_augment:
        dataset = dataset.map(lambda im, fid, pid: (tf.image.random_flip_left_right(im), fid, pid))
    if args.crop_augment:
        dataset = dataset.map(lambda im, fid, pid: (tf.random_crop(im, net_input_size + (3, )), fid, pid))

    # Group it back into PK batches.
    batch_size = args.batch_p * args.batch_k
    dataset = dataset.batch(batch_size)

    # Overlap producing and consuming for parallelism.
    dataset = dataset.prefetch(1)

    # Since we repeat the data infinitely, we only need a one-shot iterator.
    images, fids, pids = dataset.make_one_shot_iterator().get_next()

    # Create the model and an embedding head.
    model = import_module('nets.' + args.model_name)
    head = import_module('heads.' + args.head_name)

    # Feed the image through the model. The returned `body_prefix` will be used
    # further down to load the pre-trained weights for all variables with this
    # prefix.
    endpoints, body_prefix = model.endpoints(images, is_training=True)
    # the 'fusion' head additionally receives the model name
    if args.head_name == 'fusion':
        with tf.name_scope('head'):
            endpoints = head.head(endpoints, args.embedding_dim, args.model_name, is_training=True)
    else:
        with tf.name_scope('head'):
            endpoints = head.head(endpoints, args.embedding_dim, is_training=True)

    # Create the loss in two steps:
    # 1. Compute all pairwise distances according to the specified metric.
    # 2. For each anchor along the first dimension, compute its loss.
    # dists = loss.cdist(endpoints['emb'], endpoints['emb'], metric=args.metric)
    # losses, train_top1, prec_at_k, _, neg_dists, pos_dists = loss.LOSS_CHOICES[args.loss](
    #     dists, pids, args.margin, batch_precision_at_k=args.batch_k-1)
    # # '_' stands for the boolean matrix shows topK where the correct match of the identities occurs
    # shape=(batch_size,K)

    # Modified loss: one loss per feature branch plus one on the fusion layer.
    dists1 = loss.cdist(endpoints['feature1'], endpoints['feature1'], metric=args.metric)
    losses1, _, _, _, _, _ = loss.LOSS_CHOICES[args.loss](
        dists1, pids, args.margin, batch_precision_at_k=args.batch_k - 1)

    dists2 = loss.cdist(endpoints['feature2'], endpoints['feature2'], metric=args.metric)
    losses2, _, _, _, _, _ = loss.LOSS_CHOICES[args.loss](
        dists2, pids, args.margin, batch_precision_at_k=args.batch_k - 1)

    dists3 = loss.cdist(endpoints['feature3'], endpoints['feature3'], metric=args.metric)
    losses3, _, _, _, _, _ = loss.LOSS_CHOICES[args.loss](
        dists3, pids, args.margin, batch_precision_at_k=args.batch_k - 1)

    dists4 = loss.cdist(endpoints['feature4'], endpoints['feature4'], metric=args.metric)
    losses4, _, _, _, _, _ = loss.LOSS_CHOICES[args.loss](
        dists4, pids, args.margin, batch_precision_at_k=args.batch_k - 1)

    # only the fusion-layer loss provides the logged top-1 / precision / distance metrics
    dists_fu = loss.cdist(endpoints['fusion_layer'], endpoints['fusion_layer'], metric=args.metric)
    losses_fu, train_top1, prec_at_k, _, neg_dists, pos_dists = loss.LOSS_CHOICES[args.loss](
        dists_fu, pids, args.margin, batch_precision_at_k=args.batch_k - 1)

    losses = losses1 + losses2 + losses3 + losses4 + losses_fu

    # losses, train_top1, prec_at_k, _, neg_dists, pos_dists = loss_m.LOSS_CHOICES[args.loss](
    #     endpoints, pids, args.margin, args.model_name, batch_precision_at_k=args.batch_k - 1, metric =args.metric
    # )

    # Count the number of active entries, and compute the total batch loss.
    num_active = tf.reduce_sum(tf.cast(tf.greater(losses, 1e-5), tf.float32))
    # Here `losses` is the part where the positive pair exceeds the negative pair + margin.
    loss_mean = tf.reduce_mean(losses)

    # Some logging for tensorboard.
    tf.summary.histogram('loss_distribution', losses)
    tf.summary.scalar('loss', loss_mean)
    tf.summary.scalar('batch_top1', train_top1)
    tf.summary.scalar('batch_prec_at_{}'.format(args.batch_k - 1), prec_at_k)
    tf.summary.scalar('active_count', num_active)
    #tf.summary.histogram('embedding_dists', dists)
    tf.summary.histogram('embedding_pos_dists', pos_dists)
    tf.summary.histogram('embedding_neg_dists', neg_dists)
    tf.summary.histogram('embedding_lengths', tf.norm(endpoints['emb_raw'], axis=1))

    # Create the mem-mapped arrays in which we'll log all training detail in
    # addition to tensorboard, because tensorboard is annoying for detailed
    # inspection and actually discards data in histogram summaries.
    if args.detailed_logs:
        log_embs = lb.create_or_resize_dat(
            os.path.join(args.experiment_root, 'embeddings'),
            dtype=np.float32,
            shape=(args.train_iterations, batch_size, args.embedding_dim))
        log_loss = lb.create_or_resize_dat(
            os.path.join(args.experiment_root, 'losses'),
            dtype=np.float32,
            shape=(args.train_iterations, batch_size))
        log_fids = lb.create_or_resize_dat(
            os.path.join(args.experiment_root, 'fids'),
            dtype='S' + str(max_fid_len),
            shape=(args.train_iterations, batch_size))

    # These are collected here before we add the optimizer, because depending
    # on the optimizer, it might add extra slots, which are also global
    # variables, with the exact same prefix.
    model_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, body_prefix)

    # Define the optimizer and the learning-rate schedule.
    # Unfortunately, we get NaNs if we don't handle no-decay separately.
    global_step = tf.Variable(0, name='global_step', trainable=False)  # 'global_step' means the number of batches seen
    # by graph
    if 0 <= args.decay_start_iteration < args.train_iterations:
        learning_rate = tf.train.exponential_decay(
            args.learning_rate,
            tf.maximum(0, global_step - args.decay_start_iteration),  # decay every 'lr_decay_steps' after the
            # 'decay_start_iteration'
            # args.train_iterations - args.decay_start_iteration, args.weight_decay_factor)
            args.lr_decay_steps, args.lr_decay_factor, staircase=True)
    else:
        learning_rate = args.learning_rate  # the case when we set 'decay_start_iteration' as -1
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate, epsilon=1e-3)
    # Feel free to try others!
    # optimizer = tf.train.AdadeltaOptimizer(learning_rate)

    # Update_ops are used to update batchnorm stats.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = optimizer.minimize(loss_mean, global_step=global_step)

    # Define a saver for the complete model.
    checkpoint_saver = tf.train.Saver(max_to_keep=0)

    # NOTE(review): `config` is not defined inside this function; presumably a
    # module-level session config - confirm.
    with tf.Session(config=config) as sess:
        if args.resume:
            # In case we're resuming, simply load the full checkpoint to init.
            last_checkpoint = tf.train.latest_checkpoint(args.experiment_root)
            log.info('Restoring from checkpoint: {}'.format(last_checkpoint))
            checkpoint_saver.restore(sess, last_checkpoint)
        else:
            # But if we're starting from scratch, we may need to load some
            # variables from the pre-trained weights, and random init others.
            sess.run(tf.global_variables_initializer())
            if args.initial_checkpoint is not None:
                saver = tf.train.Saver(model_variables)
                saver.restore(sess, args.initial_checkpoint)  # restore the pre-trained parameter from online model

            # In any case, we also store this initialization as a checkpoint,
            # such that we could run exactly re-producable experiments.
            checkpoint_saver.save(sess, os.path.join(args.experiment_root, 'checkpoint'), global_step=0)

        merged_summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(args.experiment_root, sess.graph)

        start_step = sess.run(global_step)
        log.info('Starting training from iteration {}.'.format(start_step))

        # Finally, here comes the main-loop. This `Uninterrupt` is a handy
        # utility such that an iteration still finishes on Ctrl+C and we can
        # stop the training cleanly.
        with lb.Uninterrupt(sigs=[SIGINT, SIGTERM], verbose=True) as u:
            for i in range(start_step, args.train_iterations):

                # Compute gradients, update weights, store logs!
                start_time = time.time()
                _, summary, step, b_prec_at_k, b_embs, b_loss, b_fids = \
                    sess.run([train_op, merged_summary, global_step,
                              prec_at_k, endpoints['emb'], losses, fids])
                elapsed_time = time.time() - start_time

                # Compute the iteration speed and add it to the summary.
                # We did observe some weird spikes that we couldn't track down.
                summary2 = tf.Summary()
                summary2.value.add(tag='secs_per_iter', simple_value=elapsed_time)
                summary_writer.add_summary(summary2, step)
                summary_writer.add_summary(summary, step)

                if args.detailed_logs:
                    log_embs[i], log_loss[i], log_fids[i] = b_embs, b_loss, b_fids

                # Do a huge print out of the current progress.
                seconds_todo = (args.train_iterations - step) * elapsed_time
                log.info('iter:{:6d}, loss min|avg|max: {:.3f}|{:.3f}|{:6.3f}, '
                         'batch-p@{}: {:.2%}, ETA: {} ({:.2f}s/it)'.format(
                             step,
                             float(np.min(b_loss)),
                             float(np.mean(b_loss)),
                             float(np.max(b_loss)),
                             args.batch_k - 1, float(b_prec_at_k),
                             timedelta(seconds=int(seconds_todo)),
                             elapsed_time))
                sys.stdout.flush()
                sys.stderr.flush()

                # Save a checkpoint of training every so often.
                if (args.checkpoint_frequency > 0 and
                        step % args.checkpoint_frequency == 0):
                    checkpoint_saver.save(sess, os.path.join(args.experiment_root, 'checkpoint'), global_step=step)

                # Stop the main-loop at the end of the step, if requested.
                if u.interrupted:
                    log.info("Interrupted on request!")
                    break

        # Store one final checkpoint. This might be redundant, but it is crucial
        # in case intermediate storing was disabled and it saves a checkpoint
        # when the process was interrupted.
        # NOTE(review): `step` is only assigned inside the loop above; if the loop
        # body never runs (start_step >= train_iterations) it is unbound here.
        checkpoint_saver.save(sess, os.path.join(args.experiment_root, 'checkpoint'), global_step=step)
# Fragment of a training step: average the per-environment metrics, build the
# regularised loss (nll + L2 weight penalty + annealed penalty term) and apply
# one optimizer update.
# NOTE(review): `env`, `tape_src`, `model`, `flags`, `step`, `optimizer` and the
# metric objects are defined outside this fragment; the enclosing tape / loop
# context is not visible here.
# env[0..2] are averaged for the training metrics; env[3] supplies the test accuracy
train_nll = tf.reduce_mean([env[0][0], env[1][0], env[2][0]])
train_accuracy = tf.reduce_mean([env[0][1], env[1][1], env[2][1]])
train_penalty = tf.reduce_mean([env[0][2], env[1][2], env[2][2]])
test_accuracy = env[3][1]
train_loss(train_nll)
train_acc(train_accuracy)
test_acc(test_accuracy)
tape_src.watch(train_nll)
# NOTE(review): the second positional argument of tf.zeros is `dtype`; passing 1
# here is presumably meant as a shape of (1, 1) - confirm.
weight_norm = tf.zeros(1,1)
# accumulate the squared norm of every trainable variable
for w in model.trainable_variables:
    weight_norm += tf.norm(w)**2
loss = train_nll
loss += flags.l2_regularizer_weight * weight_norm
# the full penalty weight only applies once `penalty_anneal_iters` steps have passed;
# before that a fixed weight of 0.01 is used
penalty_weight = (flags.penalty_weight if step >= flags.penalty_anneal_iters else 0.01)
loss += penalty_weight * train_penalty
if penalty_weight > 1.0:
    # Rescale the entire loss to keep gradients in a reasonable range
    loss /= penalty_weight
# update weights of classifier
grads = tape_src.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
# `step % 1 == 0` is always true for integer steps, so the header is printed every step
if step % 1 == 0:
    pretty_print('epoch', 'train nll', 'train acc', 'test acc')
def radial_flow_modified(self, z, flow_params, num_flows, n_latent_dim, invert_condition=True): """ Created on 12-Aug-2017 """ z0s, alphas, betas = flow_params print "z0s shape:", z0s.get_shape() print "alphas shape:", alphas.get_shape() print "betas shape:", betas.get_shape() log_detjs = [] if num_flows == 0: # f_z = z sum_logdet_jacobian = tf.Variable(0.0, dtype=tf.float32) else: for k in range(num_flows): # z0, alpha, beta = z0s[:, k*Z:(k+1)*Z], alphas[:, k*Z:(k+1)*Z], betas[:, k] z0, alpha, beta = z0s[:, k * n_latent_dim:(k + 1) * n_latent_dim], \ alphas[:, k], betas[:, k] print "z0 shape", z0.get_shape() print "alpha shape", alpha.get_shape() print "beta shape", beta.get_shape() if invert_condition: # m(x)= log(1 + exp(x)) where x= w'*u. Last equation in A.2 Radial Flows. m_of_beta = self.softplus(beta) print "m_of_beta", m_of_beta.get_shape() print "alpha", alpha.get_shape() beta_hat = -alpha + m_of_beta # It's a scalar. print "beta_hat", beta_hat.get_shape() else: beta_hat = beta print "beta_hat", beta_hat.get_shape() # beta_hat = tf.expand_dims(beta_hat,1) # Distance of each data point from z0 # dist = (z - z0) ** 2 # dist = tf.reduce_sum(dist, 1) # r = tf.sqrt(dist) r = tf.norm((z - z0), ord='euclidean', axis=1) # r= tf.sqrt(np.sum(((self.z-self.z0)**2),1)) # m_of_beta = self.softplus(self.beta) # m(x)= log(1 + exp(x)) where x= w'*u. Last equation in A.2 Radial Flows. # beta_hat = -self.alpha + m_of_beta # It's a scalar. h_alpha_r = self.get_h( r, alpha) # Argument of h(.) in equation 14. 
(1000000,) print "beta_hat", beta_hat.get_shape() beta_h_alpha_r = beta_hat * h_alpha_r print "beta_h_alpha_r", beta_h_alpha_r.get_shape() # fz = self.z + beta_hat * tf.mul(tf.transpose(tf.expand_dims(h_alpha_r, 1)), # (self.z-self.z0)) # print "h_alpha_r shape", tf.expand_dims(h_alpha_r,1).get_shape() # z = z + beta_hat * tf.multiply((z-z0), h_alpha_r) # z = z + tf.multiply(tf.multiply((z-z0), h_alpha_r), beta_hat) # z = z + tf.multiply(tf.multiply((z - z0), tf.expand_dims(h_alpha_r, 1)), tf.expand_dims(beta_hat, 1)) z = z + tf.multiply( (z - z0), tf.expand_dims(beta_h_alpha_r, 1)) # print "z shape", z.get_shape() # Calculation of log det jacobian h_derivative_alpha_r = self.get_derivative_h(r, alpha) beta_h_derivative_alpha_r = beta_hat * h_derivative_alpha_r # logdet_jacobian = tf.log(1e-6 + tf.multiply(((1 + beta_h_alpha_r) ** (n_latent_dim - 1)), # (1 + h_derivative_alpha_r * r + beta_h_alpha_r))) logdet_jacobian = tf.log( 1e-6 + ((1.0 + beta_h_alpha_r)**(n_latent_dim - 1)) * (1.0 + beta_h_alpha_r + beta_h_derivative_alpha_r * r)) log_detjs.append(tf.expand_dims(logdet_jacobian, 1)) logdet_jacobian = tf.concat(log_detjs[0:num_flows + 1], axis=1) sum_logdet_jacobian = tf.reduce_sum(logdet_jacobian, axis=1) return z, sum_logdet_jacobian
def unit(vector):
    """Return ``vector`` as a tensor divided by its norm (``tf.norm`` default)."""
    as_tensor = tf.convert_to_tensor(vector)
    length = tf.norm(vector)
    return as_tensor / length
def model_fn(self, features, labels, mode, params, config):
    """Build the generator/critic graph and return the spec for ``mode``.

    A uniform random code is pushed through the generator, the critic is
    applied to real, synthetic and interpolated images, and the losses are
    combined into one scalar. PREDICT mode returns the prediction spec, TRAIN
    the training spec, and any other mode the evaluation spec.
    """
    mode_keys = tf.estimator.ModeKeys
    image = features['image']
    batch_size = tf.shape(image)[0]
    training = mode == mode_keys.TRAIN

    # Generator: code in [-1, 1) -> sigmoid image.
    self._generator = layers.Segment(self.generative_network(params),
                                     name="generator")
    code = tf.random_uniform(shape=(batch_size, ) + self._code_shape,
                             minval=-1.,
                             maxval=1.,
                             dtype=tf.float32)
    generated = self._generator.apply(code, training=training)
    synthetic = tf.nn.sigmoid(generated)
    synthetic_ng = tf.stop_gradient(synthetic)

    # One scalar mixing factor shared by the whole batch.
    epsilon = tf.random_uniform(shape=(),
                                minval=0,
                                maxval=1.,
                                dtype=tf.float32)
    synthmix = epsilon * image + (1 - epsilon) * synthetic_ng

    # Critic, applied to each of the four inputs.
    self._critic = layers.Segment(self.critic_network(params), name="critic")
    critic_apply = self._critic.apply
    f_synth = critic_apply(synthetic, training=training)
    f_synth_ng = critic_apply(synthetic_ng, training=training)
    f_data = critic_apply(image, training=training)
    f_mix = critic_apply(synthmix, training=training)
    f_grad = tf.gradients(f_mix, synthmix)

    # Optional encoder that maps synthetic images back to codes.
    if self._autoencoder:
        self._encoder = layers.Segment(self.encoder_network(params),
                                       name="encoder")
        code_ae = self._encoder.apply(synthetic, training=training)

    # Losses
    loss_wgan = tf.reduce_mean(f_data - f_synth)
    loss_ae = tf.constant(0, dtype=tf.float32)
    if self._autoencoder:
        batch_size_f = tf.cast(batch_size, dtype=tf.float32)
        loss_ae = tf.nn.l2_loss(code - code_ae) / batch_size_f
    loss_crit = -tf.reduce_mean(f_data - f_synth_ng)
    grad_norm = tf.norm(f_grad, ord=2)
    loss_lip = tf.square(grad_norm - 1)
    # The Lipschitz penalty damps the adversarial terms via exp(-penalty).
    alpha = tf.exp(-1 * tf.stop_gradient(loss_lip))
    loss = alpha * (0.2 * loss_wgan + loss_crit) + 10 * loss_lip
    loss += sum(list(self._generator.losses))
    # NOTE(review): `_classifier` is not assigned in this method -- confirm it
    # exists on the instance (the critic's losses may have been intended).
    loss += sum(list(self._classifier.losses))
    # NOTE(review): `_encoder` is only assigned above when `_autoencoder` is
    # truthy -- confirm it is initialised elsewhere.
    if self._encoder:
        loss += sum(list(self._encoder.losses))

    if mode == mode_keys.PREDICT:
        return self.prediction_estimator_spec(image, code, synthetic, params,
                                              config)

    tf.summary.scalar('loss/wgan', loss_wgan)
    tf.summary.scalar('loss/lip', loss_lip)
    tf.summary.scalar('loss/ae', loss_ae)

    # TRAIN gets the training op; every remaining mode is evaluated.
    if mode != mode_keys.TRAIN:
        return self.evaluation_estimator_spec(loss, image, code, synthetic,
                                              params, config)
    return self.training_estimator_spec(loss, image, code, synthetic, params,
                                        config)
def Loss(self,Newvar_D, Newvar_lbda):
    """Return 1000 * 0.5 * ||datapoint - exp(wass_grad((D, lbda)))||^2.

    ``self.wass_grad`` receives the two arguments packed as one tuple; its
    result is exponentiated (without normalisation) and compared with
    ``self.datapoint`` using the Euclidean norm.
    """
    packed_args = (Newvar_D, Newvar_lbda)
    reconstruction = tf.math.exp(self.wass_grad(packed_args))
    residual_norm = tf.norm(self.datapoint - reconstruction, ord='euclidean')
    return residual_norm**2*1/2*1000
def accuracy(output,label):
    """Return the relative error ``||output - label|| / ||label||``.

    Despite the name, smaller values mean a closer match (0 is exact).
    """
    error_norm = tf.norm(output - label)
    reference_norm = tf.norm(label)
    return error_norm / reference_norm
def __init__(self, num_symbols, num_embed_units, num_units, num_layers, num_labels, embed, learning_rate=0.005, max_gradient_norm=5.0, param_da=150, param_r=10, model_type=0, attention=True):
    """Build the graph of a bidirectional RNN text classifier.

    Args:
        num_symbols: vocabulary size (used only when ``embed`` is None).
        num_embed_units: embedding width (used only when ``embed`` is None).
        num_units: hidden units per RNN cell.
        num_layers: stacked cells per direction.
        num_labels: number of output classes.
        embed: pre-trained embedding initializer, or None for a fresh
            variable.
        learning_rate: initial value of the (non-trainable) learning rate.
        max_gradient_norm: global-norm clipping threshold.
        param_da: hidden size of the self-attention projection.
        param_r: number of attention rows.
        model_type: 2 selects GRU, 1 LSTM, anything else a basic RNN cell.
        attention: use self-attention pooling (with a penalty term) instead
            of mean pooling.
    """
    # ---- inputs -------------------------------------------------------
    self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # [batch, length]
    self.texts_length = tf.placeholder(tf.int32, (None, ), 'texts_length')  # [batch]
    self.labels = tf.placeholder(tf.int64, (None, ), 'labels')  # [batch]

    # Vocabulary table (string -> index); unknown tokens map to UNK_ID.
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         name="in_table",
                                         checkpoint=True)
    batch_size = tf.shape(self.texts)[0]

    # ---- training state -----------------------------------------------
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)

    self.index_input = self.symbol2index.lookup(self.texts)  # [batch, length]

    # ---- embedding table (index -> vector) ----------------------------
    if embed is not None:
        # Start from the pre-trained word vectors.
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    else:
        # Random initialisation.
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units],
                                     tf.float32)
    # [batch, length, num_embed_units]
    self.embed_input = tf.nn.embedding_lookup(self.embed, self.index_input)

    # ---- bidirectional multi-layer RNN --------------------------------
    cell_by_type = {2: GRUCell, 1: BasicLSTMCell}
    rnn_model = cell_by_type.get(model_type, BasicRNNCell)
    cell_fw = MultiRNNCell([rnn_model(num_units) for _ in range(num_layers)])
    cell_bw = MultiRNNCell([rnn_model(num_units) for _ in range(num_layers)])
    outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                      cell_bw,
                                                      self.embed_input,
                                                      self.texts_length,
                                                      dtype=tf.float32,
                                                      scope="rnn")
    hidden = tf.concat(outputs, 2)  # (batch, length, 2*num_units)

    # ---- pooling + projection -----------------------------------------
    with tf.variable_scope('logits'):
        if attention:
            # Self-attention: softmax(Ws2 * tanh(Ws1 * H^T)), then attend.
            Ws1 = tf.get_variable("Ws1", [param_da, 2 * num_units])
            Ws2 = tf.get_variable("Ws2", [param_r, param_da])
            projected = tf.nn.tanh(tf.matmul(Ws1, hidden, transpose_b=True))
            attn = tf.nn.softmax(tf.matmul(Ws2, projected))
            attended = tf.matmul(attn, hidden)  # [batch, param_r, 2*num_units]
            pooled = tf.reshape(attended,
                                shape=[batch_size, param_r * 2 * num_units])
        else:
            # Plain mean pooling over the time axis.
            averaged = tf.reduce_mean(hidden, axis=1)
            pooled = tf.reshape(averaged, shape=[batch_size, 2 * num_units])
        logits = tf.layers.dense(pooled,
                                 num_labels,
                                 activation=None,
                                 name='projection')  # [batch, num_labels]

    # ---- loss ----------------------------------------------------------
    # One (param_r x param_r) identity matrix per batch element.
    tiled_eye = tf.tile(tf.diag(tf.ones([param_r])), [batch_size, 1])
    identity = tf.reshape(tiled_eye, [batch_size, param_r, param_r])
    if attention:
        # Norm of (A A^T - I), added to the loss with weight 0.0001.
        self.penalized_term = tf.norm(
            tf.matmul(attn, attn, transpose_b=True) - identity)
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    if attention:
        self.loss = cross_entropy + 0.0001 * self.penalized_term
    else:
        self.loss = cross_entropy

    # ---- metrics -------------------------------------------------------
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    hits = tf.cast(tf.equal(self.labels, predict_labels), tf.int32)
    # Number of correct predictions in the batch (a count, not a ratio).
    self.accuracy = tf.reduce_sum(hits, name='accuracy')

    # ---- optimisation --------------------------------------------------
    self.params = tf.trainable_variables()
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=6,
                                pad_step_number=True)
# Batch-mean Euclidean distance between the anchor features and each of the
# two comparison feature sets (row-wise distance, then averaged).
_sq_diff_y1 = tf.pow(tf.subtract(fc_featsx, fc_featsy1), 2)
_row_dist_y1 = tf.sqrt(tf.reduce_sum(_sq_diff_y1, 1, keep_dims=True))
distance1 = tf.reduce_mean(_row_dist_y1)

_sq_diff_y2 = tf.pow(tf.subtract(fc_featsx, fc_featsy2), 2)
_row_dist_y2 = tf.sqrt(tf.reduce_sum(_sq_diff_y2, 1, keep_dims=True))
distance2 = tf.reduce_mean(_row_dist_y2)

# Hinge on the distance gap with margin `alpha`.
loss0 = tf.maximum(distance1 - distance2 + alpha, 0)

# Norm of the whole (x - y1) feature difference; tf.norm without an axis
# already yields a scalar, so the reduce_mean leaves the value unchanged.
_feat_gap_y1 = tf.norm(tf.subtract(fc_featsx, fc_featsy1))
loss3a = tf.reduce_mean(_feat_gap_y1)
loss3 = loss3a

# Both classification heads are scored against the y1 labels.
_xent_x = tf.nn.softmax_cross_entropy_with_logits(logits=predx, labels=ly1)
loss4 = tf.reduce_mean(_xent_x)
_xent_y1 = tf.nn.softmax_cross_entropy_with_logits(logits=predy1, labels=ly1)
loss5 = tf.reduce_mean(_xent_y1)

# Sum of the two cross terms: ||x - duy1|| + ||y1 - dux||.
_cross_x = tf.reduce_mean(tf.norm(tf.subtract(x, duy1)))
_cross_y1 = tf.reduce_mean(tf.norm(tf.subtract(y1, dux)))
loss9 = _cross_x + _cross_y1
def test_nets_and_update(env, config):
    """Check that the target-network update copies the q-network weights.

    Two networks are built under the scopes ``q_test`` and ``target_q_test``.
    Their non-bias variables must differ after initialisation and must be
    identical once the update op has run.

    Args:
        env: environment handed to ``DQNquantie``.
        config: configuration handed to ``DQNquantie``.

    Raises:
        AssertionError: if the variable counts differ, if both networks start
            with identical weights, or if the update leaves any difference.
    """
    tf.reset_default_graph()
    model = DQNquantie(env, config)

    # inject test data
    s = tf.ones([1, 80, 80, 4], dtype=tf.float32)
    sp = tf.ones([1, 80, 80, 4], dtype=tf.float32)

    # Building the q-value ops creates the variables of both networks.
    model.get_q_values_op(s, scope="q_test", reuse=False)
    model.get_q_values_op(sp, scope="target_q_test", reuse=False)

    # create update_op
    q_test_var_lst = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, "q_test")
    target_q_test_var_lst = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, "target_q_test")
    update_target_op = model.add_update_target_op("q_test", "target_q_test")

    assert len(q_test_var_lst) == len(target_q_test_var_lst), \
        "number of variables in q and target_q differ"

    # Euclidean distance between each pair of matching variables. Biases are
    # skipped because they are initialised with zeros. The ops are built once
    # and reused, so the graph does not grow with every check.
    distance_ops = [
        tf.norm(q_var - target_var)
        for q_var, target_var in zip(q_test_var_lst, target_q_test_var_lst)
        if 'bias' not in q_var.name
    ]

    # The context manager releases the session even when an assert fails.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # check difference before the update
        distance_before_lst = sess.run(distance_ops)
        assert np.mean(distance_before_lst) != 0., \
            'q and taget_q initialized with the same weights'

        # perform update
        sess.run(update_target_op)

        # check difference after the update
        distance_after_lst = sess.run(distance_ops)
        assert np.mean(distance_after_lst) == 0., \
            'network creation and update test failed'

    print(" -- network creation and update test passed")
def my_segcap(images, is_train, size, l2_reg):
    """Build a U-shaped capsule segmentation network.

    The encoder applies two stride-2 capsule convolutions, the middle holds
    two residual capsule blocks, and the decoder applies two stride-2
    capsule deconvolutions with skip connections concatenated on axis 3.
    The prediction is the norm of the final capsules along their last axis,
    followed by batch normalisation.

    Args:
        images: input batch fed to the first 1x1 convolution.
        is_train: not read by this function (see the note below).
        size: not read by this function.
        l2_reg: L2 regularisation scale for the first convolution.

    Returns:
        ``(predict, end_points)`` where ``end_points`` is an ``OrderedDict``
        holding every intermediate layer plus ``'predict'``.
    """
    # NOTE(review): `is_train` is ignored and batch norm is always built with
    # is_training=True -- confirm this is intended for inference as well.
    is_training = True
    start_s = 2
    atom = 16
    routing = 3

    def res_block(inputs):
        # Residual capsule block with the shared routing count.
        return residual_cap_block(inputs, routing=routing)

    def caps(inputs, mode, stride, num_types, atoms=atom):
        # 3x3 capsule (de)convolution with the shared routing count.
        return capsule(inputs,
                       mode,
                       k=3,
                       s=stride,
                       t=num_types,
                       z=atoms,
                       routing=routing)

    # 1 (128 -> 128)
    L1_conv1 = conv(images,
                    filters=atom,
                    kernel_size=[1, 1],
                    l2_reg_scale=l2_reg,
                    batchnorm_istraining=is_training)
    conv_prime = tf.expand_dims(L1_conv1, axis=3)  # [N, H, W, t=1, z]

    # 1/2 (128 -> 64)
    L2_cap1_1 = res_block(conv_prime)
    L3_cap1_2 = caps(L2_cap1_1, "conv", 2, start_s * 1)
    skip1 = L2_cap1_1

    # 1/4 (64 -> 32)
    L4_cap2_1 = res_block(L3_cap1_2)
    L5_cap2_2 = caps(L4_cap2_1, "conv", 2, start_s * 2)
    skip2 = L4_cap2_1

    # middle
    L6_cap_m_1 = res_block(L5_cap2_2)
    L7_cap_m_2 = res_block(L6_cap_m_1)

    # 1/4 (32 -> 64)
    L8_u_cap2_1 = caps(L7_cap_m_2, "deconv", 2, start_s * 2)
    u_cap_concat_2 = tf.concat([L8_u_cap2_1, skip2], axis=3)
    L9_u_cap2_3 = res_block(u_cap_concat_2)

    # 1/2 (64 -> 128)
    L10_u_cap3_1 = caps(L9_u_cap2_3, "deconv", 2, start_s * 1)
    u_cap_concat_3 = tf.concat([L10_u_cap3_1, skip1], axis=3)
    L11_u_cap3_2 = caps(u_cap_concat_3, "conv", 1, start_s * 1, atoms=atom * 4)
    L12_u_cap3_3 = res_block(L11_u_cap3_2)
    L13_u_cap3_4 = res_block(L12_u_cap3_3)
    L14_u_cap3_5 = caps(L13_u_cap3_4, "conv", 1, 1)

    # Capsule length as the prediction: with axis=-1, tf.norm reduces the
    # last axis of each capsule using its default norm.
    predict = tf.norm(L14_u_cap3_5, axis=-1)
    predict = bn(predict, is_training)

    # Intermediate layers, exposed for visualisation.
    end_points = OrderedDict([
        ('L1_conv1', L1_conv1),          # Layer 1
        ('L2_cap1_1', L2_cap1_1),        # Layer 2, skip1
        ('L3_cap1_2', L3_cap1_2),        # Layer 3
        ('L4_cap2_1', L4_cap2_1),        # Layer 4, skip2
        ('L5_cap2_2', L5_cap2_2),        # Layer 5
        ('L6_cap_m_1', L6_cap_m_1),      # Layer 6
        ('L7_cap_m_2', L7_cap_m_2),      # Layer 7
        ('L8_u_cap2_1', L8_u_cap2_1),    # Layer 8, joined with skip2
        ('L9_u_cap2_3', L9_u_cap2_3),    # Layer 9
        ('L10_u_cap3_1', L10_u_cap3_1),  # Layer 10, joined with skip1
        ('L11_u_cap3_2', L11_u_cap3_2),  # Layer 11
        ('L12_u_cap3_3', L12_u_cap3_3),  # Layer 12
        ('L13_u_cap3_4', L13_u_cap3_4),  # Layer 13
        ('L14_u_cap3_5', L14_u_cap3_5),  # Layer 14
        ('predict', predict),            # Layer 15
    ])
    return predict, end_points
def robust_norm(x):
    """Norm along axis 2, computed after rescaling by the largest magnitude.

    ``x`` is shifted by 1e-8, divided by its maximum absolute value along
    axis 2, normed along that axis, and multiplied back by the maximum.
    """
    shifted = x + 1e-8
    peak = tf.reduce_max(tf.abs(shifted), axis=2, keep_dims=True)
    peak_flat = tf.squeeze(peak, [2])
    scaled_norm = tf.norm(shifted / peak, axis=2)
    return peak_flat * scaled_norm
def test_NN(self, net, record_path=None, save_name=None):
    """Evaluate ``net`` on the exact test solution and optionally log it.

    Args:
        net: network exposing ``forward`` and ``compute_residual`` (plus
            ``name`` and ``layers`` when results are recorded).
        record_path: path prefix; ``"rel_errs2.csv"`` is appended to it. The
            file is created with a header row if missing and one result row
            is appended. ``None`` disables recording.
        save_name: free-form label stored in the record row.

    Returns:
        ``(u_test_grid, u_test_p_grid, err_test, rel_err_test, f_res_grid)``:
        exact and predicted solutions, mean squared error, mean row-wise
        relative L2 error, and the residual grid. The residual grid is
        ``None`` unless ``sampling_method`` is 1 or 2.

    Raises:
        ValueError: if ``self.sampling_method`` is not one of 0, 1, 2, 3.
    """
    if record_path is not None:
        record_path = record_path + "rel_errs2.csv"
        # Write the header only when the record file does not exist yet.
        if not os.path.exists(record_path):
            with open(record_path, mode='w') as record:
                fields = [
                    'Problem', 'Net_struct', 'Net_setup', 'Sample', 'L',
                    'relative_err', 'save_name'
                ]
                record_writer = csv.writer(record,
                                           delimiter=',',
                                           quotechar='"',
                                           quoting=csv.QUOTE_MINIMAL)
                record_writer.writerow(fields)

    X0_dict, u_test = self.u_exact_test()
    x_tf = X0_dict["x_tf"]
    y_tf = X0_dict["y_tf"]
    t_tf = X0_dict["t_tf"]
    xi_tf = X0_dict["xi_tf"]
    target_f = tf.zeros([self.N * self.N_p_test, 1])
    if self.sampling_method == 3:
        net.h_init = tf.constant(self.h_init, dtype=tf.float32)
    u_test_p = net.forward(x_tf, y_tf, t_tf, xi_tf)
    f_res = net.compute_residual(x_tf, y_tf, t_tf, xi_tf, target_f)

    # Defaults for values that only some sampling methods produce. Without
    # them, methods 0 and 3 hit an unbound name when recording or returning.
    N_record = None
    f_res_grid = None
    if self.sampling_method == 0:
        # Project the prediction with the stored matrix V for this L.
        u_test_p = u_test_p.numpy()
        self.V = np.load(self.path_env + "V_{}.npy".format(self.L))
        u_test_p = u_test_p @ self.V.T
        u_test_p_grid = tf.constant(u_test_p, dtype=tf.float32)
        u_test_grid = tf.constant(u_test.T, dtype=tf.float32)
    elif self.sampling_method == 1 or self.sampling_method == 2:
        N_record = [self.Nf, self.Nb, self.Nn, self.N0]
        u_test_grid = tf.reshape(u_test, (self.N_p_test, self.N))
        u_test_p_grid = tf.reshape(u_test_p, (self.N_p_test, self.N))
        f_res_grid = tf.reshape(f_res, (self.N_p_test, self.N))
    elif self.sampling_method == 3:
        u_test_p = u_test_p.numpy()
        u_test_p_grid = tf.constant(u_test_p, dtype=tf.float32)
        u_test_grid = tf.constant(u_test.T, dtype=tf.float32)
    else:
        raise ValueError(
            "Unsupported sampling_method: {}".format(self.sampling_method))

    err_grid = u_test_grid - u_test_p_grid
    err_test = tf.math.reduce_mean(tf.square(err_grid))
    # Relative L2 error per row, averaged over the rows.
    relative_err_vec = tf.norm(err_grid, axis=1) / tf.norm(u_test_grid, axis=1)
    rel_err_test = tf.reduce_mean(relative_err_vec)

    if record_path is not None:
        list_info = [
            self.name, net.name, net.layers, N_record, self.L,
            rel_err_test.numpy(), save_name
        ]
        with open(record_path, 'a') as f:
            writer = csv.writer(f)
            writer.writerow(list_info)

    print("Test average error is: {0}\nRelative error is: {1}".format(
        err_test.numpy(), rel_err_test.numpy()))
    return u_test_grid, u_test_p_grid, err_test, rel_err_test, f_res_grid
def __init__(self, name, inputs, targets, n_classes, n_features, tower_setup, imgs_raw=None, original_labels=None, activation="linear", dropout=0.0, batch_norm=False, batch_norm_decay=BATCH_NORM_DECAY_DEFAULT, l2=L2_DEFAULT, negative_weighting_factor=1):
    """Fully connected embedding layer trained with a batch-hard triplet loss.

    The collapsed input is (optionally batch-normalised and) projected to
    ``n_features`` dimensions. Each embedding row then acts as an anchor: the
    farthest same-class row and the closest other-class row are selected, and
    ``softplus(hardest_positive - hardest_negative)`` is summed over all
    anchors into ``self.loss``.

    Args:
        name: variable scope for the layer.
        inputs: input tensors, collapsed by
            ``prepare_collapsed_input_and_dropout``.
        targets: class id per row, compared with ``tf.equal``.
        n_classes: not read in this constructor.
        n_features: embedding dimensionality.
        tower_setup: forwarded to the variable and batch-norm helpers.
        imgs_raw: optional raw images; when given, the anchor and the hardest
            positive/negative images are added as image summaries.
        original_labels: optional labels stored in ``self.measures``.
        activation: activation name resolved through ``get_activation``.
        dropout: dropout rate forwarded to the input preparation helper.
        batch_norm: whether to batch-normalise the input first.
        batch_norm_decay: decay forwarded to the batch-norm helper.
        l2: L2 factor for the weight variable.
        negative_weighting_factor: not read by the active code (only by a
            commented-out loss variant).
    """
    super(FullyConnectedWithTripletLoss, self).__init__()
    self.measures = {}
    inp, n_features_inp = prepare_collapsed_input_and_dropout(
        inputs, dropout)
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        if batch_norm:
            # Add two leading axes for the batch-norm helper and remove them
            # afterwards (presumably it expects 4-D input -- TODO confirm).
            inp = tf.expand_dims(inp, axis=0)
            inp = tf.expand_dims(inp, axis=0)
            inp = self.create_and_apply_batch_norm(inp, n_features_inp,
                                                   batch_norm_decay,
                                                   tower_setup)
            inp = tf.squeeze(inp, axis=[0, 1])
        W = self.create_weight_variable("W", [n_features_inp, n_features],
                                        l2, tower_setup)
        b = self.create_bias_variable("b", [n_features], tower_setup)
        z = tf.matmul(inp, W) + b
        h = get_activation(activation)(z)
        self.outputs = [h]
        if original_labels is not None:
            self.measures[Constants.EMBEDDING] = [h]
            self.measures[Constants.ORIGINAL_LABELS] = [original_labels]
        # Only the norm of the first embedding row is summarised.
        self.add_scalar_summary(tf.norm(h[0]), "embedding_norm")
        self.summaries.append(tf.summary.histogram("embedding", h))
        size = smart_shape(h)[0]
        # Added to the distances before tf.norm (see get_loss).
        eps = 1e-10

        # Debug helper: asserts that x is non-empty (and finite for float
        # tensors), then prints it. No active code calls it at present.
        def my_print(x, name):
            with tf.control_dependencies([
                    tf.assert_equal(
                        tf.reduce_all(tf.greater(tf.shape(x), 0)), True)
            ]):
                if x.dtype in (tf.float32, tf.float64):
                    with tf.control_dependencies([
                            tf.assert_equal(tf.reduce_all(tf.is_finite(x)),
                                            True)
                    ]):
                        return tf.Print(x, [
                            tf.shape(x),
                            tf.reduce_all(tf.is_finite(x)), x
                        ],
                                        name,
                                        summarize=200)
                else:
                    return tf.Print(x, [tf.shape(x), x], name)

        def get_loss(idx):
            """Triplet loss for the anchor at row ``idx``."""
            anchor = h[idx, :]
            anchor_class = targets[idx]

            # Positives share the anchor's class, excluding the anchor
            # itself; negatives are every row of another class.
            class_division = tf.equal(targets, anchor_class)
            not_self_mask = tf.logical_not(
                tf.cast(tf.one_hot(idx, depth=size), tf.bool))
            positive_output = tf.boolean_mask(
                h, tf.logical_and(class_division, not_self_mask))
            negative_output = tf.boolean_mask(
                h, tf.logical_not(class_division))

            # Hardest positive = the largest anchor-to-positive distance.
            positive_distances = tf.abs(anchor - positive_output)
            pos_dis_val = tf.norm(positive_distances + eps, axis=1)
            hardest_positive, hardest_positive_idx = tf.nn.top_k(
                pos_dis_val, 1)

            # Hardest negative = the smallest anchor-to-negative distance,
            # found as the top-1 of the negated distances.
            negative_distances = tf.abs(anchor - negative_output)
            neg_dis_val = tf.norm(negative_distances + eps, axis=1)
            minus_neg_dis_val = tf.negative(neg_dis_val)
            minus_hardest_negative, hardest_negative_idx = tf.nn.top_k(
                minus_neg_dis_val, 1)
            hardest_negative = tf.negative(minus_hardest_negative)

            # Soft-margin triplet loss; softplus(x) equals log1p(exp(x)).
            # A hard margin would instead be
            # tf.maximum(0., margin + hardest_positive - hardest_negative).
            loss = tf.nn.softplus(hardest_positive - hardest_negative)

            # Scalar placeholders keep the map_fn output structure fixed
            # when no raw images are supplied.
            anchor_img = tf.zeros([], tf.float32)
            hard_pos_img = tf.zeros([], tf.float32)
            hard_neg_img = tf.zeros([], tf.float32)
            if imgs_raw is not None:
                positive_images = tf.boolean_mask(
                    imgs_raw, tf.logical_and(class_division, not_self_mask))
                negative_images = tf.boolean_mask(
                    imgs_raw, tf.logical_not(class_division))
                anchor_img = imgs_raw[idx]
                hard_pos_img = positive_images[tf.squeeze(
                    hardest_positive_idx)]
                hard_neg_img = negative_images[tf.squeeze(
                    hardest_negative_idx)]
            return loss, hardest_positive, hardest_negative, anchor_img, hard_pos_img, hard_neg_img

        # Evaluate get_loss once per row of the batch.
        loss, hardest_positive, hardest_negative, anchor_imgs, hard_pos_imgs, hard_neg_imgs = \
            tf.map_fn(get_loss, tf.range(0, size), dtype=(tf.float32,tf.float32,tf.float32, tf.float32, tf.float32, tf.float32))

        # Summed (not averaged) over all anchors.
        self.loss = tf.reduce_sum(loss)
        hardest_positive = tf.reduce_sum(hardest_positive)
        hardest_negative = tf.reduce_sum(hardest_negative)
        self.add_scalar_summary(self.loss, "loss")
        self.add_scalar_summary(hardest_positive, "hardest_positive")
        self.add_scalar_summary(hardest_negative, "hardest_negative")
        self.n_features = n_features
        if imgs_raw is not None:
            self.summaries.append(
                tf.summary.image("anchor_image", anchor_imgs))
            self.summaries.append(
                tf.summary.image("hardest_postive_image", hard_pos_imgs))
            self.summaries.append(
                tf.summary.image("hardest_negative_image", hard_neg_imgs))
def discriminative_loss_single(prediction, correct_label, feature_dim, label_shape, delta_v, delta_d, param_var, param_dist, param_reg):
    """Instance-segmentation (discriminative) loss for one image, paper eq. (1).

    :param prediction: inference of network
    :param correct_label: instance label
    :param feature_dim: feature dimension of prediction
    :param label_shape: shape of label
    :param delta_v: cutoff variance distance
    :param delta_d: cutoff cluster distance
    :param param_var: weight for intra cluster variance
    :param param_dist: weight for inter cluster distances
    :param param_reg: weight regularization
    :return: ``(loss, l_var, l_dist, l_reg)``; the three terms are already
        multiplied by their weights
    """
    # Flatten the pixels into one row.
    num_pixels = label_shape[1] * label_shape[0]
    flat_label = tf.reshape(correct_label, [num_pixels])
    flat_pred = tf.reshape(prediction, [num_pixels, feature_dim])

    # Count the instances.
    unique_labels, unique_id, counts = tf.unique_with_counts(flat_label)
    counts = tf.cast(counts, tf.float32)
    num_instances = tf.size(unique_labels)

    # Mean pixel embedding of every instance.
    embedding_sum = tf.unsorted_segment_sum(flat_pred, unique_id,
                                            num_instances)
    mu = tf.div(embedding_sum, tf.reshape(counts, (-1, 1)))
    mu_per_pixel = tf.gather(mu, unique_id)

    # Variance term: pull pixels to within delta_v of their instance mean.
    pull = tf.norm(tf.subtract(mu_per_pixel, flat_pred), axis=1)
    pull = tf.subtract(pull, delta_v)
    pull = tf.clip_by_value(pull, 0., pull)
    pull = tf.square(pull)
    var_per_instance = tf.unsorted_segment_sum(pull, unique_id, num_instances)
    var_per_instance = tf.div(var_per_instance, counts)
    var_total = tf.reduce_sum(var_per_instance)
    var_term = tf.divide(var_total, tf.cast(num_instances, tf.float32))

    # Distance term: push instance means at least 2 * delta_d apart. All
    # ordered pairs of means are formed by tiling mu in two layouts.
    mu_interleaved_rep = tf.tile(mu, [num_instances, 1])
    mu_band_rep = tf.tile(mu, [1, num_instances])
    mu_band_rep = tf.reshape(mu_band_rep,
                             (num_instances * num_instances, feature_dim))
    pair_diff = tf.subtract(mu_band_rep, mu_interleaved_rep)

    # Drop the all-zero differences (each mean paired with itself).
    pair_l1 = tf.reduce_sum(tf.abs(pair_diff), axis=1)
    zero_vector = tf.zeros(1, dtype=tf.float32)
    nonzero_mask = tf.not_equal(pair_l1, zero_vector)
    kept_diff = tf.boolean_mask(pair_diff, nonzero_mask)

    push = tf.norm(kept_diff, axis=1)
    push = tf.subtract(2. * delta_d, push)
    push = tf.clip_by_value(push, 0., push)
    push = tf.square(push)
    dist_term = tf.reduce_mean(push)

    # Regularisation term from the original Discriminative Loss paper.
    reg_term = tf.reduce_mean(tf.norm(mu, axis=1))

    # Combine the terms with the weights used in the original paper.
    param_scale = 1.
    l_var = param_var * var_term
    l_dist = param_dist * dist_term
    l_reg = param_reg * reg_term
    loss = param_scale * (l_var + l_dist + l_reg)
    return loss, l_var, l_dist, l_reg
def trainModelWithCSV(run_name, layer_sizes, training_file_path, testing_file_path, initial_learning_rate, learning_rate_decay, num_epochs, batch_size, regularization_parameter, save_model=False):
    """Train a fully connected softmax classifier on CSV data.

    Args:
        run_name: name used for the log directory ``./logs/<run_name>/`` and
            the model directory ``./models/<run_name>/``.
        layer_sizes: layer widths; the first entry is the input width and the
            last one the number of classes.
        training_file_path: file read with ``readInputsAndOutputs`` for
            training.
        testing_file_path: file read with ``readInputsAndOutputs`` for
            testing.
        initial_learning_rate: learning rate of epoch 0.
        learning_rate_decay: the rate at epoch ``e`` is
            ``initial_learning_rate / (1 + learning_rate_decay * e)``.
        num_epochs: number of passes over the training data.
        batch_size: mini-batch size; the last batch may be smaller.
        regularization_parameter: weight of the norm penalty, scaled by
            ``1 / (2 * m)`` where ``m`` is the number of training rows.
        save_model: when true, save a checkpoint and ``layer_sizes.txt``.

    Returns:
        ``(training_cost, testing_cost)`` evaluated after training.
    """
    # Reset the tensorflow graph & let tensorflow release VRAM when done.
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Import the data from the specified paths.
    X_training, Y_training = readInputsAndOutputs(training_file_path)
    X_testing, Y_testing = readInputsAndOutputs(testing_file_path)

    # Ceiling division in pure integer arithmetic, so the result does not
    # depend on true-division semantics.
    m = X_training.shape[0]
    num_batches = (m + batch_size - 1) // batch_size

    # Dictionaries that hold intermediate values of the graph.
    a = dict()
    z = dict()
    weights = dict()
    biases = dict()
    a_normalized = dict()

    # Define the model according to the layer sizes specified.
    for index, layer_size in enumerate(layer_sizes):
        if index == 0:
            # Placeholder for the inputs, stored as the "layer 0" activation.
            with tf.variable_scope('input'):
                X = tf.placeholder(dtype=tf.float32,
                                   shape=[None, layer_size],
                                   name='X')
                a_normalized['a_normalized0'] = X
        else:
            with tf.variable_scope('layer' + str(index)):
                # Weights use Xavier initialization, biases start at 0.
                weights['w' + str(index)] = tf.get_variable(
                    name='weights' + str(index),
                    dtype=tf.float32,
                    shape=[layer_sizes[index - 1], layer_sizes[index]],
                    initializer=tf.contrib.layers.xavier_initializer())
                biases['b' + str(index)] = tf.get_variable(
                    name='biases' + str(index),
                    dtype=tf.float32,
                    shape=[layer_sizes[index]],
                    initializer=tf.zeros_initializer())
                # Linear activation.
                z['z' + str(index)] = tf.matmul(
                    a_normalized['a_normalized' + str(index - 1)],
                    weights['w' + str(index)]) + biases['b' + str(index)]
                if index != len(layer_sizes) - 1:
                    # Hidden layers: ReLU followed by batch normalization.
                    # NOTE(review): `training` is left at its default here --
                    # confirm inference-mode statistics are intended.
                    a['a' + str(index)] = tf.nn.relu(z['z' + str(index)])
                    a_normalized['a_normalized' +
                                 str(index)] = tf.layers.batch_normalization(
                                     inputs=a['a' + str(index)], axis=1)
                else:
                    # Last layer: softmax over the classes.
                    outputs = tf.nn.softmax(logits=z['z' + str(index)])

    # Sum of the Frobenius norms of all weight matrices.
    # NOTE(review): the norms are not squared although the variable name
    # says "squarred" -- confirm which penalty is intended.
    weights_squarred_sum = 0
    for index in range(1, len(layer_sizes)):
        weights_squarred_sum += tf.norm(weights["w" + str(index)],
                                        ord='fro',
                                        axis=[-2, -1])

    # Cost: softmax cross-entropy plus the scaled regularization term.
    with tf.variable_scope('cost'):
        Y = tf.placeholder(dtype=tf.float32,
                           shape=(None, layer_sizes[len(layer_sizes) - 1]),
                           name='Y')
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=z['z' + str(len(layer_sizes) - 1)], labels=Y) +
            regularization_parameter / (2 * m) * weights_squarred_sum)

    with tf.variable_scope('optimizer'):
        learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(cost)

    # Summary op used to log the cost across runs and epochs.
    with tf.variable_scope('logging'):
        tf.summary.scalar(name='cost', tensor=cost)
        summary = tf.summary.merge_all()

    if save_model:
        saver = tf.train.Saver()

    with tf.Session(config=config) as session:
        session.run(tf.global_variables_initializer())

        # Writers for the training and testing cost logs.
        training_writer = tf.summary.FileWriter(
            "./logs/" + run_name + "/training", session.graph)
        testing_writer = tf.summary.FileWriter(
            "./logs/" + run_name + "/testing", session.graph)
        try:
            for epoch in range(num_epochs):
                # The decayed rate is constant within an epoch.
                epoch_learning_rate = initial_learning_rate / (
                    1 + learning_rate_decay * epoch)
                for batch in range(num_batches):
                    start = batch * batch_size
                    stop = (batch + 1) * batch_size
                    # One Adam step per batch.
                    session.run(
                        [optimizer],
                        feed_dict={
                            X: X_training[start:stop, :],
                            Y: Y_training[start:stop, :],
                            learning_rate: epoch_learning_rate
                        })

                # Log training and testing costs every 5 epochs.
                if epoch % 5 == 0:
                    training_cost, training_summary = session.run(
                        [cost, summary],
                        feed_dict={
                            X: X_training,
                            Y: Y_training
                        })
                    testing_cost, testing_summary = session.run(
                        [cost, summary],
                        feed_dict={
                            X: X_testing,
                            Y: Y_testing
                        })
                    print("Epoch #" + str(epoch) + ": training cost= " +
                          str(training_cost) + " testing cost= " +
                          str(testing_cost))
                    training_writer.add_summary(training_summary, epoch)
                    testing_writer.add_summary(testing_summary, epoch)
        finally:
            # Flush and release the event files even if training fails.
            training_writer.close()
            testing_writer.close()

        # Display the percentage of accurate predictions.
        predictions = session.run(outputs, feed_dict={X: X_testing})
        expected = np.argmax(Y_testing, axis=1)
        predictions = np.argmax(predictions, axis=1)
        correct = int(np.sum(predictions == expected))
        print("Testing accuracy = " +
              str(correct / (len(predictions)) * 100) + "%")

        if save_model:
            saver.save(sess=session,
                       save_path="./models/" + run_name + "/" + run_name +
                       ".ckpt")
            # Store the architecture next to the checkpoint.
            with open("./models/" + run_name + "/" + "layer_sizes.txt",
                      "w+") as f:
                f.write(str(layer_sizes))

        return session.run(cost, feed_dict={
            X: X_training,
            Y: Y_training
        }), session.run(cost, feed_dict={
            X: X_testing,
            Y: Y_testing
        })
def cond(i, x, r, p):
    """Loop predicate: continue while under the iteration cap and not converged.

    Returns a boolean tensor that is true when ``i`` is below ``MAX_ITER``
    and the norm of ``r`` is still above ``CG_EPS``.  ``x`` and ``p`` are
    accepted but never inspected (presumably so the signature matches the
    loop variables of the surrounding loop -- confirm against the caller).
    """
    under_iteration_cap = i < MAX_ITER
    residual_above_tolerance = tf.norm(r) > CG_EPS
    return tf.logical_and(under_iteration_cap, residual_above_tolerance)
def predictUsingModelWithVectors(model_path, layer_sizes, X_input):
    """Rebuild the dense network, restore a checkpoint and predict classes.

    The graph is rebuilt from ``layer_sizes``: every layer after the input is
    ``matmul + bias``; hidden layers apply ReLU followed by batch
    normalization, and the last layer applies softmax.  Variable scopes and
    variable names are what the restored checkpoint is matched against.

    Args:
        model_path: checkpoint path handed to ``tf.train.Saver.restore``.
        layer_sizes: sequence of layer widths; the first entry is the input
            width, the last entry is the number of outputs.
        X_input: data fed to the input placeholder, whose shape is
            ``[None, layer_sizes[0]]``.

    Returns:
        ``np.argmax`` over axis 1 of the softmax outputs, i.e. one predicted
        class index per input row.

    Raises:
        ValueError: if ``layer_sizes`` has fewer than two entries, because
            no output layer can be built from it.
    """
    # Fail early with a clear message instead of the opaque
    # "No variables to save" ValueError that tf.train.Saver would raise.
    if len(layer_sizes) < 2:
        raise ValueError(
            'layer_sizes must contain an input size and at least one layer '
            'size, got: %s' % str(layer_sizes))

    # Reset the tensorflow graph & make tensorflow release VRAM after it's
    # done computing
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    last_index = len(layer_sizes) - 1

    # Placeholder for the X's; it is the input of the first layer
    with tf.variable_scope('input'):
        X = tf.placeholder(dtype=tf.float32,
                           shape=[None, layer_sizes[0]],
                           name='X')
    layer_input = X

    # Defines the model according to the layer sizes specified
    for index in range(1, len(layer_sizes)):
        with tf.variable_scope('layer' + str(index)):
            # The initializers are overwritten by the restore below; they
            # are only needed to declare the variables.
            layer_weights = tf.get_variable(
                name='weights' + str(index),
                dtype=tf.float32,
                shape=[layer_sizes[index - 1], layer_sizes[index]],
                initializer=tf.contrib.layers.xavier_initializer())
            layer_biases = tf.get_variable(
                name='biases' + str(index),
                dtype=tf.float32,
                shape=[layer_sizes[index]],
                initializer=tf.zeros_initializer())

            # Computes the linear activation
            linear = tf.matmul(layer_input, layer_weights) + layer_biases

            if index != last_index:
                # Non-linear activation followed by batch normalization for
                # all layers except the last one
                layer_input = tf.layers.batch_normalization(
                    inputs=tf.nn.relu(linear), axis=1)
            else:
                # Activation of the last layer. Can be changed according to
                # what you want to predict
                outputs = tf.nn.softmax(logits=linear)

    # NOTE: the weight-norm regularization term is only needed for the
    # training cost, so it is intentionally not rebuilt here.
    saver = tf.train.Saver()

    # Starts a session
    with tf.Session(config=config) as session:
        # Loads all the variables (weights, biases, batch-norm statistics)
        saver.restore(sess=session, save_path=model_path)

        # Compute predictions for the inputs
        predictions = session.run(outputs, feed_dict={X: X_input})
        return np.argmax(predictions, axis=1)
def cond(i, X, R_, R, V_):
    """Loop predicate: continue while under the iteration cap and not converged.

    Returns a boolean tensor that is true when ``i`` is below ``MAX_ITER``
    and the norm of ``R`` is still above ``CG_EPS``.  ``X``, ``R_`` and
    ``V_`` are accepted but never inspected (presumably so the signature
    matches the loop variables of the surrounding loop -- confirm against
    the caller).
    """
    under_iteration_cap = i < MAX_ITER
    residual_above_tolerance = tf.norm(R) > CG_EPS
    return tf.logical_and(under_iteration_cap, residual_above_tolerance)
def optimize(self, task, target):
    """Build the policy/value losses and their Adam training ops.

    Policy variables are the trainable variables whose name contains
    'pol_'; value variables are those whose name contains 'val_'.  The
    intermediate tensors (reward, baseline, advantage, regularizers,
    entropy, losses, gradients and train ops) are stored as attributes on
    ``self``; ``task.logpi`` is set as well.

    Args:
        task: object exposing ``pol_out_history``, ``val_out_history``,
            ``pol_r_history``, ``time_mask``, ``reward`` and
            ``action_array``.
        target: not used by the current implementation; kept so existing
            callers keep working.
    """
    # Split the trainable variables between the two networks by name
    pol_list = []
    val_list = []
    for v in tf.trainable_variables():
        if 'pol_' in v.name:
            # 'pol_' in the name marks a policy network variable
            pol_list.append(v)
        if 'val_' in v.name:
            # 'val_' in the name marks a value network variable
            val_list.append(v)

    # ------------------------------------------------------------------
    # Loss function dependent on the policy network parameters
    # Equation (2) from Song et al., 2017
    # ------------------------------------------------------------------
    # Output of the policy network, softmax taken over axis 1
    pol_out = tf.nn.softmax(tf.stack(task.pol_out_history), 1)
    # Total # of included time points
    NT = tf.stop_gradient(tf.reduce_sum(task.time_mask))

    # Calculate J (equation 22, but the baseline is also subtracted):
    # 1) Discard reward at time points that are to be excluded
    # This is the reward value given by the environment
    external_reward = tf.stop_gradient(
        tf.multiply(task.reward, task.time_mask))
    self.external_reward = external_reward
    time_mask = tf.stop_gradient(task.time_mask)

    # 2) Apply discount (Page 17, Song et al., 2017)
    baseline = tf.stop_gradient(
        tf.multiply(task.val_out_history, time_mask))
    self.baseline = baseline

    # Discounted future reward per Song et al., 2017:
    # Mcausal[i, j] = exp(-(j - i) / tau_steps) for j >= i, 0 elsewhere.
    # (Setting the upper triangle to 1 instead gives the no-discount
    # version.)
    num_time_steps = par['num_time_steps']
    tau_steps = par['discount_time_constant']//par['dt']
    Mcausal = np.zeros((num_time_steps, num_time_steps))
    for i in range(num_time_steps):
        # Original note: a discount of 100ms (10 steps) works
        Mcausal[i, i:] = np.exp(-np.arange(num_time_steps - i)/tau_steps)

    advantage = tf.matmul(Mcausal, external_reward) - baseline
    # Alternative (disabled) advantage based on Nick and Greg's code:
    #   Vt = baseline[:-1, :]       # all baseline values but the last one
    #   Vtnext = baseline[1:, :]    # all baseline values but the first one
    #   advantage = external_reward[:-1, :] + par['discount_coef']*Vtnext - Vt
    self.advantage = advantage
    action_array = tf.stop_gradient(task.action_array)

    # 3) Multiply reward by logpi to get the first term in J
    #    (i.e. reward portion); 1e-7 keeps the log away from log(0)
    logpi = tf.multiply(pol_out, action_array)
    logpi = tf.log(tf.reduce_sum(logpi, axis=1)+1e-7)
    task.logpi = logpi

    self.Jn = tf.reduce_sum(tf.multiply(advantage, task.logpi))/(NT - 1)
    # Average Jn values to get average of J
    self.J = tf.reduce_mean(self.Jn)

    # 7) Calculate average regularization term
    #    (mentioned as Omega in equation 2)
    with tf.variable_scope('pol_rnn_cell', reuse=True):
        pol_W_rnn = tf.get_variable('pol_W_rnn', dtype=tf.float64)
    # 2-norm of the recurrent weights; tf.norm without `axis` treats the
    # whole tensor as one vector
    self.weight_loss_pol = (par['weight_cost'] * tf.norm(pol_W_rnn, ord=2)
                            / par['batch_train_size'])
    # Mean of the squared firing rates
    self.spike_loss_pol = par['spike_cost'] * tf.reduce_mean(
        tf.reduce_mean(tf.square(tf.stack(task.pol_r_history)), axis=2))
    # The weight term is multiplied by 0, i.e. currently disabled
    self.Omega_pol = 0*self.weight_loss_pol + self.spike_loss_pol

    # Calculate entropy
    # Log of output of the policy network
    log_pol_out = tf.log(pol_out + 1e-7)
    # Multiply output and its log
    # (original note: size Ntime x 3 x Nbatch)
    entropy = tf.multiply(pol_out, log_pol_out)
    # Sum over all the outputs (original note: size Ntime x Nbatch)
    entropy = tf.reduce_sum(entropy, axis=1)
    # Apply time mask
    entropy = tf.multiply(entropy, time_mask)
    # Sum across time (original note: size Nbatch)
    entropy = tf.reduce_sum(entropy, axis=0)
    # Average across trials
    entropy = -1*tf.reduce_sum(entropy)/NT
    self.entropy = entropy
    self.ent_pol_out = pol_out
    self.ent_log_pol_out = log_pol_out
    self.NT = NT

    # 8) Calculate the loss function for policy network (Equation 2);
    #    the entropy is stored above but not part of the loss
    self.Loss_pol = -self.J + self.Omega_pol

    # ------------------------------------------------------------------
    # Loss function dependent on the value network parameters
    # Equation (4) from Song et al., 2017
    # ------------------------------------------------------------------
    # 1) Calculate En (Equation 5)
    # Apply the time mask to the output of the value network
    val_out = tf.multiply(tf.stack(task.val_out_history), time_mask)
    # Squared difference between the differentiable value output and the
    # (gradient-stopped) advantage
    self.En = tf.square(val_out - advantage)
    # Average En values to get E
    self.E = tf.reduce_mean(self.En)

    # 2) Calculate Omega for the value network (mentioned in equation 4)
    # Set it to zero for now
    self.Omega_val = 0

    # 3) Calculate loss for the value network (Equation 4)
    self.Loss_val = self.E + self.Omega_val

    # Optimizer for the value network (one tenth of the policy rate)
    val_opt = tf.train.AdamOptimizer(
        learning_rate=par['learning_rate']/10)
    # Optimizer for the policy network
    pol_opt = tf.train.AdamOptimizer(learning_rate=par['learning_rate'])

    self.pol_grads_and_vars = pol_opt.compute_gradients(
        self.Loss_pol, var_list=pol_list)
    self.val_grads_and_vars = val_opt.compute_gradients(
        self.Loss_val, var_list=val_list)

    # minimize() is compute_gradients() followed by apply_gradients();
    # applying the gradients computed above avoids building each gradient
    # subgraph a second time.
    self.pol_train_op = pol_opt.apply_gradients(self.pol_grads_and_vars)
    self.val_train_op = val_opt.apply_gradients(self.val_grads_and_vars)
def __init__(
    self,
    time_step_spec: types.TimeStep,
    action_spec: types.BoundedTensorSpec,
    reward_network: types.Network,
    optimizer: types.Optimizer,
    observation_and_action_constraint_splitter: Optional[
        types.Splitter] = None,
    accepts_per_arm_features: bool = False,
    constraints: Iterable[constr.BaseConstraint] = (),
    # Params for training.
    error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
    gradient_clipping: Optional[float] = None,
    # Params for debugging.
    debug_summaries: bool = False,
    summarize_grads_and_vars: bool = False,
    enable_summaries: bool = True,
    emit_policy_info: Tuple[Text, ...] = (),
    train_step_counter: Optional[tf.Variable] = None,
    laplacian_matrix: Optional[types.Float] = None,
    laplacian_smoothing_weight: float = 0.001,
    name: Optional[Text] = None):
  """Initializes a greedy reward-prediction bandit agent.

  Actions are sometimes related to one another (for example when they are
  ordinal integers).  If those relations can be described by a graph over
  the arms, the estimated reward function can be encouraged to be smooth
  over that graph, meaning the estimates `r_i` and `r_j` of two related
  actions `i` and `j` stay close.  Smoothness is measured with the graph's
  Laplacian matrix `L`, and with smoothing enabled the loss becomes:

  ```
  Loss_new := Loss + lambda r^T * L * r,
  ```

  with `r` the vector of estimated rewards for all actions, the second term
  the Laplacian smoothing regularizer, and `lambda` the weight controlling
  how strongly it is enforced.

  Further reading:
  "Bandits on graphs and structures", Michal Valko
  https://hal.inria.fr/tel-01359757/document

  Args:
    time_step_spec: `TimeStep` spec of the time steps the agent expects.
    action_spec: Nest of `BoundedTensorSpec` describing the actions.
    reward_network: `tf_agents.network.Network` used by the agent.  It is
      called as call(observation, step_type) and must return a reward
      prediction for every action.
    optimizer: Optimizer used for training.
    observation_and_action_constraint_splitter: Function that masks
      valid/invalid actions for each environment state.  Given a full
      observation it returns a tuple of 1) the part of the observation
      meant as input for the bandit agent and policy and 2) the boolean
      mask.  It must also accept a `TensorSpec` and then return
      `TensorSpec` objects for observation and mask.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    constraints: Iterable of constraint objects, instances of
      `tf_agents.bandits.agents.NeuralConstraint`.
    error_loss_fn: Function computing the error loss from labels,
      predictions and weights (any function from tf.losses works).
      Defaults to `tf.losses.mean_squared_error`.
    gradient_clipping: Float norm length used to clip gradients, or None
      to disable clipping.
    debug_summaries: Python bool, default False.  Gathers debug summaries
      when True.
    summarize_grads_and_vars: Python bool, default False.  Writes gradient
      and network variable summaries during training when True.
    enable_summaries: Python bool, default True.  When False no summaries
      (debug or otherwise) should be written.
    emit_policy_info: (tuple of strings) Side information to emit as part
      of the policy info.  Allowed values are listed in
      `policy_utilities.PolicyInfo`.
    train_step_counter: Optional `tf.Variable` incremented each time the
      train op runs.  Defaults to the `global_step`.
    laplacian_matrix: Float `Tensor` or numpy array of shape
      `[num_actions, num_actions]` holding the Laplacian matrix used to
      regularize the smoothness of the estimated expected reward function.
      Only meaningful when the actions have a graph structure.  With
      `None` no regularization is applied.
    laplacian_smoothing_weight: Float weight of the regularization term.
      Has no effect when `laplacian_matrix` is `None`.
    name: Python str name of this agent.  All variables in this module are
      placed under that name.  Defaults to the class name.

  Raises:
    ValueError: If the action spec contains more than one action or it is
      not a bounded scalar int32 spec with minimum 0.
    InvalidArgumentError: If the Laplacian provided is not None and not
      valid.
  """
  tf.Module.__init__(self, name=name)
  common.tf_agents_gauge.get_cell('TFABandit').set(True)

  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)
  self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
      action_spec)
  self._accepts_per_arm_features = accepts_per_arm_features
  self._constraints = constraints

  reward_network.create_variables()
  self._reward_network = reward_network
  self._optimizer = optimizer
  self._error_loss_fn = error_loss_fn
  self._gradient_clipping = gradient_clipping
  self._heteroscedastic = isinstance(
      reward_network, heteroscedastic_q_network.HeteroscedasticQNetwork)

  if laplacian_matrix is None:
    self._laplacian_matrix = None
  else:
    self._laplacian_matrix = tf.convert_to_tensor(
        laplacian_matrix, dtype=tf.float32)
    # Validity check: the sums along axis 1 and then along axis 0 must
    # both have a norm close to zero.
    for summed_axis in (1, 0):
      tf.debugging.assert_near(
          0.0,
          tf.norm(tf.reduce_sum(self._laplacian_matrix, summed_axis)))
  self._laplacian_smoothing_weight = laplacian_smoothing_weight

  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network,
      observation_and_action_constraint_splitter,
      constraints=constraints,
      accepts_per_arm_features=accepts_per_arm_features,
      emit_policy_info=emit_policy_info)

  # With per-arm features the training data spec is the policy's
  # trajectory spec with the arm observation dropped; otherwise None.
  training_data_spec = (
      bandit_spec_utils.drop_arm_observation(policy.trajectory_spec)
      if accepts_per_arm_features else None)

  super(GreedyRewardPredictionAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy=policy,
      train_sequence_length=None,
      training_data_spec=training_data_spec,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      enable_summaries=enable_summaries,
      train_step_counter=train_step_counter)
def custom_loss(y_true, y_pred):
    """Nested-norm loss on the prediction error.

    First takes the ``ord=1`` norm of ``y_true - y_pred`` over axes
    ``(1, 2)`` (a two-element ``axis`` makes ``tf.norm`` treat those axes as
    matrices), then takes the default norm of that result along axis 1.
    """
    error = y_true - y_pred
    inner_norm = tf.norm(error, ord=1, axis=(1, 2))
    return tf.norm(inner_norm, axis=1)
net = Net(x, weights, biases) # ------ Loss + Regularizer ------ with tf.name_scope("Loss"): # define loss with tf.name_scope("cross_entropy"): ce_loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=net, labels=y)) # define regularizer with tf.name_scope("regularizer"): with tf.name_scope("jacobians"): jacobians = tf_jacobian(net, x, batch_size) with tf.name_scope("regularizer_cal"): regularizer = tf.reduce_sum( tf.norm(tf.gather(jacobians, ind_i, axis=2) - tf.gather(jacobians, ind_j, axis=2), axis=1) * similarities) # get final loss by adding loss and regularizer customized_loss = tf.add(ce_loss, args.lambda_reg * regularizer) # define optimizer with tf.name_scope("optimizer"): optimizer = tf.train.AdamOptimizer( learning_rate=learning_rate).minimize(customized_loss) # define accuracy with tf.name_scope("accuracy"): prediction = tf.nn.softmax(net) correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1)) accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def forward(self, x, sequence_length=None, scope="RNN"):
    """Run the stacked bidirectional RNN and return a per-example feature.

    Builds ``self._num_layers`` forward and backward cells of the type named
    by ``self._rnn_cell`` ('lstm', 'gru' or 'rnn'), each wrapped in dropout,
    and runs them with either the dynamic or the static bidirectional RNN
    depending on ``self._use_dynamic``.

    When ``self._use_attention`` is set, a structured self-attention layer
    is applied on top of the RNN outputs and the following attributes are
    written: ``H``, ``W_s1``, ``W_s2``, ``H_T``, ``A``, ``M`` and the
    penalization term ``P``.

    Args:
        x: input tensor.  It is passed unchanged to the dynamic RNN and
            transposed with ``perm=[1, 0, 2]`` before unstacking for the
            static RNN, so it is presumably batch-major
            ``[batch, time, features]`` -- TODO confirm with the caller.
        sequence_length: optional sequence lengths forwarded to the RNN.
        scope: variable scope name, opened with ``reuse=tf.AUTO_REUSE``.

    Returns:
        With attention: ``M`` reshaped to ``[batch_size, -1]``.
        Otherwise: the final states (``.h`` for LSTM) of the last forward
        and last backward layer, concatenated along axis 1.

    Raises:
        ValueError: if ``self._rnn_cell`` is not 'lstm', 'gru' or 'rnn'.
    """
    rnn = tf.nn.rnn_cell
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        if not self._use_dynamic:
            # `static_rnn` input: a Python list with one tensor per time
            # step
            x = tf.unstack(tf.transpose(x, perm=[1, 0, 2]))

        cell_kind = self._rnn_cell.lower()
        if cell_kind == 'lstm':
            # tf.nn.rnn_cell.BasicLSTMCell does not support gradient
            # clipping, use tf.nn.rnn_cell.LSTMCell.
            rnn_cell = rnn.LSTMCell
        elif cell_kind == 'gru':
            rnn_cell = rnn.GRUCell
        elif cell_kind == 'rnn':
            rnn_cell = rnn.BasicRNNCell
        else:
            raise ValueError("Invalid rnn_cell type.")

        def build_stack():
            # One dropout-wrapped cell per layer, stacked in a MultiRNNCell
            cells = []
            for _ in range(self._num_layers):
                cell = rnn_cell(self._hidden_units)
                cell = rnn.DropoutWrapper(
                    cell,
                    output_keep_prob=self._dropout_keep_prob,
                    variational_recurrent=False,
                    dtype=tf.float32)
                cells.append(cell)
            return rnn.MultiRNNCell(cells=cells, state_is_tuple=True)

        with tf.variable_scope("fw"):
            fw_cells = build_stack()
        with tf.variable_scope("bw"):
            bw_cells = build_stack()

        if self._use_dynamic:
            # `output_states` is the pair (forward states, backward
            # states); each element holds one state per layer.
            outputs, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                fw_cells,
                bw_cells,
                x,
                sequence_length=sequence_length,
                dtype=tf.float32)
            # Depth-concatenate forward and backward outputs
            outputs = tf.concat(outputs, 2)
        else:
            # `static_rnn` returns (outputs, output_state_fw,
            # output_state_bw); outputs is a list of per-time-step tensors
            # with forward and backward outputs already depth-concatenated.
            outputs, state_fw, state_bw = tf.nn.static_bidirectional_rnn(
                fw_cells,
                bw_cells,
                x,
                dtype=tf.float32,
                sequence_length=sequence_length)
            # Back to batch-major
            outputs = tf.transpose(tf.stack(outputs), perm=[1, 0, 2])

        # Last layer's final state of each direction.  The dynamic branch
        # used to index `output_states[-1][0]` / `[-1][1]`, i.e. backward
        # layers 0 and 1, which fails with a single layer and disagrees
        # with the static branch.
        if cell_kind == 'lstm':
            out = tf.concat([state_fw[-1].h, state_bw[-1].h], 1)
        else:
            out = tf.concat([state_fw[-1], state_bw[-1]], 1)

        if self._use_attention:
            d_a = 300
            r = 2
            self.H = outputs
            # Take the batch size from the batch-major outputs: in the
            # static path `x` has been replaced by a time-major list, so
            # `tf.shape(x)[0]` would be the number of time steps.
            batch_size = tf.shape(self.H)[0]
            initializer = tf.contrib.layers.xavier_initializer()
            # TODO: Nan in summary histogram for:
            # RNN/attention/W_s2_0/grad/hist
            with tf.variable_scope("attention"):
                # shape(W_s1) = d_a * 2u
                self.W_s1 = tf.get_variable(
                    'W_s1',
                    shape=[d_a, 2 * self._hidden_units],
                    initializer=initializer)
                # shape(W_s2) = r * d_a
                self.W_s2 = tf.get_variable('W_s2',
                                            shape=[r, d_a],
                                            initializer=initializer)
                # shape (d_a, 2u) --> shape(batch_size, d_a, 2u)
                self.W_s1 = tf.tile(tf.expand_dims(self.W_s1, 0),
                                    [batch_size, 1, 1])
                self.W_s2 = tf.tile(tf.expand_dims(self.W_s2, 0),
                                    [batch_size, 1, 1])
                # attention matrix A = softmax(W_s2*tanh(W_s1*H^T))
                # shape(A) = batch_size * r * n
                self.H_T = tf.transpose(self.H, perm=[0, 2, 1], name="H_T")
                self.A = tf.nn.softmax(
                    tf.matmul(self.W_s2,
                              tf.tanh(tf.matmul(self.W_s1, self.H_T)),
                              name="A"))
                # sentences embedding matrix M = AH
                # shape(M) = (batch_size, r, 2u)
                self.M = tf.matmul(self.A, self.H, name="M")
                out = tf.reshape(self.M, [batch_size, -1])

            with tf.variable_scope("penalization"):
                # penalization term: squared Frobenius norm of AA^T - I,
                # i.e. P = |AA^T-I|_F^2
                A_T = tf.transpose(self.A, perm=[0, 2, 1], name="A_T")
                I = tf.eye(r, r, batch_shape=[batch_size], name="I")
                self.P = tf.square(tf.norm(tf.matmul(self.A, A_T) - I,
                                           axis=[-2, -1],
                                           ord='fro'),
                                   name="P")
    return out