def batch_norm(input_, dim, name, scale=True, train=True, epsilon=1e-8,
               decay=.1, axes=[0], bn_lag=DEFAULT_BN_LAG):
    """Batch normalization."""
    # create variables
    with tf.variable_scope(name):
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.),
                               trainable=False)
        if scale:
            gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.))
        beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.))
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # normalize
    res = (input_ - used_mean) / tf.sqrt(used_var + epsilon)
    # de-normalize
    if scale:
        res *= gamma
    res += beta

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(decay * (mean - cur_mean),
                                      "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
        res += 0. * new_mean * new_var * new_step

    return res
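# --- Illustration (not from the original source) ---
# The bn_lag branch above is an Adam-style debiased moving average: it blends
# the batch statistic with the stored running value, then divides out the
# startup bias. A minimal plain-Python sketch of the same arithmetic on scalars:
bn_lag, decay, running, step = 0.9, 0.1, 0.0, 0
for batch_mean in [1.0, 1.2, 0.8, 1.1]:
    used = bn_lag * batch_mean + (1. - bn_lag) * running  # lagged blend
    used /= (1. - bn_lag ** (step + 1))                   # bias correction
    running -= decay * (running - batch_mean)             # the assign_sub update
    step += 1
    print(step, used, running)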
def batch_norm_log_diff(input_, dim, name, train=True, epsilon=1e-8,
                        decay=.1, axes=[0], reuse=None, bn_lag=DEFAULT_BN_LAG):
    """Batch normalization with corresponding log determinant Jacobian."""
    if reuse is None:
        reuse = not train
    # create variables
    with tf.variable_scope(name) as scope:
        if reuse:
            scope.reuse_variables()
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.),
                               trainable=False)
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_var = stable_var(input_=input_, mean=used_mean, axes=axes)
            cur_var = used_var
            used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(decay * (mean - cur_mean),
                                      "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
        used_var += 0. * new_mean * new_var * new_step
    used_var += epsilon

    return used_mean, used_var
def central_step():
    # restore v1, slots
    op5 = tf.group(*[tf.assign(w, v) for w, v in zip(restored_vars, tmp_vars)])
    with tf.get_default_graph().control_dependencies([op5]):
        back = tf.group(*[tf.assign_sub(v, -self._lr_t * grad)
                          for grad, v in grads_and_vars])
        with tf.get_default_graph().control_dependencies([back]):
            return tf.gradients(self.gan.trainer.d_loss, d_vars) \
                + tf.gradients(self.gan.trainer.g_loss, g_vars)
def testInitRequiredAssignSub(self):
    with self.test_session():
        p = tf.Variable(tf.fill([1024, 1024], 1), tf.int32)
        a = tf.assign_sub(p, tf.fill([1024, 1024], 0))
        with self.assertRaisesOpError("use uninitialized"):
            a.op.run()
def _initAssignSubFetch(self, x, y, use_gpu=False):
    """Initialize a param to init, and compute param -= y."""
    with self.test_session(use_gpu=use_gpu):
        p = tf.Variable(x)
        sub = tf.assign_sub(p, y)
        p.initializer.run()
        new_value = sub.eval()
        return p.eval(), new_value
def exponential_moving_average(self, var, avg_var=None, decay=0.999,
                               ignore_nan=False):
    """Calculates the exponential moving average.

    TODO(): check if this implementation of moving average can now be replaced
    by TensorFlow's implementation.

    Adds a variable to keep track of the exponential moving average and adds an
    update operation to the bookkeeper. The name of the variable is
    '%s_average' % name prefixed with the current variable scope.

    Args:
      var: The variable for which a moving average should be computed.
      avg_var: The variable to set the average into; if None, create a
        zero-initialized one.
      decay: How much history to use in the moving average; higher means more
        history. Must be in [0, 1).
      ignore_nan: If the value is NaN or Inf, skip it.

    Returns:
      The averaged variable.

    Raises:
      ValueError: if decay is not in [0, 1).
    """
    with self._g.as_default():
        if decay < 0 or decay >= 1.0:
            raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
        if avg_var is None:
            avg_name = '%s_average' % _bare_var_name(var)
            with tf.control_dependencies(None):
                with tf.name_scope(avg_name + '/Initializer/'):
                    if isinstance(var, tf.Variable):
                        init_val = var.initialized_value()
                    elif var.get_shape().is_fully_defined():
                        init_val = tf.constant(0,
                                               shape=var.get_shape(),
                                               dtype=var.dtype.base_dtype)
                    else:
                        init_val = tf.constant(0, dtype=var.dtype.base_dtype)
                avg_var = tf.Variable(init_val, name=avg_name, trainable=False)

        num_updates = tf.cast(self.global_step, tf.float32)
        decay = tf.minimum(
            decay, tf.maximum(0.9, (1.0 + num_updates) / (10.0 + num_updates)))
        with tf.device(avg_var.device):
            if ignore_nan:
                var = tf.where(tf.is_finite(var), var, avg_var)
            if var.get_shape().is_fully_defined():
                avg_update = tf.assign_sub(avg_var,
                                           (1 - decay) * (avg_var - var))
            else:
                avg_update = tf.assign(avg_var,
                                       avg_var - (1 - decay) * (avg_var - var),
                                       validate_shape=False)
        self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
        return avg_update
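# --- Illustration (not from the original source) ---
# The effective decay above warms up with the global step and is capped by the
# requested decay, so early averages react quickly. Plain-Python sketch:
for t in [0, 10, 100, 1000, 10000]:
    eff = min(0.999, max(0.9, (1.0 + t) / (10.0 + t)))
    print(t, eff)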
def curl():
    grads = tf.gradients(self.gan.trainer.d_loss, d_vars) \
        + tf.gradients(self.gan.trainer.g_loss, g_vars)
    op3 = tf.group(*[tf.assign_sub(v, self._lr_t * grad)
                     for grad, v in zip(grads, all_vars)])
    with tf.get_default_graph().control_dependencies([op3]):
        def curlcombine(g1, g2):
            stepsize = self._lr_t
            return g1 - (g2 - g1) / stepsize
        new_grads = tf.gradients(self.gan.trainer.d_loss, d_vars) \
            + tf.gradients(self.gan.trainer.g_loss, g_vars)
        g3s = [curlcombine(g1, g2) for g1, g2 in zip(grads, new_grads)]
        return g3s
def _assign_sub(self, ref, updates, indices=None):
    if indices is not None:
        if isinstance(ref, tf.Variable):
            return tf.scatter_sub(ref, indices, updates,
                                  use_locking=self._use_locking)
        elif isinstance(ref, resource_variable_ops.ResourceVariable):
            with tf.control_dependencies([
                    resource_variable_ops.resource_scatter_add(
                        ref.handle, indices, -updates)]):
                return ref.value()
        else:
            raise TypeError("did not expect type %r" % type(ref))
    else:
        return tf.assign_sub(ref, updates, use_locking=self._use_locking)
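# --- Usage sketch (assumption, not from the original source) ---
# tf.scatter_sub, used in the sparse branch above, subtracts the update rows
# at the given indices; a minimal TF1 example:
import tensorflow as tf

ref = tf.Variable([[1., 1.], [2., 2.], [3., 3.]])
sub = tf.scatter_sub(ref, [0, 2], [[0.5, 0.5], [1., 1.]])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(sub))  # [[0.5 0.5] [2. 2.] [2. 2.]]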
def _apply_dense(self, grad, var):
    lr_t = tf.cast(self._lr_t, var.dtype.base_dtype)
    beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype)
    if var.dtype.base_dtype == tf.float16:
        eps = 1e-7  # float16 needs a larger epsilon
    else:
        eps = 1e-8
    m = self.get_slot(var, "m")
    m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
    g_t = grad / m_t
    var_update = tf.assign_sub(var, lr_t * g_t)
    return tf.group(*[var_update, m_t])
def sgd(cost, parameters=None, learning_rate=0.01):
    if parameters is None:
        parameters = tf.trainable_variables()
    grads = tf.gradients(cost, parameters)
    all_updates = []
    for grad, param in zip(grads, parameters):
        assigned = tf.assign_sub(param, learning_rate * grad)
        all_updates.append(assigned)
    update_op = tf.group(*all_updates)
    return update_op
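# --- Usage sketch (hypothetical; `w` and `cost` are made up) ---
# Driving the sgd() helper above in a TF1 session:
import tensorflow as tf

w = tf.Variable(3.0)
cost = tf.square(w - 1.0)
train_op = sgd(cost, parameters=[w], learning_rate=0.1)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(50):
        sess.run(train_op)
    print(sess.run(w))  # approaches 1.0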
def _resource_apply_dense(self, grad, var):
    grad_squared = tf.square(grad) + 1e-30
    grad_squared_mean = tf.reduce_mean(grad_squared)
    decay_rate = self._decay_rate
    update_scale = self._learning_rate
    if self._multiply_by_parameter_scale:
        update_scale *= self._parameter_scale(var)
    # HACK: Make things dependent on grad.
    # This confounds the XLA rewriter and keeps it from fusing computations
    # across different variables. This fusion is bad for HBM usage, since
    # it causes the gradients to persist in memory.
    decay_rate += grad_squared_mean * 1e-30
    update_scale += grad_squared_mean * 1e-30
    # END HACK
    mixing_rate = 1.0 - decay_rate
    shape = var.get_shape().as_list()
    updates = []
    if self._should_use_factored_second_moment_estimate(shape):
        grad_squared_row_mean = tf.reduce_mean(grad_squared, 1)
        grad_squared_col_mean = tf.reduce_mean(grad_squared, 0)
        vr = self.get_slot(var, "vr")
        new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
        vc = self.get_slot(var, "vc")
        new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
        vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking)
        vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking)
        updates = [vr_update, vc_update]
        long_term_mean = tf.reduce_mean(new_vr)
        r_factor = tf.rsqrt(new_vr / long_term_mean)
        c_factor = tf.rsqrt(new_vc)
        x = grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0)
    else:
        v = self.get_slot(var, "v")
        new_v = decay_rate * v + mixing_rate * grad_squared
        v_update = tf.assign(v, new_v, use_locking=self._use_locking)
        updates = [v_update]
        x = grad * tf.rsqrt(new_v)
    if self._clipping_threshold is not None:
        clipping_denom = tf.maximum(1.0,
                                    reduce_rms(x) / self._clipping_threshold)
        x /= clipping_denom
    subtrahend = update_scale * x
    if self._beta1:
        m = self.get_slot(var, "m")
        new_m = self._beta1 * m + (1.0 - self._beta1) * subtrahend
        updates.append(tf.assign(m, new_m, use_locking=self._use_locking))
        subtrahend = new_m
    var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking)
    updates = [var_update] + updates
    return tf.group(*updates)
def adam(cost, parameters=None, learning_rate=1e-3, beta1=0.9, beta2=0.999,
         epsilon=1e-8):
    if parameters is None:
        parameters = tf.trainable_variables()
    grads = tf.gradients(cost, parameters)
    all_updates = []
    zero_init = tf.constant_initializer(0.)
    with tf.variable_scope("adam"):
        t_prev = tf.get_variable("t", shape=(), initializer=zero_init)
        t = tf.assign_add(t_prev, 1)
        all_updates.append(t)
        for grad, param in zip(grads, parameters):
            with tf.variable_scope(param.name.replace(":", "_")):
                param_shape = tfu.get_shape_values(param)
                m_prev = tf.get_variable("m", shape=param_shape,
                                         initializer=zero_init)
                v_prev = tf.get_variable("v", shape=param_shape,
                                         initializer=zero_init)
                m = tf.assign(m_prev, m_prev * beta1 + grad * (1 - beta1))
                v = tf.assign(v_prev,
                              v_prev * beta2 + tf.square(grad) * (1 - beta2))
                numerator = learning_rate * m / (1 - tf.pow(beta1, t))
                denominator = tf.sqrt(v / (1 - tf.pow(beta2, t))) + epsilon
                assigned = tf.assign_sub(param, numerator / denominator)
                all_updates += [m, v, assigned]
    update_op = tf.group(*all_updates)
    return update_op
def exponential_moving_average(self, var, avg_var=None, decay=0.999,
                               ignore_nan=False):
    """Calculates the exponential moving average.

    Adds a variable to keep track of the exponential moving average and adds an
    update operation to the bookkeeper. The name of the variable is
    '%s_average' % name prefixed with the current variable scope.

    Args:
      var: The variable for which a moving average should be computed.
      avg_var: The variable to set the average into; if None, create a
        zero-initialized one.
      decay: How much history to use in the moving average; higher means more
        history. Must be in [0, 1).
      ignore_nan: If the value is NaN or Inf, skip it.

    Returns:
      The averaged variable.

    Raises:
      ValueError: if decay is not in [0, 1).
    """
    with self._g.as_default():
        if decay < 0 or decay >= 1.0:
            raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
        if not avg_var:
            shape = var.get_shape()
            avg_name = '%s_average' % _bare_var_name(var)
            avg_var = tf.Variable(
                tf.zeros_initializer(shape=shape, dtype=var.dtype),
                name=avg_name,
                trainable=False)

        num_updates = tf.cast(self.global_step, tf.float32)
        decay = tf.maximum(
            0.9, tf.minimum(decay, (1.0 + num_updates) / (10.0 + num_updates)))
        with tf.device(avg_var.device):
            if ignore_nan:
                var = tf.select(tf.is_finite(var), var, avg_var)
            avg_update = tf.assign_sub(avg_var, (1 - decay) * (avg_var - var))
        self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
        return avg_var
def _update_params(self, ema, g, v):
    """Create ops to update trainable parameters."""
    return tf.assign_sub(v, self._eta_max * ema['u'] * g)
def _apply_dense(self, grad: tf.Tensor, var: tf.Variable) -> tf.Operation:
    """Add ops to apply dense gradients to `var`.

    Args:
      grad: A gradient `Tensor`.
      var: A `Variable` object.

    Returns:
      An `Operation`.
    """
    alpha_t = tf.cast(self._alpha_t, var.dtype.base_dtype)
    lr_update_t = tf.cast(self._lr_update_t, var.dtype.base_dtype)
    lr_max_t = tf.cast(self._lr_max_t, var.dtype.base_dtype)
    lr_min_t = tf.cast(self._lr_min_t, var.dtype.base_dtype)
    m_coef_update_t = tf.cast(self._momentum_coef_update_t,
                              var.dtype.base_dtype)

    # get cached tensors
    old_grad = self.get_slot(var, "old_grad")
    momentum = self.get_slot(var, "momentum")

    # learnable stuff
    lr = self.get_slot(var, "lr")
    m_coef = self.get_slot(var, "m_coef")

    # generate random noise
    noise = alpha_t * tf.random_uniform(
        shape=tf.shape(var), minval=-1.0, maxval=+1.0)

    # compute aggregated gradient
    momentum_grad = momentum * m_coef + grad

    with tf.control_dependencies([momentum_grad]):
        if self._norm_type == 'max':
            # compute normalization constant
            g_max = tf.reduce_max(tf.abs(momentum_grad))
            denominator = _EPSILON + g_max
            g_update_normed = momentum_grad / denominator
        elif self._norm_type == 'std':
            std = tf.keras.backend.std(momentum_grad) + _EPSILON
            g_update_normed = momentum_grad / std
        else:
            g_update_normed = tf.nn.l2_normalize(momentum_grad)

        # compute update grad
        update_grad = lr * (g_update_normed + noise)
        var_update = tf.assign_sub(var, update_grad)
        update_m = tf.assign(momentum, momentum_grad)

    # compute gradient correlation
    g_normed = tf.nn.l2_normalize(grad)
    old_g_normed = tf.nn.l2_normalize(old_grad)
    lr_change = -tf.reduce_sum(g_normed * old_g_normed)

    # update learning rate
    new_lr = lr * (1 - lr_update_t * lr_change)
    new_lr = tf.clip_by_value(new_lr, lr_min_t, lr_max_t)

    # update momentum
    beta = 1 - m_coef_update_t
    new_m_coef = m_coef * beta + (1 - beta) * lr_change
    new_m_coef = tf.clip_by_value(new_m_coef, 0.0, 1.0)

    self._grad_correlation_t = lr_change

    with tf.control_dependencies([new_lr, new_m_coef]):
        lr_update = tf.assign(lr, new_lr)
        m_update = tf.assign(m_coef, new_m_coef)
        old_g_update = tf.assign(old_grad, grad)

    return tf.group(
        [update_m, var_update, lr_update, old_g_update, m_update])
def update_sub(x, decrement):
    return tf.assign_sub(x, decrement)
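# --- Usage sketch (assumption, not from the original source) ---
# tf.assign_sub returns the variable's value after the in-place subtraction:
import tensorflow as tf

v = tf.Variable(10.0)
dec = update_sub(v, 3.0)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(dec))  # 7.0
    print(sess.run(v))    # 7.0, updated in place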
            y: x_values_sm_b,
            modulation: np.zeros((batch_size, num_steps, 1)),
            state: get_zero_state()
        })
        duration = time.time() - start_time

        error = np.sum(np.square(out_v_test[-1][0] / c.lambda_max - x_values_sm_b))

        dw_grads.append(state_v[0][5])
        db_grads.append(state_v[0][6])
        r.append(rhythm)

        print "Epoch {} ({:.2f}s), train error {:.3f}".format(i, duration, error)

    r = np.asarray(r)
    dw_grads = np.asarray(dw_grads)
    db_grads = np.asarray(db_grads)

    dw_grads_m = np.mean(dw_grads, 0)
    # dw_grads_m = 2.0*dw_bl
    # db_grads = 2.0*dbias_bl

    sess.run(tf.assign_sub(net.cells[-1].params[0],
                           10.0 * dw_grads_m.reshape(input_size, output_size)))
    sess.run(tf.assign_sub(net.cells[-1].params[1],
                           10.0 * np.mean(db_grads).reshape(1)))
def apply_updates(self) -> tf.Operation:
    """Construct training op to update the registered variables based on their gradients."""
    tfutil.assert_tf_initialized()
    assert not self._updates_applied
    self._updates_applied = True
    devices = list(self._dev_grads.keys())
    total_grads = sum(len(grads) for grads in self._dev_grads.values())
    assert len(devices) >= 1 and total_grads >= 1
    ops = []

    with tfutil.absolute_name_scope(self.scope):
        # Cast gradients to FP32 and calculate partial sum within each device.
        dev_grads = OrderedDict()  # device => [(grad, var), ...]

        for dev_idx, dev in enumerate(devices):
            with tf.name_scope("ProcessGrads%d" % dev_idx), tf.device(dev):
                sums = []
                for gv in zip(*self._dev_grads[dev]):
                    assert all(v is gv[0][1] for g, v in gv)
                    g = [tf.cast(g, tf.float32) for g, v in gv]
                    g = g[0] if len(g) == 1 else tf.add_n(g)
                    sums.append((g, gv[0][1]))
                dev_grads[dev] = sums

        # Sum gradients across devices.
        if len(devices) > 1:
            with tf.name_scope("SumAcrossGPUs"), tf.device(None):
                for var_idx, grad_shape in enumerate(self._grad_shapes):
                    g = [dev_grads[dev][var_idx][0] for dev in devices]
                    if np.prod(grad_shape):  # nccl does not support zero-sized tensors
                        g = tf.contrib.nccl.all_sum(g)
                    for dev, gg in zip(devices, g):
                        dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

        # Apply updates separately on each device.
        for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
            with tf.name_scope("ApplyGrads%d" % dev_idx), tf.device(dev):
                # Scale gradients as needed.
                if self.use_loss_scaling or total_grads > 1:
                    with tf.name_scope("Scale"):
                        coef = tf.constant(np.float32(1.0 / total_grads),
                                           name="coef")
                        coef = self.undo_loss_scaling(coef)
                        grads = [(g * coef, v) for g, v in grads]

                # Check for overflows.
                with tf.name_scope("CheckOverflow"):
                    grad_ok = tf.reduce_all(tf.stack(
                        [tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

                # Update weights and adjust loss scaling.
                with tf.name_scope("UpdateWeights"):
                    # pylint: disable=cell-var-from-loop
                    opt = self._dev_opt[dev]
                    ls_var = self.get_loss_scaling_var(dev)

                    if not self.use_loss_scaling:
                        ops.append(tf.cond(grad_ok,
                                           lambda: opt.apply_gradients(grads),
                                           tf.no_op))
                    else:
                        ops.append(tf.cond(
                            grad_ok,
                            lambda: tf.group(
                                tf.assign_add(ls_var, self.loss_scaling_inc),
                                opt.apply_gradients(grads)),
                            lambda: tf.group(
                                tf.assign_sub(ls_var, self.loss_scaling_dec))))

                # Report statistics on the last device.
                if dev == devices[-1]:
                    with tf.name_scope("Statistics"):
                        ops.append(autosummary.autosummary(
                            self.id + "/learning_rate", self.learning_rate))
                        ops.append(autosummary.autosummary(
                            self.id + "/overflow_frequency",
                            tf.where(grad_ok, 0, 1)))
                        if self.use_loss_scaling:
                            ops.append(autosummary.autosummary(
                                self.id + "/loss_scaling_log2", ls_var))

        # Initialize variables and group everything into a single op.
        self.reset_optimizer_state()
        tfutil.init_uninitialized_vars(list(self._dev_ls_var.values()))
        return tf.group(*ops, name="TrainingOp")
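# --- Illustration (assumption, not from the original source) ---
# The UpdateWeights branch above implements dynamic loss scaling: nudge the
# log2 scale up while gradients stay finite, cut it sharply on overflow.
# Plain-Python sketch with made-up constants:
ls_log2, inc, dec = 16.0, 0.0005, 1.0
for grad_ok in [True, True, False, True]:
    ls_log2 = ls_log2 + inc if grad_ok else ls_log2 - dec
    print(grad_ok, ls_log2)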
def __init__(self, scope, trainer, global_step=None):
    with tf.variable_scope(scope):
        self.prob_of_random_goal = tf.Variable(FLAGS.initial_random_goal_prob,
                                               trainable=False,
                                               name="prob_of_random_goal",
                                               dtype=tf.float32)
        self.inputs = tf.placeholder(
            shape=[None, FLAGS.resized_height, FLAGS.resized_width,
                   FLAGS.agent_history_length],
            dtype=tf.float32, name="Inputs")

        self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.float32,
                                           name="Prev_Rewards")
        self.prev_rewards_onehot = tf.one_hot(
            tf.cast(self.prev_rewards, dtype=tf.int32), 2,
            dtype=tf.float32, name="Prev_Rewards_OneHot")
        self.prev_rewards = tf.expand_dims(self.prev_rewards, 1, name="rewards")
        # self.prev_rewards_onehot = tf.expand_dims(self.prev_rewards, 0)

        self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                           name="Prev_Actions")
        self.prev_actions_onehot = tf.one_hot(self.prev_actions,
                                              FLAGS.nb_actions,
                                              dtype=tf.float32,
                                              name="Prev_Actions_OneHot")
        self.prev_goal = tf.placeholder(shape=[None, FLAGS.hidden_dim],
                                        dtype=tf.float32, name="Prev_Goals")

        self.image_summaries = []

        if FLAGS.game not in flags.SUPPORTED_ENVS:
            self.conv0 = tf.contrib.layers.conv2d(
                self.inputs, 16, 8, 4, activation_fn=tf.nn.elu, scope="conv0")
            with tf.variable_scope('conv0'):
                tf.get_variable_scope().reuse_variables()
                weights = tf.get_variable('weights')
                grid = self.put_kernels_on_grid(weights)
                self.image_summaries.append(
                    tf.summary.image('kernels', grid, max_outputs=1))
            self.conv = tf.contrib.layers.conv2d(
                self.conv0, 32, 4, 2, activation_fn=tf.nn.elu, scope="conv1")
        else:
            self.conv = tf.contrib.layers.conv2d(
                self.inputs, 32, 5, 2, activation_fn=tf.nn.elu, scope="conv1")
            with tf.variable_scope('conv1'):
                tf.get_variable_scope().reuse_variables()
                weights = tf.get_variable('weights')
                grid = self.put_kernels_on_grid(weights)
                self.image_summaries.append(
                    tf.summary.image('kernels', grid, max_outputs=1))

        with tf.variable_scope('inputs'):
            tf.get_variable_scope().reuse_variables()
            self.image_summaries.append(
                tf.summary.image('input', self.inputs, max_outputs=100))

        self.conv_flat = tf.contrib.layers.flatten(self.conv)
        self.fc = tf.contrib.layers.fully_connected(self.conv_flat,
                                                    FLAGS.hidden_dim)
        self.fc = tf.contrib.layers.layer_norm(self.fc)
        self.f_percept = tf.nn.elu(self.fc, name="Zt")

        if FLAGS.game not in flags.SUPPORTED_ENVS:
            self.f_percept = tf.concat(
                [self.f_percept, self.prev_rewards], 1, name="Zt_r")
        else:
            self.f_percept = tf.concat(
                [self.f_percept, self.prev_rewards_onehot], 1, name="Zt_r")

        summary_f_percept_act = tf.contrib.layers.summarize_activation(
            self.f_percept)

        ############################################################################################################
        # Manager network

        if FLAGS.meta:
            self.f_Mspace = tf.concat(
                [self.f_percept, self.prev_goal], 1, name="Zt_r")
        else:
            self.f_Mspace = tf.identity(self.f_percept, name="Zt_r")

        self.f_Mspace = tf.contrib.layers.fully_connected(self.f_Mspace,
                                                          FLAGS.hidden_dim)
        self.f_percept = tf.concat(
            [self.f_percept, self.prev_actions_onehot], 1, name="Zt_r")

        self.f_Mspace = tf.contrib.layers.layer_norm(self.f_Mspace)
        self.f_Mspace = tf.nn.elu(self.f_Mspace, name="St")

        summary_f_Mspace_act = tf.contrib.layers.summarize_activation(
            self.f_Mspace)

        m_rnn_in = tf.expand_dims(self.f_Mspace, [0], name="Mrnn_in")
        step_size = tf.shape(self.inputs)[:1]

        m_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(FLAGS.hidden_dim)
        m_c_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                            np.float32)
        m_h_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                            np.float32)
        self.m_state_init = [m_c_init, m_h_init]
        m_c_in = tf.placeholder(tf.float32,
                                [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                                name="Mrnn_c_in")
        m_h_in = tf.placeholder(tf.float32,
                                [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                                name="Mrnn_h_in")
        self.m_state_in = (m_c_in, m_h_in)
        m_state_in = tf.contrib.rnn.LSTMStateTuple(m_c_in, m_h_in)

        m_lstm_outputs, m_lstm_state = self.fast_dlstm(
            m_rnn_in, m_state_in, m_lstm_cell, FLAGS.manager_horizon,
            FLAGS.hidden_dim * FLAGS.manager_horizon)

        m_lstm_c, m_lstm_h = m_lstm_state
        self.m_state_out = (m_lstm_c[-1, :1, :], m_lstm_h[-1, :1, :])
        self.goals = tf.reshape(m_lstm_outputs, [-1, FLAGS.hidden_dim])
        self.normalized_goals = tf.contrib.layers.fully_connected(
            self.goals, FLAGS.hidden_dim, activation_fn=tf.tanh, name="Gt")

        summary_goals = tf.contrib.layers.summarize_activation(
            self.normalized_goals)

        def randomize_goals(t):
            t = tf.cast(t, tf.int32)
            packed_tensors = tf.stack(
                [tf.random_normal([FLAGS.hidden_dim, ]),
                 self.normalized_goals[t, :]])

            to_update = tf.cond(
                tf.less(self.prob_of_random_goal,
                        tf.constant(FLAGS.final_random_goal_prob,
                                    dtype=tf.float32)),
                lambda: tf.cast(
                    tf.multinomial(
                        tf.log([[self.prob_of_random_goal,
                                 tf.subtract(tf.constant(1.0),
                                             self.prob_of_random_goal)]]),
                        1)[0][0], tf.int32),
                lambda: tf.constant(1, tf.int32))

            resulted_tensor = tf.gather(packed_tensors, to_update)
            return resulted_tensor

        self.randomized_goals = tf.map_fn(
            lambda t: randomize_goals(t),
            tf.to_float(tf.range(0, step_size[0])),
            name="random_gt")

        summary_random_goals = tf.contrib.layers.summarize_activation(
            self.randomized_goals)

        self.decrease_prob_of_random_goal = tf.assign_sub(
            self.prob_of_random_goal,
            tf.constant((FLAGS.initial_random_goal_prob -
                         FLAGS.final_random_goal_prob) / FLAGS.explore_steps))

        m_fc_value_w = tf.get_variable(
            "M_Value_W", shape=[FLAGS.hidden_dim, 1],
            initializer=normalized_columns_initializer(1.0))
        self.m_value = tf.matmul(m_rnn_out, m_fc_value_w, name="M_Value")

        summary_m_value_act = tf.contrib.layers.summarize_activation(
            self.m_value)

        ############################################################################################################
        # Worker network

        self.sum_prev_goals = tf.placeholder(shape=[None, FLAGS.hidden_dim],
                                             dtype=tf.float32,
                                             name="Prev_c_Goals_sum")

        w_rnn_in = tf.expand_dims(self.f_percept, [0], name="Wrnn_in")
        step_size = tf.shape(self.inputs)[:1]
        w_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
            FLAGS.goal_embedding_size * FLAGS.nb_actions)
        w_c_init = np.zeros((1, w_lstm_cell.state_size.c), np.float32)
        w_h_init = np.zeros((1, w_lstm_cell.state_size.h), np.float32)
        self.w_state_init = [w_c_init, w_h_init]
        w_c_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.c],
                                name="Wrnn_c_in")
        w_h_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.h],
                                name="Wrnn_h_in")
        self.w_state_in = (w_c_in, w_h_in)
        w_state_in = tf.contrib.rnn.LSTMStateTuple(w_c_in, w_h_in)

        w_lstm_outputs, w_lstm_state = tf.nn.dynamic_rnn(
            w_lstm_cell, w_rnn_in, initial_state=w_state_in,
            sequence_length=step_size, time_major=False)

        w_lstm_c, w_lstm_h = w_lstm_state
        self.w_state_out = (w_lstm_c[:1, :], w_lstm_h[:1, :])
        Ut = tf.reshape(w_lstm_outputs,
                        [step_size[0], FLAGS.nb_actions,
                         FLAGS.goal_embedding_size],
                        name="Ut")
        Ut_flat = tf.reshape(w_lstm_outputs,
                             [step_size[0],
                              FLAGS.nb_actions * FLAGS.goal_embedding_size],
                             name="Ut_flat")

        summary_wrnn_act = tf.contrib.layers.summarize_activation(Ut)

        goal_encoding = tf.contrib.layers.fully_connected(
            self.sum_prev_goals, FLAGS.goal_embedding_size,
            biases_initializer=None, scope="goal_emb")

        interm_rez = tf.squeeze(
            tf.matmul(Ut, tf.expand_dims(goal_encoding, 2)), 2)
        interm_rez = tf.contrib.layers.flatten(interm_rez)
        self.w_policy = tf.nn.softmax(interm_rez, name="W_Policy")

        summary_w_policy_act = tf.contrib.layers.summarize_activation(
            self.w_policy)

        w_fc_value_w = tf.get_variable(
            "W_Value_W",
            shape=[FLAGS.nb_actions * FLAGS.goal_embedding_size +
                   FLAGS.goal_embedding_size, 1],
            initializer=normalized_columns_initializer(1.0))

        self.w_value = tf.matmul(tf.concat([Ut_flat, goal_encoding], 1),
                                 w_fc_value_w, name="W_Value")

        summary_w_value_act = tf.contrib.layers.summarize_activation(
            self.w_value)

        if scope != 'global':
            self.w_extrinsic_return = tf.placeholder(shape=[None],
                                                     dtype=tf.float32)
            self.m_extrinsic_return = tf.placeholder(shape=[None],
                                                     dtype=tf.float32)
            self.w_intrinsic_return = tf.placeholder(shape=[None],
                                                     dtype=tf.float32)

            def gather_state_at_horiz(t):
                t = tf.cast(t, tf.int32)
                f_Mspace_c = tf.gather(
                    self.f_Mspace,
                    tf.minimum(
                        t + tf.constant(FLAGS.manager_horizon, dtype=tf.int32),
                        step_size[0] - 1))
                return f_Mspace_c

            self.f_Mspace_c = tf.cast(
                tf.map_fn(lambda t: gather_state_at_horiz(t),
                          tf.to_float(tf.range(0, step_size[0])),
                          name="state_at_horiz"),
                dtype=tf.float32)
            self.state_diff = self.f_Mspace_c - self.f_Mspace
            self.cos_sim_state_diff = self.cosine_distance(
                tf.stop_gradient(self.state_diff), self.normalized_goals,
                dim=1)

            self.m_advantages = self.m_extrinsic_return - tf.stop_gradient(
                tf.reshape(self.m_value, [-1]))
            self.goals_loss = -tf.reduce_sum(
                self.m_advantages * self.cos_sim_state_diff)
            self.m_value_loss = FLAGS.m_beta_v * tf.reduce_sum(
                tf.square(self.m_extrinsic_return -
                          tf.reshape(self.m_value, [-1])))

            self.actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                          name="Actions")
            self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions,
                                             dtype=tf.float32,
                                             name="Actions_Onehot")

            self.responsible_outputs = tf.reduce_sum(
                self.w_policy * self.actions_onehot, [1])

            self.intrinsic_return = FLAGS.alpha * self.w_intrinsic_return
            self.total_return = self.w_extrinsic_return + self.intrinsic_return
            self.w_advantages = self.total_return - tf.stop_gradient(
                tf.reshape(self.w_value, [-1]))

            # Loss functions
            self.w_value_loss = FLAGS.w_beta_v * tf.reduce_sum(
                tf.square(self.total_return - tf.reshape(self.w_value, [-1])))
            self.entropy = -tf.reduce_sum(
                self.w_policy * tf.log(self.w_policy + 1e-7))
            self.w_policy_loss = -tf.reduce_sum(
                tf.log(self.responsible_outputs + 1e-7) *
                self.w_advantages) - self.entropy * FLAGS.beta_e

            self.loss = (self.w_value_loss + self.w_policy_loss +
                         self.m_value_loss + self.goals_loss)

            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, FLAGS.gradient_clip_value)

            self.worker_summaries = [
                summary_f_percept_act, summary_f_Mspace_act, summary_goals,
                summary_random_goals, summary_m_value_act, summary_wrnn_act,
                summary_w_policy_act, summary_w_value_act
            ]
            for grad, weight in zip(grads, local_vars):
                self.worker_summaries.append(
                    tf.summary.histogram(weight.name + '_grad', grad))
                self.worker_summaries.append(
                    tf.summary.histogram(weight.name, weight))

            self.merged_summary = tf.summary.merge(self.worker_summaries)

            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
def train_eq_op(self, X, y, z, epochs=32, batch_size=1):
    """
    Train a model with (positive class) equality of opportunity debiasing.

    Inputs:
        X: np.ndarray [N, F] -- Instances over F features.
        y: np.ndarray [N, 1 of {0,1}] -- Target class.
        z: np.ndarray [N, 1 of {0,1}] -- Group membership.

    Returns nothing but updates self.classifier
    """
    #raise NotImplementedError('You need to implement this.')

    # SOLUTION

    # END OF SOLUTION

    # Model
    inp = Input(1)  # for giving y as input to the adversary
    next_layer = tf.keras.layers.Concatenate(axis=1)(
        [self.classifier.output, inp])
    out = Dense(1, activation='sigmoid')(next_layer)
    adversary = Model([self.classifier.input, inp], out)

    # The following part is the same as dem_parity; the only difference is
    # that y is now given as input to the adversary.

    # Defining tensor operations
    Y = tf.placeholder(tf.float32, shape=[None, 1])
    Z = tf.placeholder(tf.float32, shape=[None, 1])

    class_params = adversary.trainable_weights[:-2]
    adv_params = adversary.trainable_weights[-2:]
    outputs = [layer.output for layer in adversary.layers]

    l_p = K.mean(K.binary_crossentropy(Y, outputs[4], from_logits=False))
    loss_p = K.function([adversary.input, Y], l_p)

    l_a = K.mean(K.binary_crossentropy(Z, outputs[-1], from_logits=False))
    loss_a = K.function([
        adversary.input,
        tf.concat((outputs[4], adversary.layers[5].input), -1), Z
    ], l_a)

    grads_adv = tf.gradients(ys=l_a, xs=adv_params)
    grads_class = tf.gradients(ys=l_p, xs=class_params)
    grads_class_adv = tf.gradients(ys=l_a, xs=class_params)

    gradients_adv = K.function([
        adversary.input,
        tf.concat((outputs[4], adversary.layers[5].input), -1), Z
    ], grads_adv)
    gradients_class = K.function([adversary.input, Y], grads_class)
    gradients_class_adv = K.function([
        adversary.input,
        tf.concat((outputs[4], adversary.layers[5].input), -1), Z
    ], grads_class_adv)

    num = len(X) // batch_size
    #sess.run(tf.global_variables_initializer())

    for epoch in range(epochs):
        learning_rate = 1 / (epoch + 1)
        alpha = np.sqrt(epoch + 1)
        c = 0
        loss_class = 0
        loss_adv = 0
        outer = tqdm(total=num, desc='Train epochs', position=0)
        for b in range(0, len(X), batch_size):
            outer.update(1)
            c = c + 1
            # Notation is the same as in the dem parity trainer.
            l1 = loss_p([
                X[b:b + batch_size], y[b:b + batch_size], y[b:b + batch_size]
            ])
            l2 = loss_a([
                X[b:b + batch_size], z[b:b + batch_size],
                z[b:b + batch_size], z[b:b + batch_size]
            ])
            clas = gradients_class([
                X[b:b + batch_size], y[b:b + batch_size], y[b:b + batch_size]
            ])
            adv = gradients_adv([
                X[b:b + batch_size], y[b:b + batch_size],
                z[b:b + batch_size], z[b:b + batch_size]
            ])
            clasadv = gradients_class_adv([
                X[b:b + batch_size], y[b:b + batch_size],
                z[b:b + batch_size], z[b:b + batch_size]
            ])

            for i in range(len(adversary.trainable_weights)):
                if i > 7:
                    sess.run(
                        tf.assign_sub(adversary.trainable_weights[i],
                                      learning_rate * adv[i - 8]))
                else:
                    k = self.projection_weights(clas[i], clasadv[i])
                    grad = clas[i] - k - alpha * clasadv[i]
                    sess.run(
                        tf.assign_sub(adversary.trainable_weights[i],
                                      learning_rate * grad))
            loss_class += l1
            loss_adv += l2
            del l1, l2, clas, adv, clasadv, k, grad

        y_pred = (self.classifier.predict(X) > 0.5) * 1
        acc1 = (y_pred == y).mean()
        y_pred1 = (adversary.predict([X, y]) > 0.5) * 1
        acc2 = (y_pred1 == z).mean()

        print('Epoch: ', epoch + 1)
        print('Demographic Parity: ', evaluate_dem_parity(y_pred, y, z))
        print('Equality of Opportunity: ', evaluate_eq_op(y_pred, y, z))
        print('Classification Loss: ', loss_class / c)
        print('Adversarial Loss: ', loss_adv / c)
        print('Classifier Accuracy: ', acc1)
        print('Adversary Accuracy: ', acc2)
        del y_pred, y_pred1
def train_dem_parity(self, X, y, z, epochs=32, batch_size=1024):
    """
    Train a model with (positive class) demographic parity.

    Inputs:
        X: np.ndarray [N, F] -- Instances over F features.
        y: np.ndarray [N, 1] -- Target class.
        z: np.ndarray [N, 1] -- Group membership.

    Returns nothing but updates self.classifier
    """
    #raise NotImplementedError('You need to implement this.')

    # SOLUTION

    # END OF SOLUTION

    #K.clear_session()
    #sess = tf.Session()
    #K.set_session(sess)

    # getting the adversary model
    adversary = self._get_adversary_architecture()

    # Defining tensor operations
    Y = tf.placeholder(tf.float32, shape=[None, 1])  # placeholder for true labels
    Z = tf.placeholder(tf.float32, shape=[None, 1])  # placeholder for protected attribute

    class_params = adversary.trainable_weights[:-2]  # parameters of the classifier
    adv_params = adversary.trainable_weights[-2:]  # parameters of the adversary
    # getting the symbolic tensors of all layers
    outputs = [layer.output for layer in adversary.layers]

    # classifier loss
    l_p = K.mean(K.binary_crossentropy(Y, outputs[-2], from_logits=False))
    loss_p = K.function([adversary.input, Y], l_p)

    # adversary loss
    l_a = K.mean(K.binary_crossentropy(Z, outputs[-1], from_logits=False))
    loss_a = K.function([adversary.input, Z], l_a)

    grads_adv = tf.gradients(ys=l_a, xs=adv_params)  # adversary gradients
    grads_class = tf.gradients(ys=l_p, xs=class_params)  # classifier gradients
    # classifier gradients wrt adversary loss
    grads_class_adv = tf.gradients(ys=l_a, xs=class_params)

    gradients_adv = K.function([adversary.input, Z], grads_adv)
    gradients_class = K.function([adversary.input, Y], grads_class)
    gradients_class_adv = K.function([adversary.input, Z], grads_class_adv)

    num = len(X) // batch_size
    #sess.run(tf.global_variables_initializer())

    for epoch in range(epochs):
        outer = tqdm(total=num, desc='Train epochs', position=0)
        learning_rate = 1 / (epoch + 1)
        alpha = np.sqrt(epoch + 1)
        loss_class = 0
        loss_adv = 0
        c = 0
        for b in range(0, len(X), batch_size):
            outer.update(1)
            c = c + 1
            # classifier loss
            l1 = loss_p([X[b:b + batch_size], y[b:b + batch_size]])
            # adversary loss
            l2 = loss_a([X[b:b + batch_size], z[b:b + batch_size]])
            # classifier gradients
            clas = gradients_class([X[b:b + batch_size], y[b:b + batch_size]])
            # adversary gradients
            adv = gradients_adv([X[b:b + batch_size], z[b:b + batch_size]])
            # classifier gradient wrt adversary loss
            clasadv = gradients_class_adv([X[b:b + batch_size],
                                           z[b:b + batch_size]])

            for i in range(len(adversary.trainable_weights)):
                if i > 7:
                    # adversary weight update
                    sess.run(
                        tf.assign_sub(adversary.trainable_weights[i],
                                      learning_rate * adv[i - 8]))
                else:
                    k = self.projection_weights(clas[i], clasadv[i])
                    grad = clas[i] - k - alpha * clasadv[i]
                    # classifier weight update
                    sess.run(
                        tf.assign_sub(adversary.trainable_weights[i],
                                      learning_rate * grad))
            loss_class += l1
            loss_adv += l2
            del l1, l2, clas, adv, clasadv, k, grad

        y_pred = (self.classifier.predict(X) > 0.5) * 1
        acc1 = (y_pred == y).mean()
        y_pred1 = (adversary.predict(X) > 0.5) * 1
        acc2 = (y_pred1 == z).mean()

        print('Epoch: ', epoch + 1)
        print('Demographic Parity: ', evaluate_dem_parity(y_pred, y, z))
        print('Equality of Opportunity: ', evaluate_eq_op(y_pred, y, z))
        print('Classification Loss: ', loss_class / c)
        print('Adversarial Loss: ', loss_adv / c)
        print('Classification Accuracy: ', acc1)
        print('Adversary Accuracy: ', acc2)
        del y_pred, y_pred1
def _anneal_learning_rate(self):
    return tf.cond(
        self.learning_rate > 0.0,
        lambda: tf.assign_sub(self.learning_rate, self.delta_lr),
        lambda: tf.assign(self.learning_rate, 0.0))
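# --- Standalone sketch (assumption; names are made up) ---
# The same cond pattern as _anneal_learning_rate above: subtract a delta while
# the rate is positive, otherwise clamp it to zero.
import tensorflow as tf

lr = tf.Variable(0.03)
anneal = tf.cond(lr > 0.0,
                 lambda: tf.assign_sub(lr, 0.01),
                 lambda: tf.assign(lr, 0.0))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        print(sess.run(anneal))  # 0.02, 0.01, ~0.0, then clamped at 0.0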
def _apply_dense(self, grad, var):
    # SM3 upper bounds the gradient square sums:
    #
    # To illustrate:
    #
    # For a Tensor `T` of shape [M, N, K], let `G` be its gradient of shape
    # [M, N, K].
    #
    # SM3 keeps around three accumulators A1, A2, A3 of size M, N, K
    # respectively.
    #
    # Let `A` be the accumulator of shape [M, N, K]. `A` is not materialized
    # until it's needed at every step, and is approximated by A1, A2, A3.
    #
    # At every gradient update step the accumulators satisfy:
    #   A1_t[i] >= Sum_{s <= t} G_s[i, j, k]^2 for all j, k.
    #   A2_t[j] >= Sum_{s <= t} G_s[i, j, k]^2 for all i, k.
    #   A3_t[k] >= Sum_{s <= t} G_s[i, j, k]^2 for all i, j.
    #
    # The RHS is the gradient sum of squares.
    #
    # For every step we materialize the tensor `A` based on the accumulated
    # tensors A1, A2 and A3:
    #
    #   A = min(A1[i], A2[j], A3[k]) + G[i, j, k]^2
    #
    # The SM3 preconditioned gradient is
    #
    #   preconditioned G = A^{-0.5} * G
    #
    # We then update the individual accumulator factors as:
    #
    #   A1[i] = max_{j, k} A[i, j, k]
    #   A2[j] = max_{i, k} A[i, j, k]
    #   A3[k] = max_{i, j} A[i, j, k]
    #
    shape = np.array(var.get_shape())
    var_rank = len(shape)
    if var_rank > 1:
        accumulator_list = [
            self.get_slot(var, "accumulator_" + str(i))
            for i in range(var_rank)
        ]
        accumulator = self._compute_past_accumulator(accumulator_list, shape)
        accumulator += grad * grad
    else:
        accumulator_var = self.get_slot(var, "accumulator")
        accumulator = tf.assign_add(accumulator_var, grad * grad)

    accumulator_inv_sqrt = tf.where(tf.greater(accumulator, 0),
                                    tf.rsqrt(accumulator),
                                    tf.zeros_like(accumulator))
    scaled_g = (1.0 - self._momentum_tensor) * (grad * accumulator_inv_sqrt)
    accumulator_update_ops = []
    with tf.control_dependencies([scaled_g]):
        if var_rank > 1:
            # Updates individual accumulator factors as:
            #   A1[i] = max_{j, k} A[i, j, k]
            #   A2[j] = max_{i, k} A[i, j, k]
            #   A3[k] = max_{i, j} A[i, j, k]
            for i, accumulator_i in enumerate(accumulator_list):
                axes = list(range(i)) + list(range(i + 1, var_rank))
                new_accumulator_i = tf.reduce_max(accumulator, axis=axes)
                accumulator_update_ops.append(
                    tf.assign(accumulator_i, new_accumulator_i))

    with tf.control_dependencies(accumulator_update_ops):
        if self._momentum > 0:
            gbar = self.get_slot(var, "momentum")
            update = tf.assign_add(
                gbar, gbar * (self._momentum_tensor - 1.0) + scaled_g)
        else:
            update = scaled_g
        return tf.assign_sub(var, self._learning_rate_tensor * update)
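# --- Illustration (assumption, not from the original source) ---
# How the per-dimension accumulators can be recombined by broadcasting,
# mirroring the comment's A = min(A1[i], A2[j], A3[k]) + G^2 for a rank-3
# gradient. NumPy used for brevity:
import numpy as np

g = np.random.rand(2, 3, 4)
a1, a2, a3 = np.zeros(2), np.zeros(3), np.zeros(4)
cover = np.minimum(np.minimum(a1[:, None, None], a2[None, :, None]),
                   a3[None, None, :])
acc = cover + g * g           # materialized accumulator A
a1 = acc.max(axis=(1, 2))     # A1[i] = max_{j,k} A[i,j,k]
a2 = acc.max(axis=(0, 2))     # A2[j] = max_{i,k} A[i,j,k]
a3 = acc.max(axis=(0, 1))     # A3[k] = max_{i,j} A[i,j,k]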
def __init__(self, src_vocab_size, trg_vocab_size, buckets, size, num_layers,
             batch_size, mode, input_keep_prob, output_keep_prob,
             state_keep_prob, beam_search, beam_size,
             schedule_sampling='linear', sampling_decay_rate=0.99,
             sampling_global_step=150000, sampling_decay_steps=500,
             pretrain_vec=None, pretrain_trainable=False,
             length_penalty=None, length_penalty_factor=0.6):
    self.src_vocab_size = src_vocab_size
    self.trg_vocab_size = trg_vocab_size
    self.buckets = buckets
    # units of rnn cell
    self.size = size
    # dimension of words
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(0.5, trainable=False)
    self.mode = mode
    self.dummy_reply = ["what ?", "yeah .", "you are welcome ! ! ! !"]

    # learning rate decay
    self.learning_rate_decay = self.learning_rate.assign(
        self.learning_rate * 0.99)

    # input for Reinforcement part
    self.loop_or_not = tf.placeholder(tf.bool)
    self.reward = tf.placeholder(tf.float32, [None])
    batch_reward = tf.stop_gradient(self.reward)
    self.RL_index = [None for _ in self.buckets]

    # dropout
    self.input_keep_prob = input_keep_prob
    self.output_keep_prob = output_keep_prob
    self.state_keep_prob = state_keep_prob

    # beam search
    self.beam_search = beam_search
    self.beam_size = beam_size
    self.length_penalty = length_penalty
    self.length_penalty_factor = length_penalty_factor

    # if load pretrain word vector
    self.pretrain_vec = pretrain_vec
    self.pretrain_trainable = pretrain_trainable

    # schedule sampling
    self.sampling_probability_clip = None
    self.schedule_sampling = schedule_sampling
    if self.schedule_sampling == 'False':
        self.schedule_sampling = False
    self.init_sampling_probability = 1.0
    self.sampling_global_step = sampling_global_step
    self.sampling_decay_steps = sampling_decay_steps
    self.sampling_decay_rate = sampling_decay_rate
    if self.schedule_sampling == 'linear':
        self.decay_fixed = self.init_sampling_probability * (
            self.sampling_decay_steps / self.sampling_global_step)
        with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
            self.sampling_probability = tf.get_variable(
                name=self.schedule_sampling,
                initializer=tf.constant(self.init_sampling_probability),
                trainable=False)
        self.sampling_probability_decay = tf.assign_sub(
            self.sampling_probability, self.decay_fixed)
        self.sampling_probability_clip = tf.clip_by_value(
            self.sampling_probability, 0.0, 1.0)
        #self.sampling_probability = tf.maximum(self.sampling_probability,tf.constant(0.0))
    elif self.schedule_sampling == 'exp':
        with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
            self.sampling_probability = tf.get_variable(
                name=self.schedule_sampling,
                initializer=tf.constant(self.init_sampling_probability),
                trainable=False)
        #self.sampling_probability = tf.train.exponential_decay(
        self.sampling_probability_decay = tf.assign(
            self.sampling_probability,
            tf.train.natural_exp_decay(self.sampling_probability,
                                       self.sampling_global_step,
                                       self.sampling_decay_steps,
                                       self.sampling_decay_rate,
                                       staircase=True))
        self.sampling_probability_clip = tf.clip_by_value(
            self.sampling_probability, 0.0, 1.0)
    elif self.schedule_sampling == 'inverse_sigmoid':
        with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
            self.sampling_probability = tf.get_variable(
                name=self.schedule_sampling,
                initializer=tf.constant(self.init_sampling_probability),
                trainable=False)
        self.sampling_probability_decay = tf.assign(
            self.sampling_probability,
            #tf.train.cosine_decay(
            tf.train.linear_cosine_decay(
                self.sampling_probability,
                self.sampling_decay_steps,
                self.sampling_global_step,
            ))
        self.sampling_probability_clip = tf.clip_by_value(
            self.sampling_probability, 0.0, 1.0)
    elif not self.schedule_sampling:
        pass
    else:
        raise ValueError(
            "schedule_sampling must be one of the following: "
            "[linear|exp|inverse_sigmoid|False]")

    w_t = tf.get_variable('proj_w', [self.trg_vocab_size, self.size])
    w = tf.transpose(w_t)
    b = tf.get_variable('proj_b', [self.trg_vocab_size])
    output_projection = (w, b)

    def sample_loss(labels, inputs):
        labels = tf.reshape(labels, [-1, 1])
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(b, tf.float32)
        local_inputs = tf.cast(inputs, tf.float32)
        # num_classes: the full word-vector vocabulary size;
        # num_sampled: number of sampled classes used to compute the softmax.
        return tf.cast(tf.nn.sampled_softmax_loss(
            weights=local_w_t,
            biases=local_b,
            inputs=local_inputs,
            labels=labels,
            num_sampled=512,
            num_classes=self.trg_vocab_size),
            dtype=tf.float32)

    softmax_loss_function = sample_loss

    #FIXME add RL function
    def seq2seq_multi(encoder_inputs, decoder_inputs, mode, pretrain_vec=None):
        if pretrain_vec is not None:
            pad_num = self.src_vocab_size - pretrain_vec.shape[0]
            pretrain_vec = np.pad(pretrain_vec, [(0, pad_num), (0, 0)],
                                  mode='constant')
            tag_vec = pretrain_vec[:data_utils.SPECIAL_TAGS_COUNT]
            pretrain_vec = pretrain_vec[data_utils.SPECIAL_TAGS_COUNT:]
            special_tags = tf.get_variable(name="special_tags",
                                           initializer=tag_vec,
                                           trainable=True)
            embedding = tf.get_variable(name="embedding",
                                        initializer=pretrain_vec,
                                        trainable=self.pretrain_trainable)
            embedding = tf.concat([special_tags, embedding], 0)
        else:
            embedding = tf.get_variable("embedding",
                                        [self.src_vocab_size, self.size])

        loop_function_RL = None
        if mode == 'MLE':
            feed_previous = False
        elif mode == 'TEST':
            feed_previous = True
            # need loop_function
        elif mode == 'RL':
            feed_previous = True

            def loop_function_RL(prev, i):
                prev = tf.matmul(
                    prev, output_projection[0]) + output_projection[1]
                prev_index = tf.multinomial(tf.log(tf.nn.softmax(prev)), 1)
                if i == 1:
                    for index, RL in enumerate(self.RL_index):
                        if RL is None:
                            self.RL_index[index] = prev_index
                            self.index = index
                            break
                else:
                    self.RL_index[self.index] = tf.concat(
                        [self.RL_index[self.index], prev_index], axis=1)
                #self.RL_index: [(?,9),(?,14),(?,24),(?,49)]
                # RL_index holds the sampled index of each word.
                prev_index = tf.reshape(prev_index, [-1])
                #prev_index: (?,)
                # decide which to be the next time step input
                sample = tf.nn.embedding_lookup(embedding, prev_index)
                #sample: (?,256)
                from_decoder = tf.nn.embedding_lookup(
                    embedding, decoder_inputs[i])
                #from_decoder: (?,256)
                return tf.where(self.loop_or_not, sample, from_decoder)

            self.loop_function_RL = loop_function_RL

        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=self.src_vocab_size,
            num_decoder_symbols=self.trg_vocab_size,
            embedding_size=self.size,
            output_projection=output_projection,
            feed_previous=feed_previous,
            dtype=tf.float32,
            embedding=embedding,
            beam_search=self.beam_search,
            beam_size=self.beam_size,
            loop=loop_function_RL,
            schedule_sampling=self.schedule_sampling,
            sampling_probability=self.sampling_probability_clip,
            length_penalty=self.length_penalty,
            length_penalty_factor=self.length_penalty_factor)

    # inputs
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name='encoder{0}'.format(i)))
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name='decoder{0}'.format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name='weight{0}'.format(i)))
    targets = [
        self.decoder_inputs[i + 1]
        for i in range(len(self.decoder_inputs) - 1)
    ]

    def single_cell():
        return tf.contrib.rnn.GRUCell(self.size)
        #return tf.contrib.rnn.BasicLSTMCell(self.size)

    cell = single_cell()
    if self.num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(self.num_layers)])
    cell = rnn.DropoutWrapper(cell,
                              input_keep_prob=self.input_keep_prob,
                              output_keep_prob=self.output_keep_prob,
                              state_keep_prob=self.state_keep_prob)

    if self.mode == 'MLE':
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            self.buckets,
            lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
            softmax_loss_function=softmax_loss_function)
        for b in range(len(self.buckets)):
            self.outputs[b] = [
                tf.matmul(output, output_projection[0]) + output_projection[1]
                for output in self.outputs[b]
            ]
        self.update = []
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in range(len(self.buckets)):
            gradients = tf.gradients(self.losses[b], tf.trainable_variables())
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            self.update.append(
                optimizer.apply_gradients(
                    zip(clipped_gradients, tf.trainable_variables())))
    elif self.mode == 'TEST':
        self.buckets = [(10, 50), (15, 50), (25, 50), (50, 50)]
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            self.buckets,
            lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
            softmax_loss_function=softmax_loss_function)
        for b in range(len(self.buckets)):
            #print('self.outputs[b]: ',self.outputs[b])
            self.outputs[b] = [
                tf.matmul(output, output_projection[0]) + output_projection[1]
                for output in self.outputs[b]
            ]
            #print('self.outputs[b]: ',self.outputs[b])
    elif self.mode == 'RL':
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            self.buckets,
            lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
            softmax_loss_function=softmax_loss_function,
            per_example_loss=True)
        #print('self.buckets: ',len(self.buckets))
        for b in range(len(self.buckets)):
            self.outputs[b] = [
                tf.matmul(output, output_projection[0]) + output_projection[1]
                for output in self.outputs[b]
            ]
        #print('self.RL_index: ',self.RL_index)
        #print('self.outputs: ',len(self.outputs[0]),len(self.outputs[1]),len(self.outputs[2]),len(self.outputs[3]))
        #print('self.RL_index: ',len(self.RL_index))
        #print('self.outputs: ',len(self.outputs))
        for i, b in enumerate(self.outputs):
            prev_index = tf.multinomial(
                tf.log(tf.nn.softmax(b[self.buckets[i][1] - 1])), 1)
            # The line below fills in the final decoder output: RL_index only
            # grows when loop_function is called inside the decoder, but the
            # output of the last input is never fed back as prev into another
            # loop_function call, so it has to be completed from the last
            # element of self.outputs.
            self.RL_index[i] = tf.concat([self.RL_index[i], prev_index],
                                         axis=1)
            #print(i,len(b))
            #print('self.buckets: ',self.buckets)
            #print('self.buckets[i][1]: ',self.buckets[i][1])
            #print('self.buckets[i][1] - 1: ',self.buckets[i][1] - 1)
            #print('b[self.buckets[i][1] - 1]: ', b[self.buckets[i][1] - 1])
            #print('prev_index: ',prev_index)
            #print('self.RL_index[i]: ',self.RL_index[i])
            #print('----------------')
        #self.outputs: list of 4 buckets, each (?,6258)
        #print('self.RL_index: ',self.RL_index)
        self.update = []
        optimizer = tf.train.GradientDescentOptimizer(0.01)
        #optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in range(len(self.buckets)):
            scaled_loss = tf.multiply(self.losses[b], batch_reward)
            self.losses[b] = tf.reduce_mean(scaled_loss)
            gradients = tf.gradients(self.losses[b], tf.trainable_variables())
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            self.update.append(
                optimizer.apply_gradients(
                    zip(clipped_gradients, tf.trainable_variables())))

    # specify saver
    self.saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
class KalmanFilter:
    def __init__(self):
        pass

    # Const Params
    with tf.variable_scope("kf_constants"):
        F = tf.constant([[1, 0, 0.2, 0],
                         [0, 1, 0, 0.2],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]], dtype=tf.float32, name="kf_F")
        B = tf.constant([[1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]], dtype=tf.float32, name="kf_B")
        H = tf.constant([[1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 0],
                         [0, 0, 0, 1]], dtype=tf.float32, name="kf_H")
        Q = tf.constant([[0.001, 0, 0, 0],
                         [0, 0.001, 0, 0],
                         [0, 0, 0, 0],
                         [0, 0, 0, 0]], dtype=tf.float32, name="kf_Q")
        R = tf.constant([[0.1, 0, 0, 0],
                         [0, 0.1, 0, 0],
                         [0, 0, 0.1, 0],
                         [0, 0, 0, 0.1]], dtype=tf.float32, name="kf_R")

    # Inputs and Outputs
    with tf.variable_scope("kf_inputs_outputs"):
        # Last coordinates
        x0 = tf.placeholder(dtype=tf.float32, shape=(4, 1), name="kf_x0")
        # 4 dynamic parameters: coordinates and velocity
        P = tf.Variable([[0, 0, 0, 0],
                         [0, 0, 0, 0],
                         [0, 0, 0, 0],
                         [0, 0, 0, 0]], dtype=tf.float32, name="kf_P")

    # Predict
    with tf.variable_scope("kf_predict"):
        xhat = tf.Variable([[0], [0], [0], [0]], dtype=tf.float32,
                           name="kf_xhat")
        predict_xhat = tf.assign(xhat, tf.matmul(F, x0),
                                 name="kf_predict_xhat")
        predict_P = tf.assign(
            P, tf.matmul(F, tf.matmul(P, F, transpose_b=True)) + Q,
            name="kf_predict_P")

    # Correction
    with tf.variable_scope("kf_correction"):
        S = tf.matmul(H, tf.matmul(P, H, transpose_b=True)) + R
        K = tf.matmul(tf.matmul(P, H, transpose_b=True), tf.matrix_inverse(S))
        z = tf.matmul(H, x0, name="kf_z")
        y1 = z - tf.matmul(H, xhat)
        update_xhat = tf.assign_add(xhat, tf.matmul(K, y1),
                                    name="kf_update_xhat")
        delta_P = tf.matmul(K, tf.matmul(H, P))
        update_P = tf.assign_sub(P, delta_P, name="kf_update_P")

    init = tf.global_variables_initializer()
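# --- Hypothetical driver (not from the original source) ---
# The graph above is built at class scope, so its ops can be reached as class
# attributes; one predict/correct cycle per measurement might look like:
import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    sess.run(KalmanFilter.init)
    meas = np.array([[1.0], [2.0], [0.1], [0.2]], dtype=np.float32)
    sess.run([KalmanFilter.predict_xhat, KalmanFilter.predict_P],
             feed_dict={KalmanFilter.x0: meas})
    xhat, _ = sess.run([KalmanFilter.update_xhat, KalmanFilter.update_P],
                       feed_dict={KalmanFilter.x0: meas})
    print(xhat)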
def _apply_dense(self, grad: tf.Tensor, var: tf.Variable) -> tf.Operation:
    """Add ops to apply dense gradients to `var`.

    Args:
      grad: A gradient `Tensor`.
      var: A `Variable` object.

    Returns:
      An `Operation`.
    """
    alpha_t = tf.cast(self._alpha_t, var.dtype.base_dtype)
    lr_update_t = tf.cast(self._lr_update_t, var.dtype.base_dtype)
    lr_max_t = tf.cast(self._lr_max_t, var.dtype.base_dtype)
    lr_min_t = tf.cast(self._lr_min_t, var.dtype.base_dtype)

    steps = self._steps_t
    current_step = self._current_step
    global_step = self._global_step

    gk_old = self.get_slot(var, "gk_old")
    gk = self.get_slot(var, "gk")
    var_old = self.get_slot(var, "v_old")
    lr = self.get_slot(var, "lr")

    noise = tf.random_uniform(shape=tf.shape(var), minval=-1.0, maxval=+1.0)

    if self._norm == 'max':
        # compute normalization constant
        g_max = tf.reduce_max(tf.abs(grad))
        denominator = _EPSILON + g_max
        g_update_normed = grad / denominator
    else:
        g_update_normed = tf.nn.l2_normalize(grad)

    # compute update grad
    update_grad = lr * (g_update_normed + noise * alpha_t)
    var_update = tf.assign_sub(var, update_grad)

    beta = 0.9

    def update_grads():
        agg_grad = gk * beta + (1 - beta) * update_grad
        # agg_grad = gk + update_grad
        update_gk = tf.assign(gk, agg_grad)
        return tf.group([update_gk]), lr

    def reset_steps():
        agg_grad = gk * beta + (1 - beta) * update_grad

        # I did try it, however it was not stable :/
        # dx = var - var_old
        # dg = gk - gk_old
        # s1 = tf.reduce_sum(tf.square(dx))
        # s2 = tf.abs(tf.reduce_sum(dx * dg)) + _EPSILON
        # eta = s1 / s2

        # update learning rate
        g_normed = tf.nn.l2_normalize(agg_grad)
        old_g_normed = tf.nn.l2_normalize(gk_old)
        lr_change = -lr_update_t * tf.reduce_sum(g_normed * old_g_normed)
        eta = lr * (1 - lr_change)

        with tf.control_dependencies([eta]):
            update_gk_old = tf.assign(gk_old, agg_grad)
            with tf.control_dependencies([update_gk_old]):
                update_gk = tf.assign(gk, tf.zeros_like(gk))
            update_var_old = tf.assign(var_old, var)
            step_assign = tf.assign(current_step, 0)
            update_g = tf.group(
                [update_gk_old, update_var_old, update_gk, step_assign])
        return update_g, eta

    with tf.control_dependencies([var_update]):
        updates, new_lr = tf.cond(tf.greater_equal(current_step, steps),
                                  true_fn=reset_steps,
                                  false_fn=update_grads)
        with tf.control_dependencies([updates]):
            new_lr = tf.cond(tf.greater_equal(
                tf.to_float(global_step) / tf.to_float(steps), 2),
                true_fn=lambda: new_lr,
                false_fn=lambda: lr)
            global_step_update = tf.assign_add(global_step, 1)
            step_update = tf.assign_add(current_step, 1)
            new_lr = tf.clip_by_value(new_lr, lr_min_t, lr_max_t)
            lr_update = tf.assign(lr, new_lr)
            update = tf.group([lr_update, step_update, global_step_update])

    return update
def testAssignUpdateNoShape(self):
    var = state_ops.variable_op([1, 2], tf.float32, set_shape=False)
    added = tf.assign_add(var, self._NewShapelessTensor())
    self.assertEqual(tensor_shape.unknown_shape(), added.get_shape())
    subbed = tf.assign_sub(var, self._NewShapelessTensor())
    self.assertEqual(tensor_shape.unknown_shape(), subbed.get_shape())
def sgd_update(grad, var, lr):
    delta = lr * grad
    return tf.assign_sub(var, delta)
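# --- Usage sketch (hypothetical; `w` and `loss` are made up) ---
import tensorflow as tf

w = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(w))
(grad,) = tf.gradients(loss, [w])
train_op = sgd_update(grad, w, lr=0.1)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(train_op))  # [0.8 1.6]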
def _eval_mean_update():
    difference = (1 - eval_mean_ema_decay) * (eval_mean - training_mean)
    return tf.assign_sub(eval_mean, difference)
def apply_updates(self):
    assert not self._updates_applied
    self._updates_applied = True
    devices = list(self._dev_grads.keys())
    total_grads = sum(len(grads) for grads in self._dev_grads.values())
    assert len(devices) >= 1 and total_grads >= 1
    ops = []

    with absolute_name_scope(self.scope):
        # Cast gradients to FP32 and calculate partial sum within each device.
        dev_grads = OrderedDict()  # device => [(grad, var), ...]
        for dev_idx, dev in enumerate(devices):
            with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
                sums = []
                for gv in zip(*self._dev_grads[dev]):
                    assert all(v is gv[0][1] for g, v in gv)
                    g = [tf.cast(g, tf.float32) for g, v in gv]
                    g = g[0] if len(g) == 1 else tf.add_n(g)
                    sums.append((g, gv[0][1]))
                dev_grads[dev] = sums

        # Sum gradients across devices.
        if len(devices) > 1:
            with tf.name_scope('SumAcrossGPUs'), tf.device(None):
                for var_idx, grad_shape in enumerate(self._grad_shapes):
                    g = [dev_grads[dev][var_idx][0] for dev in devices]
                    if np.prod(grad_shape):  # nccl does not support zero-sized tensors
                        g = tf.contrib.nccl.all_sum(g)
                    for dev, gg in zip(devices, g):
                        dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

        # Apply updates separately on each device.
        for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
            with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):
                # Scale gradients as needed.
                if self.use_loss_scaling or total_grads > 1:
                    with tf.name_scope('Scale'):
                        coef = tf.constant(np.float32(1.0 / total_grads),
                                           name='coef')
                        coef = self.undo_loss_scaling(coef)
                        grads = [(g * coef, v) for g, v in grads]

                # Check for overflows.
                with tf.name_scope('CheckOverflow'):
                    grad_ok = tf.reduce_all(tf.stack(
                        [tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

                # Update weights and adjust loss scaling.
                with tf.name_scope('UpdateWeights'):
                    opt = self._dev_opt[dev]
                    ls_var = self.get_loss_scaling_var(dev)
                    if not self.use_loss_scaling:
                        ops.append(tf.cond(grad_ok,
                                           lambda: opt.apply_gradients(grads),
                                           tf.no_op))
                    else:
                        ops.append(tf.cond(
                            grad_ok,
                            lambda: tf.group(
                                tf.assign_add(ls_var, self.loss_scaling_inc),
                                opt.apply_gradients(grads)),
                            lambda: tf.group(
                                tf.assign_sub(ls_var, self.loss_scaling_dec))))

                # Report statistics on the last device.
                if dev == devices[-1]:
                    with tf.name_scope('Statistics'):
                        ops.append(autosummary(self.id + '/learning_rate',
                                               self.learning_rate))
                        ops.append(autosummary(self.id + '/overflow_frequency',
                                               tf.where(grad_ok, 0, 1)))
                        if self.use_loss_scaling:
                            ops.append(autosummary(
                                self.id + '/loss_scaling_log2', ls_var))

        # Initialize variables and group everything into a single op.
        self.reset_optimizer_state()
        init_uninited_vars(list(self._dev_ls_var.values()))
        return tf.group(*ops, name='TrainingOp')
def resnet_model_fn(features, labels, mode, params): """Our model_fn for ResNet to be used with our Estimator.""" tf.summary.image('images', features, max_outputs=6) # build model net = resnet.ResNet(features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) logits = net.build_model() predictions = { 'classes': tf.argmax(logits, axis=1), 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') } if mode == tf.estimator.ModeKeys.PREDICT: return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Calculate loss, which includes softmax cross entropy and L2 regularization. # a. get loss coeficiente pos_mask = tf.reduce_sum( tf.cast( tf.greater_equal( labels, tf.fill(tf.shape(labels), FLAGS.mask_thres)), tf.float32), 0) pos_curr_count = tf.cast(tf.greater( pos_mask, 0), tf.float32) neg_curr_count = tf.cast(tf.less_equal(pos_mask, 0), tf.float32) pos_count = tf.Variable(tf.zeros(shape=[FLAGS.class_num,]), trainable=False) neg_count = tf.Variable(tf.zeros(shape=[FLAGS.class_num,]), trainable=False) neg_select = tf.cast( tf.less_equal( tf.random_uniform( shape=[FLAGS.class_num,], minval=0, maxval=1, seed = FLAGS.random_seed), FLAGS.neg_select), tf.float32) tf.summary.histogram('pos_curr_count', pos_curr_count) tf.summary.histogram('neg_curr_count', neg_curr_count) tf.summary.histogram('neg_select', neg_select) with tf.control_dependencies([pos_curr_count, neg_curr_count, neg_select]): pos_count = tf.assign_sub( tf.assign_add(pos_count, pos_curr_count), tf.multiply(pos_count, neg_curr_count)) neg_count = tf.assign_sub( tf.assign_add(neg_count, tf.multiply(neg_curr_count, neg_select)), tf.multiply(neg_count, pos_curr_count)) tf.summary.histogram('pos_count', pos_count) tf.summary.histogram('neg_count', neg_count) pos_loss_coef = -1 * (tf.log((0.01 + pos_count)/10)/tf.log(10.0)) pos_loss_coef = tf.where( tf.greater(pos_loss_coef, tf.fill(tf.shape(pos_loss_coef), 0.01)), pos_loss_coef, tf.fill(tf.shape(pos_loss_coef), 0.01)) pos_loss_coef = tf.multiply(pos_loss_coef, pos_curr_count) tf.summary.histogram('pos_loss_coef', pos_loss_coef) neg_loss_coef = -1 * (tf.log((8 + neg_count)/10)/tf.log(10.0)) neg_loss_coef = tf.where( tf.greater(neg_loss_coef, tf.fill(tf.shape(neg_loss_coef), 0.01)), neg_loss_coef, tf.fill(tf.shape(neg_loss_coef), 0.001)) neg_loss_coef = tf.multiply(neg_loss_coef, tf.multiply(neg_curr_count, neg_select)) tf.summary.histogram('neg_loss_coef', neg_loss_coef) loss_coef = tf.add(pos_loss_coef, neg_loss_coef) tf.summary.histogram('loss_coef', loss_coef) # b. get non-negative mask non_neg_mask = tf.fill(tf.shape(labels), -1.0, name='non_neg') non_neg_mask = tf.cast(tf.not_equal(labels, non_neg_mask), tf.float32) tf.summary.histogram('non_neg', non_neg_mask) # cal loss cross_entropy = tf.nn.weighted_cross_entropy_with_logits( logits=logits, targets=labels, pos_weight=12, name='sigmod_cross_entropy') tf.summary.histogram('sigmod_ce', cross_entropy) cross_entropy_cost = tf.reduce_sum(tf.reduce_mean(cross_entropy * non_neg_mask, axis=0) * loss_coef) # Create a tensor named cross_entropy for logging purposes. tf.identity(cross_entropy_cost, name='cross_entropy') tf.summary.scalar('cross_entropy', cross_entropy_cost) # Add weight decay to the loss. We exclude the batch norm variables because # doing so leads to a small improvement in accuracy. 
loss = cross_entropy_cost + FLAGS.weight_decay * tf.add_n( [tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name]) if mode == tf.estimator.ModeKeys.TRAIN: # Scale the learning rate linearly with the batch size. When the batch size # is 256, the learning rate should be 0.1. lr_warmup = FLAGS.lr_warmup warmup_step = FLAGS.warmup warmup_decay_step = FLAGS.lr_warmup_decay_step warmup_decay_factor = FLAGS.lr_warmup_decay_factor global_step = tf.train.get_or_create_global_step() boundaries = [ int(FLAGS.lr_decay_step * epoch) for epoch in [1, 2, 3, 4]] values = [ FLAGS.lr * decay for decay in [1, 0.1, 0.01, 1e-3, 1e-4]] learning_rate = tf.train.piecewise_constant( tf.cast(global_step, tf.int32), boundaries, values) # Linear Scaling Rule and Gradual Warmup lr = tf.cond( global_step < warmup_step, lambda: tf.train.exponential_decay( lr_warmup, global_step, warmup_decay_step, warmup_decay_factor, staircase=True ), lambda: learning_rate ) # Create a tensor named learning_rate for logging purposes. tf.identity(lr, name='learning_rate') tf.summary.scalar('learning_rate', lr) optimizer = tf.train.MomentumOptimizer( learning_rate=lr, momentum=FLAGS.opt_momentum) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step) else: train_op = None # Build evaluate metrics accuracy = tf.metrics.accuracy( tf.argmax(labels, axis=1), predictions['classes']) metrics = {'accuracy': accuracy} tf.identity(accuracy[1], name='train_accuracy') tf.summary.scalar('train_accuracy', accuracy[1]) return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions, loss=loss, train_op=train_op, eval_metric_ops=metrics)
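# Hedged sketch of the learning-rate schedule wired up above: an exponential
# schedule during warmup (growing if the factor is > 1), then a
# piecewise-constant decay. Boundary and value numbers here are made up, not
# the FLAGS used in resnet_model_fn.
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
main_lr = tf.train.piecewise_constant(
    tf.cast(global_step, tf.int32),
    boundaries=[30000, 60000, 90000],
    values=[0.1, 0.01, 0.001, 1e-4])
lr = tf.cond(global_step < 5000,
             lambda: tf.train.exponential_decay(
                 0.001, global_step, 500, 1.3, staircase=True),
             lambda: main_lr)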
def __init__(self, scope, trainer, global_step=None): with tf.variable_scope(scope): self.prob_of_random_goal = tf.Variable( FLAGS.initial_random_goal_prob, trainable=False, name="prob_of_random_goal", dtype=tf.float32) self.inputs = tf.placeholder(shape=[ None, FLAGS.resized_height, FLAGS.resized_width, FLAGS.agent_history_length ], dtype=tf.float32, name="Inputs") self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.float32, name="Prev_Rewards") self.prev_rewards_onehot = tf.one_hot(tf.cast(self.prev_rewards, dtype=tf.int32), 2, dtype=tf.float32, name="Prev_Rewards_OneHot") self.prev_rewards = tf.expand_dims(self.prev_rewards, 1, name="rewards") # self.prev_rewards_onehot = tf.expand_dims(self.prev_rewards, 0) self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Prev_Actions") self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions, dtype=tf.float32, name="Prev_Actions_OneHot") self.prev_goal = tf.placeholder(shape=[None, FLAGS.hidden_dim], dtype=tf.float32, name="Prev_Goals") self.image_summaries = [] if FLAGS.game not in flags.SUPPORTED_ENVS: self.conv0 = tf.contrib.layers.conv2d(self.inputs, 16, 8, 4, activation_fn=tf.nn.elu, scope="conv0") with tf.variable_scope('conv0'): tf.get_variable_scope().reuse_variables() weights = tf.get_variable('weights') grid = self.put_kernels_on_grid(weights) self.image_summaries.append( tf.summary.image('kernels', grid, max_outputs=1)) self.conv = tf.contrib.layers.conv2d(self.conv0, 32, 4, 2, activation_fn=tf.nn.elu, scope="conv1") else: self.conv = tf.contrib.layers.conv2d(self.inputs, 32, 5, 2, activation_fn=tf.nn.elu, scope="conv1") with tf.variable_scope('conv1'): tf.get_variable_scope().reuse_variables() weights = tf.get_variable('weights') grid = self.put_kernels_on_grid(weights) self.image_summaries.append( tf.summary.image('kernels', grid, max_outputs=1)) with tf.variable_scope('inputs'): tf.get_variable_scope().reuse_variables() self.image_summaries.append( tf.summary.image('input', self.inputs, max_outputs=100)) self.conv_flat = tf.contrib.layers.flatten(self.conv) self.fc = tf.contrib.layers.fully_connected( self.conv_flat, FLAGS.hidden_dim) self.fc = tf.contrib.layers.layer_norm(self.fc) self.f_percept = tf.nn.elu(self.fc, name="Zt") if FLAGS.game not in flags.SUPPORTED_ENVS: self.f_percept = tf.concat([self.f_percept, self.prev_rewards], 1, name="Zt_r") else: self.f_percept = tf.concat( [self.f_percept, self.prev_rewards_onehot], 1, name="Zt_r") summary_f_percept_act = tf.contrib.layers.summarize_activation( self.f_percept) ############################################################################################################ # Manager network if FLAGS.meta: self.f_Mspace = tf.concat([self.f_percept, self.prev_goal], 1, name="Zt_r") else: self.f_Mspace = tf.identity(self.f_percept, name="Zt_r") self.f_Mspace = tf.contrib.layers.fully_connected( self.f_Mspace, FLAGS.hidden_dim) self.f_percept = tf.concat( [self.f_percept, self.prev_actions_onehot], 1, name="Zt_r") self.f_Mspace = tf.contrib.layers.layer_norm(self.f_Mspace) self.f_Mspace = tf.nn.elu(self.f_Mspace, name="St") summary_f_Mspace_act = tf.contrib.layers.summarize_activation( self.f_Mspace) m_rnn_in = tf.expand_dims(self.f_Mspace, [0], name="Mrnn_in") step_size = tf.shape(self.inputs)[:1] m_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( FLAGS.hidden_dim) m_c_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon), np.float32) m_h_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon), np.float32) self.m_state_init = 
[m_c_init, m_h_init] m_c_in = tf.placeholder( tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon], name="Mrnn_c_in") m_h_in = tf.placeholder( tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon], name="Mrnn_h_in") self.m_state_in = (m_c_in, m_h_in) m_state_in = tf.contrib.rnn.LSTMStateTuple(m_c_in, m_h_in) m_lstm_outputs, m_lstm_state = self.fast_dlstm( m_rnn_in, m_state_in, m_lstm_cell, FLAGS.manager_horizon, FLAGS.hidden_dim * FLAGS.manager_horizon) m_lstm_c, m_lstm_h = m_lstm_state self.m_state_out = (m_lstm_c[-1, :1, :], m_lstm_h[-1, :1, :]) self.goals = tf.reshape(m_lstm_outputs, [-1, FLAGS.hidden_dim]) self.normalized_goals = tf.contrib.layers.fully_connected( self.goals, FLAGS.hidden_dim, activation_fn=tf.tanh, name="Gt") summary_goals = tf.contrib.layers.summarize_activation( self.normalized_goals) def randomize_goals(t): t = tf.cast(t, tf.int32) packed_tensors = tf.stack([ tf.random_normal([ FLAGS.hidden_dim, ]), self.normalized_goals[t, :] ]) to_update = tf.cond( tf.less( self.prob_of_random_goal, tf.constant(FLAGS.final_random_goal_prob, dtype=tf.float32)), lambda: tf.cast( tf.multinomial( tf.log([[ self.prob_of_random_goal, tf.subtract(tf.constant(1.0), self. prob_of_random_goal) ]]), 1)[0][0], tf.int32), lambda: tf.constant(1, tf.int32)) resulted_tensor = tf.gather(packed_tensors, to_update) return resulted_tensor self.randomized_goals = tf.map_fn(lambda t: randomize_goals(t), tf.to_float( tf.range(0, step_size[0])), name="random_gt") summary_random_goals = tf.contrib.layers.summarize_activation( self.randomized_goals) self.decrease_prob_of_random_goal = tf.assign_sub( self.prob_of_random_goal, tf.constant( (FLAGS.initial_random_goal_prob - FLAGS.final_random_goal_prob) / FLAGS.explore_steps)) m_fc_value_w = tf.get_variable( "M_Value_W", shape=[FLAGS.hidden_dim, 1], initializer=normalized_columns_initializer(1.0)) self.m_value = tf.matmul(m_rnn_out, m_fc_value_w, name="M_Value") summary_m_value_act = tf.contrib.layers.summarize_activation( self.m_value) ############################################################################################################ # Worker network self.sum_prev_goals = tf.placeholder( shape=[None, FLAGS.hidden_dim], dtype=tf.float32, name="Prev_c_Goals_sum") w_rnn_in = tf.expand_dims(self.f_percept, [0], name="Wrnn_in") step_size = tf.shape(self.inputs)[:1] w_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( FLAGS.goal_embedding_size * FLAGS.nb_actions) w_c_init = np.zeros((1, w_lstm_cell.state_size.c), np.float32) w_h_init = np.zeros((1, w_lstm_cell.state_size.h), np.float32) self.w_state_init = [w_c_init, w_h_init] w_c_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.c], name="Wrnn_c_in") w_h_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.h], name="Wrnn_h_in") self.w_state_in = (w_c_in, w_h_in) w_state_in = tf.contrib.rnn.LSTMStateTuple(w_c_in, w_h_in) w_lstm_outputs, w_lstm_state = tf.nn.dynamic_rnn( w_lstm_cell, w_rnn_in, initial_state=w_state_in, sequence_length=step_size, time_major=False) w_lstm_c, w_lstm_h = w_lstm_state self.w_state_out = (w_lstm_c[:1, :], w_lstm_h[:1, :]) Ut = tf.reshape( w_lstm_outputs, [step_size[0], FLAGS.nb_actions, FLAGS.goal_embedding_size], name="Ut") Ut_flat = tf.reshape( w_lstm_outputs, [step_size[0], FLAGS.nb_actions * FLAGS.goal_embedding_size], name="Ut_flat") summary_wrnn_act = tf.contrib.layers.summarize_activation(Ut) goal_encoding = tf.contrib.layers.fully_connected( self.sum_prev_goals, FLAGS.goal_embedding_size, biases_initializer=None, scope="goal_emb") interm_rez = 
tf.squeeze( tf.matmul(Ut, tf.expand_dims(goal_encoding, 2)), 2) interm_rez = tf.contrib.layers.flatten(interm_rez) self.w_policy = tf.nn.softmax(interm_rez, name="W_Policy") summary_w_policy_act = tf.contrib.layers.summarize_activation( self.w_policy) w_fc_value_w = tf.get_variable( "W_Value_W", shape=[ FLAGS.nb_actions * FLAGS.goal_embedding_size + FLAGS.goal_embedding_size, 1 ], initializer=normalized_columns_initializer(1.0)) self.w_value = tf.matmul(tf.concat([Ut_flat, goal_encoding], 1), w_fc_value_w, name="W_Value") summary_w_value_act = tf.contrib.layers.summarize_activation( self.w_value) if scope != 'global': self.w_extrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32) self.m_extrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32) self.w_intrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32) def gather_state_at_horiz(t): t = tf.cast(t, tf.int32) f_Mspace_c = tf.gather( self.f_Mspace, tf.minimum( t + tf.constant(FLAGS.manager_horizon, dtype=tf.int32), step_size[0] - 1)) return f_Mspace_c self.f_Mspace_c = tf.cast(tf.map_fn( lambda t: gather_state_at_horiz(t), tf.to_float(tf.range(0, step_size[0])), name="state_at_horiz"), dtype=tf.float32) self.state_diff = self.f_Mspace_c - self.f_Mspace self.cos_sim_state_diff = self.cosine_distance( tf.stop_gradient(self.state_diff), self.normalized_goals, dim=1) self.m_advantages = self.m_extrinsic_return - tf.stop_gradient( tf.reshape(self.m_value, [-1])) self.goals_loss = -tf.reduce_sum( self.m_advantages * self.cos_sim_state_diff) self.m_value_loss = FLAGS.m_beta_v * tf.reduce_sum( tf.square(self.m_extrinsic_return - tf.reshape(self.m_value, [-1]))) self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Actions") self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions, dtype=tf.float32, name="Actions_Onehot") self.responsible_outputs = tf.reduce_sum( self.w_policy * self.actions_onehot, [1]) self.intrinsic_return = FLAGS.alpha * self.w_intrinsic_return self.total_return = self.w_extrinsic_return + self.intrinsic_return self.w_advantages = self.total_return - tf.stop_gradient( tf.reshape(self.w_value, [-1])) # Loss functions self.w_value_loss = FLAGS.w_beta_v * tf.reduce_sum( tf.square(self.total_return - tf.reshape(self.w_value, [-1]))) self.entropy = -tf.reduce_sum( self.w_policy * tf.log(self.w_policy + 1e-7)) self.w_policy_loss = -tf.reduce_sum( tf.log(self.responsible_outputs + 1e-7) * self.w_advantages) - self.entropy * FLAGS.beta_e self.loss = self.w_value_loss + self.w_policy_loss + self.m_value_loss + self.goals_loss local_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope) self.gradients = tf.gradients(self.loss, local_vars) self.var_norms = tf.global_norm(local_vars) grads, self.grad_norms = tf.clip_by_global_norm( self.gradients, FLAGS.gradient_clip_value) self.worker_summaries = [ summary_f_percept_act, summary_f_Mspace_act, summary_goals, summary_random_goals, summary_m_value_act, summary_wrnn_act, summary_w_policy_act, summary_w_value_act ] for grad, weight in zip(grads, local_vars): self.worker_summaries.append( tf.summary.histogram(weight.name + '_grad', grad)) self.worker_summaries.append( tf.summary.histogram(weight.name, weight)) self.merged_summary = tf.summary.merge(self.worker_summaries) global_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, 'global') self.apply_grads = trainer.apply_gradients( zip(grads, global_vars))
def apply_gradients(self, grads_and_vars, global_step=None, name=None): d_vars = [] g_vars = [] d_grads = [] g_grads = [] for grad,var in grads_and_vars: if var in self.gan.d_vars(): d_vars += [var] d_grads += [grad] elif var in self.gan.g_vars(): g_vars += [var] g_grads += [grad] else: raise("Couldn't find var in g_vars or d_vars") var_list = d_vars + g_vars with ops.init_scope(): slots_list = [] if self.config.include_slots: for name in self.optimizer.get_slot_names(): for var in self.optimizer.variables(): slots_list.append(self._zeros_slot(var, "curl", "curl")) self._prepare() def _name(post, s): ss = s.split(":") return ss[0] + "_" + post + "_dontsave" v1 = [tf.Variable(v, name=_name("curl",v.name)) for v in var_list] slots_list = [] slots_vars = [] if self.config.include_slots: for name in self.optimizer.get_slot_names(): for var in self.optimizer.variables(): slots_vars += [var] slots_list.append(self._zeros_slot(var, "curl", "curl")) restored_vars = var_list + slots_vars tmp_vars = v1 + slots_list # store variables for resetting if self.config.beta_type == 'sga': Jgrads = tf.gradients(d_grads, d_vars, grad_ys=d_grads, stop_gradients=d_vars) + [tf.zeros_like(g) for g in g_vars] elif self.config.beta_type == 'magnitude': consensus_reg = [tf.square(g) for g in d_grads if g is not None] Jgrads = tf.gradients(consensus_reg, d_vars) + [tf.zeros_like(g) for g in g_vars] else: consensus_reg = 0.5 * sum( tf.reduce_sum(tf.square(g)) for g in d_grads if g is not None ) Jgrads = tf.gradients(consensus_reg, d_vars, stop_gradients=d_vars) + [tf.zeros_like(g) for g in g_vars] g1s = d_grads + g_grads op1 = tf.group(*[tf.assign(w, v) for w,v in zip(tmp_vars, restored_vars)]) # store variables with tf.get_default_graph().control_dependencies([op1]): # store g2 op3 = tf.group(*[tf.assign_sub(v, self._lr_t*grad) for grad,v in grads_and_vars]) with tf.get_default_graph().control_dependencies([op3]): def curlcombine(g1,g2,_v1,_v2,curl,rho): stepsize = self._lr_t if curl == "mirror": return self._gamma*(g1 + 2*g2) else: return self._gamma*g1-rho*(g2-g1)/stepsize g2s = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars) if self.config.form == 'central': def central_step(): # restore v1, slots op5 = tf.group(*[ tf.assign(w,v) for w,v in zip(restored_vars, tmp_vars)]) with tf.get_default_graph().control_dependencies([op5]): back = tf.group(*[tf.assign_sub(v, -self._lr_t*grad) for grad,v in grads_and_vars]) with tf.get_default_graph().control_dependencies([back]): return tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars) def curlcombinecentral(g1,g2,_v1,_v2,curl,rho): #stepsize = (_v2-_v1)/(g1+1e-8) stepsize = self._lr_t if curl == "mirror": return self._gamma*(g1 + 2*g2) else: return self._gamma*g1-rho*(g2-g1)/(2*stepsize) g1s = central_step() g3s = [curlcombinecentral(g1,g2,v1,v2,self.config.d_curl,self.d_rho) if v2 in d_vars else curlcombinecentral(g1,g2,v1,v2,self.config.g_curl,self.g_rho) for g1,g2,v1,v2 in zip(g1s,g2s,v1,var_list)] else: #forward g3s = [curlcombine(g1,g2,v1,v2,self.config.d_curl,self.d_rho) if v2 in d_vars else curlcombine(g1,g2,v1,v2,self.config.g_curl,self.g_rho) for g1,g2,v1,v2 in zip(g1s,g2s,v1,var_list)] # restore v1, slots op5 = tf.group(*[ tf.assign(w,v) for w,v in zip(restored_vars, tmp_vars)]) with tf.get_default_graph().control_dependencies([op5]): flin = [] for grad, jg in zip(g3s, Jgrads): if jg is None or self._beta <= 0: flin += [grad] else: flin += [grad + jg * self._beta] if 
self.config.orthonormal: shapes = [self.gan.ops.shape(l) for l in flin] u = [tf.reshape(l, [-1]) for l in flin[:len(d_vars)]] v = [tf.reshape(l, [-1]) for l in Jgrads[:len(d_vars)]] def proj(u, v,shape): dot = tf.tensordot(v, u, 1) / (tf.square(u)+1e-8) dot = tf.maximum(-1.0, dot) dot = tf.minimum(1.0, dot) dot = dot * u dot = tf.reshape(dot, shape) return dot proj_u1_v2 = [proj(_u, _v, _s) for _u, _v, _s in zip(u, v, shapes)] flin = [_flin + self.gan.configurable_param(self.config.ortholambda) * proj for _flin, proj in zip(flin, proj_u1_v2)] + flin[len(d_vars):] step3 = list(zip(flin, var_list)) op6 = self.optimizer.apply_gradients(step3.copy(), global_step=global_step, name=name) with tf.get_default_graph().control_dependencies([op6]): return tf.no_op()
def __init__(self, particles, cost_fun, tf_scope="default", batch_generator=None, stepsize_schedule=ConstantStepsizeSchedule(0.1), alpha=0.9, fudge_factor=1e-6, session=tf.get_default_session(), dtype=tf.float64, seed=None): """ Initialize the sampler parameters and set up a tensorflow.Graph for later queries. Parameters ---------- particles : List[tensorflow.Variable] List of particles each representing a (different) guess of the target parameters of this sampler. cost_fun : callable Function that takes `params` of *one* particle as input and returns a 1-d `tensorflow.Tensor` that contains the cost-value. Frequently denoted with `U` in literature. batch_generator : iterable, optional Iterable which returns dictionaries to feed into tensorflow.Session.run() calls to evaluate the cost function. Defaults to `None` which indicates that no batches shall be fed. stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule Iterator class that produces a stream of stepsize values that we can use in our samplers. See also: `pysgmcmc.stepsize_schedules` alpha : float, optional TODO DOKU Defaults to `0.9`. fudge_factor : float, optional TODO DOKU Defaults to `1e-6`. session : tensorflow.Session, optional Session object which knows about the external part of the graph (which defines `Cost`, and possibly batches). Used internally to evaluate (burn-in/sample) the sampler. dtype : tensorflow.DType, optional Type of elements of `tensorflow.Tensor` objects used in this sampler. Defaults to `tensorflow.float64`. seed : int, optional Random seed to use. Defaults to `None`. See Also ---------- pysgmcmc.sampling.MCMCSampler: Base class for `SteinVariationalGradientDescentSampler` that specifies how actual sampling is performed (using iterator protocol, e.g. `next(sampler)`). """ assert isinstance(alpha, (int, float)) assert isinstance(fudge_factor, (int, float)) # assert callable(cost_fun) # self.particles = tf.stack(particles) self.particles = particles # def cost_fun_wrapper(params): # return tf.map_fn(lambda particle: cost_fun(particle), self.particles) # cost_fun_wrapper.__name__ = "potential_energy" # cost_fun.__name__ # super().__init__( self._init_basic( params=particles, cost_fun=cost_fun, # cost_fun_wrapper, tf_scope=tf_scope, batch_generator=batch_generator, session=session, seed=seed, dtype=dtype, stepsize_schedule=stepsize_schedule ) with tf.variable_scope(tf_scope, reuse=tf.AUTO_REUSE): fudge_factor = tf.constant( fudge_factor, dtype=self.dtype, name="fudge_factor" ) self.epsilon = tf.Variable( stepsize_schedule.initial_value, dtype=self.dtype, name="stepsize" ) stack_vectorized_params = tf.stack(self.vectorized_params) self.n_particles = tf.cast( # self.particles.shape[0], self.dtype stack_vectorized_params.shape[0], self.dtype ) historical_grad = tf.get_variable( "historical_grad", stack_vectorized_params.shape, dtype=dtype, initializer=tf.zeros_initializer() ) self.session.run( tf.variables_initializer([historical_grad, self.epsilon]) ) # lnpgrad = tf.squeeze(tf.gradients(self.cost, self.particles)) grads = [] for i, cost in enumerate(cost_fun): grads.append(tf.concat([vectorize(gradient) for gradient in tf.gradients(cost, self.particles[i])], axis=0)) lnpgrad = tf.squeeze(grads) kernel_matrix, kernel_gradients = self.svgd_kernel(stack_vectorized_params) # self.svgd_kernel(self.particles) grad_theta = tf.divide( tf.matmul(kernel_matrix, lnpgrad) + kernel_gradients, self.n_particles ) historical_grad_t = tf.assign( historical_grad, alpha * historical_grad + (1. 
- alpha) * (grad_theta ** 2) ) adj_grad = tf.divide( grad_theta, fudge_factor + tf.sqrt(historical_grad_t) ) for i, particle in enumerate(self.particles): vectorized_Theta_t = tf.assign_sub( self.vectorized_params[i], self.epsilon * adj_grad[i] ) start_idx = 0 for j, param in enumerate(particle): flat_shape = tf.reduce_prod(param.shape) vectorized_param = vectorized_Theta_t[start_idx:start_idx+flat_shape] self.theta_t[i*len(particle) + j] = tf.assign( param, tf.reshape(vectorized_param, shape=param.shape), name="theta_t_%d_%d" % (i, j) ) start_idx += flat_shape return
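# Rough NumPy restatement (an assumption-labeled sketch, not the class's own
# svgd_kernel) of the SVGD step assembled above, using an RBF kernel and
# AdaGrad-style step-size adaptation. lnpgrad holds per-particle gradients of
# the log-density, so the step below ascends it.
import numpy as np

def svgd_step(theta, lnpgrad, hist, eps=1e-3, alpha=0.9, fudge=1e-6, h=1.0):
    diff = theta[:, None, :] - theta[None, :, :]   # diff[i, j] = x_i - x_j
    k = np.exp(-np.sum(diff ** 2, axis=-1) / h)    # RBF kernel matrix
    dk = (2.0 / h) * np.sum(k[:, :, None] * diff, axis=1)  # sum_j grad_xj k
    phi = (k @ lnpgrad + dk) / len(theta)          # SVGD direction
    hist = alpha * hist + (1 - alpha) * phi ** 2   # AdaGrad history
    return theta + eps * phi / (fudge + np.sqrt(hist)), hist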
def build_graph(hp):
    # Build the TensorFlow graph of the loaded model with parameter-update
    # operations. This model's trainable variables are error values on a
    # test-set batch.
    hp['dnn_hp']['save_dir'] = hp['trained_model_dir']
    dnn.DNN(utils.CIFAR_INPUT_SIZE, N_CLASSES, hp['dnn_hp'])
    dnn_saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.variable_scope('DNN/EGT', reuse=tf.AUTO_REUSE):
        # Compute loss on the test batch from the 'learnable_error' parameter
        # and the actual prediction. Note that learnable_error is initialized
        # to zero, so the initial update is equivalent to making the network
        # learn its own output.
        g = tf.get_default_graph()
        test_X_ph = g.get_tensor_by_name("DNN/X:0")
        trained_variables = [
            v for v in tf.global_variables() if v.name[:4] == 'DNN/'
        ]
        learnable_error = tf.Variable(tf.zeros([hp['batch_size'], N_CLASSES], tf.float32),
                                      name='learnable_error')
        tf.summary.histogram('learnable_error', learnable_error)
        logits = g.get_tensor_by_name('DNN/output_layer/logits:0')
        probs = g.get_tensor_by_name("DNN/probs:0")
        log_probs = tf.nn.log_softmax(logits, name='log_probs')
        new_probs = tf.nn.softmax(logits + learnable_error, name='new_loss')
        test_loss = tf.reduce_sum(log_probs * new_probs, name='test_loss')
        # Build updated graph
        train_X_ph = tf.placeholder(tf.float32, test_X_ph.get_shape(), name='X')
        opt = SGD(lr=hp['sub_lr'], momentum=hp['sub_momentum'], nesterov=True)
        with tf.variable_scope('adam_updates'):
            sub_updates = opt.get_updates(test_loss, trained_variables)
        updates_ops = tf.group(*sub_updates, name="updates_ops")
        replacements = utils.extract_update_dict(sub_updates)
        replacements[test_X_ph] = train_X_ph
        utils.graph_replace(test_loss, replacements,
                            dst_scope='EGT/UpdatedDNN/', src_scope='DNN/')
    with tf.variable_scope('EGT/'):
        # Compute loss of the updated graph on the train batch
        train_y_ph = tf.placeholder(tf.int32, [None], name='y')
        updated_logits = g.get_tensor_by_name(
            "EGT/UpdatedDNN/output_layer/logits:0")
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=updated_logits, labels=train_y_ph,
            name='generalization_xentropy')
        generalization_loss = tf.reduce_mean(xentropy,
                                             name='generalization_loss')
        # We learn 'learnable_error' by backpropagating from
        # 'generalization_loss' through the sub-SGD update on test_loss.
        # Note that this step is computationally expensive, as the gradient
        # computation needs second-order derivatives w.r.t. the model
        # parameters. Thus, you either need to keep model capacity low or
        # apply this technique to the top layers of your model
        # (TODO: implement this with a Hessian approximation?)
        lr = tf.constant(hp['lr'], name='lr')
        meta_gradients = tf.gradients(generalization_loss, learnable_error)
        meta_optimize = tf.assign_sub(learnable_error, lr * meta_gradients[0],
                                      name='optimize')
    init_ops = tf.variables_initializer(
        [v for v in tf.global_variables() if 'EGT/' in v.name])
    return (
        test_X_ph, train_X_ph, train_y_ph
    ), generalization_loss, test_loss, probs, new_probs, meta_optimize, updates_ops, init_ops, dnn_saver
def __init__(self, shape_list, seed_num=0):
    seed(seed_num)
    set_random_seed(seed_num)
    # Placeholders for input, output and dropout
    sequence_length = 28 * 28
    num_classes = 10
    self.shape_list = shape_list
    self.input_x = tf.placeholder(tf.float32, [None, sequence_length], name="input_x")
    self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    self.lr_array = tf.placeholder(tf.float32, name="lr_array")
    self.alpha_array = tf.placeholder(tf.float32, name="alpha_array")
    initializer = tf.contrib.layers.xavier_initializer()
    # initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.1)
    with tf.name_scope("input"):
        self.P1 = tf.Variable(tf.eye(int(shape_list[0][0])))
        w1 = tf.get_variable("w1", shape=shape_list[0], initializer=initializer)
        y_1 = tf.concat([self.input_x,
                         tf.tile(tf.ones([1, 1]), [tf.shape(self.input_x)[0], 1])], 1)
        r = tf.reduce_mean(y_1, 0, keep_dims=True)
        k = tf.matmul(self.P1, tf.transpose(r))
        self.delta_P1 = tf.divide(tf.matmul(k, tf.transpose(k)),
                                  self.alpha_array[0][0] + tf.matmul(r, k))
        self.P1 = tf.assign_sub(self.P1, self.delta_P1)
        y1 = tf.nn.relu(tf.matmul(y_1, w1, name="y1"))
    with tf.name_scope("output"):
        self.P2 = tf.Variable(tf.eye(int(shape_list[1][0])))
        w2 = tf.get_variable("w2", shape=shape_list[1], initializer=initializer)
        y_2 = tf.concat([y1, tf.tile(tf.ones([1, 1]), [tf.shape(y1)[0], 1])], 1)
        r = tf.reduce_mean(y_2, 0, keep_dims=True)
        k = tf.matmul(self.P2, tf.transpose(r))
        self.delta_P2 = tf.divide(tf.matmul(k, tf.transpose(k)),
                                  self.alpha_array[0][1] + tf.matmul(r, k))
        self.P2 = tf.assign_sub(self.P2, self.delta_P2)
        y2 = tf.matmul(y_2, w2, name="y2")
    scores = y2
    # Calculate mean cross-entropy loss
    with tf.name_scope("loss"):
        # losses = tf.square(self.scores - self.input_y)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.input_y)
        # self.loss = tf.reduce_mean(losses) + 5e-4 * (tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))
        self.loss = tf.reduce_mean(losses)
    # Accuracy
    with tf.name_scope("accuracy"):
        predictions = tf.argmax(scores, 1, name="predictions")
        correct_predictions = tf.equal(predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
    self.optimizer = tf.train.MomentumOptimizer(self.lr_array[0][0], momentum=0.9)
    # self.optimizer = tf.train.GradientDescentOptimizer(self.lr_array[0])
    # backward pass
    grads_and_vars = self.optimizer.compute_gradients(self.loss, var_list=[w1, w2])
    for i, (g, v) in enumerate(grads_and_vars):
        if g is not None:
            grads_and_vars[i] = (tf.clip_by_norm(g, 10), v)
    grad_v_input = [self.owm(self.P1, grads_and_vars[0])]
    grad_v_out = [self.owm(self.P2, grads_and_vars[1])]
    self.back_forward = self.optimizer.apply_gradients([grad_v_input[0], grad_v_out[0]])
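# Hedged NumPy restatement of the projector update applied to P1 and P2
# above: with r the batch-mean input (a row vector in the graph code),
#     P <- P - (P r^T)(P r^T)^T / (alpha + r P r^T).
# This is only a sketch for clarity; names are assumptions.
import numpy as np

def owm_projector_update(P, r, alpha):
    k = P @ r.reshape(-1, 1)                         # column vector P r^T
    return P - (k @ k.T) / (alpha + float(r.reshape(1, -1) @ k))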
def cnn(x, filter_size, strides, pool_fn, pool_size, pool_strides, act_fn, dtype=tf.float32, add_bias=True, wd=None, init_std=None, init_method=None, batch_norm=True, scope="cnn", trainable=True, is_training=True, keep_ema=False, ext_wts=None): """Builds a convolutional neural networks. Each layer contains the following operations: 1) Convolution, y = w * x. 2) Additive bias (optional), y = w * x + b. 3) Activation function (optional), y = g( w * x + b ). 4) Pooling (optional). Args: x: Input variable. filter_size: Shape of the convolutional filters, list of 4-d int. strides: Convolution strides, list of 4-d int. pool_fn: Pooling functions, list of N callable objects. pool_size: Pooling field size, list of 4-d int. pool_strides: Pooling strides, list of 4-d int. act_fn: Activation functions, list of N callable objects. add_bias: Whether adding bias or not, bool. wd: Weight decay, float. scope: Scope of the model, str. """ num_layer = len(filter_size) # x = tf.Print(x, ['x', tf.reduce_mean(x), tf.reduce_max(x)]) h = x wt_dict = {} with tf.variable_scope(scope): for ii in range(num_layer): with tf.variable_scope("layer_{}".format(ii)): if init_method is not None and init_method[ii]: _init_method = init_method[ii] else: _init_method = "truncated_normal" if ext_wts is not None: w = ext_wts["w_" + str(ii)] if type(w) == np.ndarray: w = tf.constant(w) log.info("Found all weights from numpy array") else: w = weight_variable(filter_size[ii], dtype=dtype, init_method=_init_method, init_param={ "mean": 0.0, "stddev": init_std[ii] }, wd=wd, name="w", trainable=trainable) wt_dict["w_" + str(ii)] = w if add_bias: if ext_wts is not None: b = ext_wts["b_" + str(ii)] if type(b) == np.ndarray: b = tf.constant(b) log.info("Found all biases from numpy array") else: b = weight_variable([filter_size[ii][3]], dtype=dtype, init_method="constant", init_param={"val": 0}, name="b", trainable=trainable) wt_dict["b_" + str(ii)] = b h = tf.nn.conv2d(h, w, strides=strides[ii], padding="SAME", name="conv") if add_bias: h = tf.add(h, b, name="conv_bias") if batch_norm: # Batch normalization. 
n_out = int(h.get_shape()[-1]) if ext_wts is not None: assign_ema = False beta = ext_wts["beta_" + str(ii)] gamma = ext_wts["gamma_" + str(ii)] emean = ext_wts["emean_" + str(ii)] evar = ext_wts["evar_" + str(ii)] if type(beta) == np.ndarray: beta = tf.constant(ext_wts["beta_" + str(ii)]) gamma = tf.constant(ext_wts["gamma_" + str(ii)]) emean = tf.constant(ext_wts["emean_" + str(ii)]) evar = tf.constant(ext_wts["evar_" + str(ii)]) log.info("Found all BN weights from numpy array") else: assign_ema = True beta = weight_variable([n_out], dtype=dtype, init_method="constant", init_param={"val": 0.0}, name="beta") gamma = weight_variable([n_out], dtype=dtype, init_method="constant", init_param={"val": 1.0}, name="gamma") emean = weight_variable([n_out], dtype=dtype, init_method="constant", init_param={"val": 0.0}, name="ema_mean", trainable=False) evar = weight_variable([n_out], dtype=dtype, init_method="constant", init_param={"val": 1.0}, name="ema_var", trainable=False) wt_dict["beta_" + str(ii)] = beta wt_dict["gamma_" + str(ii)] = gamma wt_dict["emean_" + str(ii)] = emean wt_dict["evar_" + str(ii)] = evar if is_training: decay = 0.9 mean, var = tf.nn.moments(h, [0, 1, 2], name="moments") if assign_ema: # assert False ema_mean_op = tf.assign_sub( emean, (emean - mean) * (1 - decay)) ema_var_op = tf.assign_sub(evar, (evar - var) * (1 - decay)) with tf.control_dependencies( [ema_mean_op, ema_var_op]): h = tf.nn.batch_normalization( h, mean, var, beta, gamma, 1e-5) else: h = (h - emean) / tf.sqrt(evar + 1e-5) * gamma + beta else: h = (h - emean) / tf.sqrt(evar + 1e-5) * gamma + beta if ii == num_layer - 1: assert act_fn[ii] is None if act_fn[ii] is not None: h = act_fn[ii](h, name="act") if pool_fn[ii] is not None: _height = int(h.get_shape()[1]) _width = int(h.get_shape()[2]) h = pool_fn[ii](h, pool_size[ii], strides=pool_strides[ii], padding="VALID", name="pool") _height = int(h.get_shape()[1]) _width = int(h.get_shape()[2]) log.info("After pool {} {}".format(_height, _width)) return h, wt_dict
def testAssignUpdateNoValueShape(self):
    var = state_ops.variable_op([1, 2], tf.float32)
    added = tf.assign_add(var, self._NewShapelessTensor())
    self.assertEqual([1, 2], added.get_shape())
    subbed = tf.assign_sub(var, self._NewShapelessTensor())
    self.assertEqual([1, 2], subbed.get_shape())
def batch_norm(x,
               is_training,
               gamma=None,
               beta=None,
               axes=[0, 1, 2],
               eps=1e-10,
               name="bn_out",
               decay=0.99,
               dtype=tf.float32):
    """Applies batch normalization.

    Collects mean and variance statistics on `x` over `axes` (everything
    except the last dimension, by default) and normalizes:
        x_ = gamma * (x - mean) / sqrt(var + eps) + beta

    Args:
      x: Input tensor, [B, ...].
      is_training: Bool; use batch moments and return EMA update ops if True,
        use the stored moving averages otherwise.
      gamma: Scaling parameter.
      beta: Bias parameter.
      axes: Axes to collect statistics over.
      eps: Denominator bias.
      decay: EMA decay rate for the moving mean and variance.

    Returns:
      normed: Batch-normalized tensor.
      ops: List of EMA update ops when training, otherwise None.
    """
    n_out = x.get_shape()[-1]
    try:
        n_out = int(n_out)
        shape = [n_out]
    except:
        shape = None
    emean = tf.get_variable("ema_mean",
                            shape=shape,
                            trainable=False,
                            dtype=dtype,
                            initializer=tf.constant_initializer(0.0, dtype=dtype))
    evar = tf.get_variable("ema_var",
                           shape=shape,
                           trainable=False,
                           dtype=dtype,
                           initializer=tf.constant_initializer(1.0, dtype=dtype))
    if is_training:
        mean, var = tf.nn.moments(x, axes, name="moments")
        ema_mean_op = tf.assign_sub(emean, (emean - mean) * (1 - decay))
        ema_var_op = tf.assign_sub(evar, (evar - var) * (1 - decay))
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, eps, name=name)
        return normed, [ema_mean_op, ema_var_op]
    else:
        normed = tf.nn.batch_normalization(x, emean, evar, beta, gamma, eps, name=name)
        return normed, None
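# Hedged usage sketch for batch_norm above: at training time the second
# return value is a list of EMA update ops that must run with each training
# step; at eval time it is None and the stored averages are used. Shapes and
# the scope name are illustrative.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 32, 32, 16])
with tf.variable_scope("bn0"):
    normed_train, ema_ops = batch_norm(x, is_training=True)
bn_update_op = tf.group(*ema_ops)  # run alongside the train op
with tf.variable_scope("bn0", reuse=True):
    normed_eval, _ = batch_norm(x, is_training=False)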
def finite_differences(self, grads_and_vars, global_step, name, d_vars, g_vars, d_grads, g_grads): """ Attempt to directly compute hessian and apply equation (6) """ d_grads = [] g_grads = [] d_vars = [] g_vars = [] beta = 0.5 if self.config.beta is not None: beta = self.config.beta for grad,var in grads_and_vars: if var in self.gan.d_vars(): d_vars += [var] d_grads += [grad] elif var in self.gan.g_vars(): g_vars += [var] g_grads += [grad] else: raise("Couldn't find var in g_vars or d_vars") all_vars = d_vars + g_vars all_grads = d_grads + g_grads with ops.init_scope(): [self._zeros_slot(v, "orig", self._name) for _,v in grads_and_vars] v1 = [self.get_slot(v, "orig") for v in all_vars] restored_vars = all_vars tmp_vars = v1 e1 = 0.0001 e2 = 0.0001 #gamma12 save = tf.group(*[tf.assign(w, v) for w,v in zip(tmp_vars.copy(), restored_vars.copy())]) # store variables def curl(): grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars) op3 = tf.group(*[tf.assign_sub(v, self._lr_t*grad) for grad,v in zip(grads, all_vars)]) with tf.get_default_graph().control_dependencies([op3]): def curlcombine(g1,g2): stepsize = self._lr_t return g1-(g2-g1)/stepsize new_grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars) g3s = [curlcombine(g1,g2) for g1,g2 in zip(grads,new_grads)] return g3s #gamma12 if self.config.method == 'curl': all_grads = curl() d_grads = all_grads[:len(d_vars)] g_grads = all_grads[len(d_vars):] with tf.get_default_graph().control_dependencies([save]): #opboth = self.optimizer.apply_gradients(grads_and_vars, global_step=global_step, name=name) #opdp = self.optimizer.apply_gradients(grads_and_vars[:len(d_vars)], global_step=global_step, name=name) #opgp = self.optimizer.apply_gradients(grads_and_vars[len(d_vars):], global_step=global_step, name=name) restore = tf.group(*[tf.assign(w, v) for w,v in zip(restored_vars.copy(), tmp_vars.copy())]) # store variables opboth = [tf.assign_sub(w, self._lr_t * v) for w,v in zip(all_vars.copy(), all_grads.copy())] # store variables with tf.get_default_graph().control_dependencies([tf.group(*opboth)]): if self.config.method == "curl": gboth = curl() else: gboth = tf.gradients(self.loss[0], d_vars) + tf.gradients(self.loss[1], g_vars) with tf.get_default_graph().control_dependencies([restore]): opd = opboth[:len(d_vars)] with tf.get_default_graph().control_dependencies([tf.group(*opd)]): if self.config.method == "curl": new_d_grads = curl() else: new_d_grads = tf.gradients(self.loss[0], d_vars) + tf.gradients(self.loss[1], g_vars) with tf.get_default_graph().control_dependencies([restore]): opg = opboth[len(d_vars):] with tf.get_default_graph().control_dependencies([tf.group(*opg)]): if self.config.method == "curl": new_g_grads = curl() else: new_g_grads = tf.gradients(self.loss[0], d_vars) + tf.gradients(self.loss[1], g_vars) with tf.get_default_graph().control_dependencies([restore]): new_grads = [] for _gboth, _gd, _gg, _g in zip(gboth,new_d_grads,new_g_grads,(d_grads+g_grads)): a = (_gg - _g) / self._lr_t # d2f/dx2i b = (_gboth - _gg) / (2*self._lr_t)+(_gd-_g)/(2*self._lr_t) # d2f/dx1dx2 c = (_gboth - _gd) / (2*self._lr_t)+(_gg-_g)/(2*self._lr_t) # d2f/dx1dx2 c = -c d = -(_gd - _g) / self._lr_t # d2f/dx2j if self.config.form == 5: a = (_gg - _g) / self._lr_t # d2f/dx2i b = (_gboth - _gg) / (2*self._lr_t)+(_gd-_g)/(2*self._lr_t) # d2f/dx1dx2 c = (_gboth - _gd) / (2*self._lr_t)+(_gg-_g)/(2*self._lr_t) # d2f/dx1dx2 d = (_gd - _g) / self._lr_t # d2f/dx2j J = 
np.array([[a, b], [c,d]]) Jt = np.transpose(J) det = a*d-b*c+1e-8 #h_1 = 1.0/det * (b+d-a-c) h_1_a = d/det h_1_b = -b/det h_1_c = -c/det h_1_d = a/det Jinv = np.array([[h_1_a,h_1_b],[h_1_c,h_1_d]]) _j = Jt[0][0]*Jinv[0][0]*_g+Jt[1][0]*Jinv[1][0]*_g+Jt[0][1]*Jinv[0][1]*_g+Jt[1][1]*Jinv[1][1]*_g alpha = 0.5 if self.config.alpha is not None: alpha = self.config.alpha beta = 0.5 if self.config.beta is not None: beta = self.config.beta new_grads.append( alpha*_g + beta*_j ) new_grads_and_vars = list(zip(new_grads, all_vars)).copy() return self.optimizer.apply_gradients(new_grads_and_vars, global_step=global_step, name=name)
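# Quick NumPy sanity check (illustrative values) of the closed-form 2x2
# inverse used above: for J = [[a, b], [c, d]],
#     J^{-1} = [[d, -b], [-c, a]] / (a*d - b*c).
import numpy as np

a, b, c, d = 2.0, 1.0, 1.0, 3.0
det = a * d - b * c
Jinv = np.array([[d, -b], [-c, a]]) / det
assert np.allclose(Jinv @ np.array([[a, b], [c, d]]), np.eye(2))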
def split_softmax(prelogits, label, num_classes, global_step, weight_decay, gamma=16.0, reuse=None): nrof_features = prelogits.shape[1].value batch_size = tf.shape(prelogits)[0] with tf.variable_scope('SplitSoftmax', reuse=reuse): weights = tf.get_variable('weights', shape=(num_classes, nrof_features), regularizer=slim.l2_regularizer(weight_decay), initializer=slim.xavier_initializer(), # initializer=tf.truncated_normal_initializer(stddev=0.1), # initializer=tf.constant_initializer(0), trainable=True, dtype=tf.float32) alpha = tf.get_variable('alpha', shape=(), regularizer=slim.l2_regularizer(1e-2), initializer=tf.constant_initializer(1.00), trainable=True, dtype=tf.float32) beta = tf.get_variable('beta', shape=(), # regularizer=slim.l2_regularizer(1e-2), initializer=tf.constant_initializer(0.0), trainable=True, dtype=tf.float32) sigma = tf.get_variable('sigma', shape=(), regularizer=slim.l2_regularizer(1e-1), initializer=tf.constant_initializer(1.0), trainable=True, dtype=tf.float32) threshold_pos = tf.get_variable('threshold_pos', shape=(), initializer=tf.constant_initializer(16.0), trainable=False, dtype=tf.float32) threshold_neg = tf.get_variable('threshold_neg', shape=(), initializer=tf.constant_initializer(0.0), trainable=False, dtype=tf.float32) # Normalizing the vecotors weights_normed = tf.nn.l2_normalize(weights, dim=1) prelogits_normed = tf.nn.l2_normalize(prelogits, dim=1) # weights_normed = weights # prelogits_normed = prelogits # Caluculate Centers centers, label_center, center_idx, center_weight = centers_by_label(prelogits_normed, label) centers = tf.gather(centers, center_idx) centers_normed = tf.nn.l2_normalize(centers, dim=1) coef = 1.0 # Label and logits between batch and examplars label_mat_glob = tf.one_hot(label, num_classes, dtype=tf.float32) label_mask_pos_glob = tf.cast(label_mat_glob, tf.bool) label_mask_neg_glob = tf.logical_not(label_mask_pos_glob) # label_exp_batch = tf.expand_dims(label, 1) # label_exp_glob = tf.expand_dims(label_history, 1) # label_mat_glob = tf.equal(label_exp_batch, tf.transpose(label_exp_glob)) # label_mask_pos_glob = tf.cast(label_mat_glob, tf.bool) # label_mask_neg_glob = tf.logical_not(label_mat_glob) # dist_mat_glob = euclidean_distance(prelogits_normed, tf.transpose(weights_normed), False) dist_mat_glob = tf.matmul(prelogits_normed, tf.transpose(weights_normed)) # + beta dist_pos_glob = tf.boolean_mask(dist_mat_glob, label_mask_pos_glob) dist_neg_glob = tf.boolean_mask(dist_mat_glob, label_mask_neg_glob) logits_glob = coef * dist_mat_glob logits_pos_glob = tf.boolean_mask(logits_glob, label_mask_pos_glob) logits_neg_glob = tf.boolean_mask(logits_glob, label_mask_neg_glob) # Label and logits within batch label_exp_batch = tf.expand_dims(label, 1) label_mat_batch = tf.equal(label_exp_batch, tf.transpose(label_exp_batch)) label_mask_pos_batch = tf.cast(label_mat_batch, tf.bool) label_mask_neg_batch = tf.logical_not(label_mask_pos_batch) mask_non_diag = tf.logical_not(tf.cast(tf.eye(batch_size), tf.bool)) label_mask_pos_batch = tf.logical_and(label_mask_pos_batch, mask_non_diag) # dist_mat_batch = euclidean_distance(prelogits_normed, tf.transpose(prelogits_normed), False) dist_mat_batch = tf.matmul(prelogits_normed, tf.transpose(prelogits_normed)) dist_pos_batch = tf.boolean_mask(dist_mat_batch, label_mask_pos_batch) dist_neg_batch = tf.boolean_mask(dist_mat_batch, label_mask_neg_batch) logits_batch = coef * dist_mat_batch logits_pos_batch = tf.boolean_mask(logits_batch, label_mask_pos_batch) logits_neg_batch = 
tf.boolean_mask(logits_batch, label_mask_neg_batch) # num_anchor = 32 # prelogits_anchor = tf.reshape(prelogits_normed[:num_anchor], [num_anchor, 1, nrof_features]) # prelogits_refer = tf.reshape(prelogits_normed[num_anchor:], [num_anchor, -1, nrof_features]) # dist_anchor = tf.reduce_sum(tf.square(prelogits_anchor-prelogits_refer), axis=2) # dist_anchor = tf.reshape(dist_anchor, [-1]) # logits_anchor = -0.5 * gamma * dist_anchor logits_pos = logits_pos_glob logits_neg = logits_neg_glob dist_pos = dist_pos_glob dist_neg = dist_neg_glob # epsilon_trsd = 0.3 t_pos = coef * (threshold_pos) t_neg = coef * (threshold_neg) if gamma == 'auto': # gamma = tf.nn.softplus(alpha) gamma = tf.log(tf.exp(1.0) + tf.exp(alpha)) elif type(gamma) == tuple: t_min, decay = gamma epsilon = 1e-5 t = t_min + 1.0/(epsilon + decay*tf.cast(global_step, tf.float32)) gamma = 1.0 / t else: assert type(gamma) == float gamma = tf.constant(gamma) hinge_loss = lambda x: tf.nn.relu(1.0 + x) margin_func = hinge_loss # Losses losses = [] # num_pos = tf.cast(0.95 * tf.cast(tf.size(logits_pos), tf.float32), tf.int32) # # num_neg = tf.cast(0.75 * tf.cast(tf.size(logits_neg), tf.float32), tf.int32) # q_d = tf.pow(tf.sqrt(dist_neg), 2-nrof_features)*tf.pow(1-0.25*dist_neg, (3-nrof_features)/2) # tf.add_to_collection('watch_list', ('q_d', tf.reduce_sum(q_d))) # q_d = tf.minimum(1.0, 1 * q_d / tf.reduce_sum(q_d)) # tf.add_to_collection('watch_list', ('q_d', tf.reduce_mean(q_d))) # sample_mask = tf.random_uniform(shape=tf.shape(logits_neg)) <= q_d # sample_mask = logits_neg >= tf.reduce_min(logits_pos) # _logits_neg = tf.boolean_mask(logits_neg, sample_mask) # tf.add_to_collection('watch_list', ('sample_ratio', # tf.cast(tf.size(_logits_neg),tf.float32) / tf.cast(tf.size(logits_neg),tf.float32))) # gamma2 = 1 / 0.01 _logits_pos = tf.reshape(logits_pos, [batch_size, -1]) _logits_neg = tf.reshape(logits_neg, [batch_size, -1]) norm = tf.square(tf.reduce_sum(tf.square(prelogits), axis=1, keep_dims=True)) norm_weights = tf.norm(tf.gather(weights, label), axis=1, keep_dims=True) t_pos = (beta) t_neg = (beta) _logits_pos = _logits_pos * gamma _logits_neg = _logits_neg * gamma # _logits_neg, _ = tf.nn.top_k(_logits_neg, num_neg) # _logits_pos, _ = tf.nn.top_k(_logits_pos, num_pos) # _logits_neg = tf.boolean_mask(_logits_neg, sample_mask) # _logits_pos = -tf.reduce_logsumexp(-_logits_pos)# , axis=1)[:,None] _logits_neg = tf.reduce_logsumexp(_logits_neg, axis=1)[:,None] # _logits_pos = tf.reduce_mean(_logits_pos) #-- Simulate Ranking # se_neg = tf.reduce_sum(tf.exp(_logits_neg)) # min_pos = tf.reduce_min(_logits_pos) # t_pos = tf.stop_gradient(tf.log(se_neg)) # t_neg = tf.stop_gradient(tf.log(se_neg - tf.exp(_logits_neg))) # norm = tf.reshape(prelogits[:,-1], [batch_size, -1]) # norm_weighted = tf.exp(-norm) # norm_weighted = norm / tf.reduce_sum(norm) * tf.cast(tf.size(norm), tf.float32) # sigma_batch = tf.reshape(tf.gather(sigma, label), [batch_size, -1]) m = 5.0 # tf.add_to_collection('watch_list', ('m',m)) factor = 1 / tf.cast(batch_size, tf.float32) bias = tf.log(tf.cast(num_classes, tf.float32)) loss_pos = tf.nn.relu(m + _logits_neg - _logits_pos) * 0.5 loss_neg = tf.nn.relu(m + _logits_neg - _logits_pos) * 0.5 loss = tf.reduce_mean((loss_pos + loss_neg), name='split_loss') losses.extend([loss]) tf.add_to_collection('watch_list', ('split_loss', loss)) # Global loss # weights_batch = tf.gather(weights_normed, label) # _logits_pos_glob = tf.reduce_sum(tf.square(prelogits_normed - weights_batch), axis=1) * coef * gamma _logits_pos_glob = 
tf.reshape(logits_pos_glob, [batch_size, -1]) * gamma _logits_neg_glob = tf.reshape(logits_neg_glob, [batch_size, -1]) * gamma _logits_neg_glob = tf.reduce_logsumexp(_logits_neg_glob) # , axis=1)[:,None] loss_glob = tf.reduce_mean(tf.nn.relu(1 + _logits_neg_glob - _logits_pos_glob), name='loss_glob') # losses.append(loss_glob) # tf.add_to_collection('watch_list', ('loss_glob', loss_glob)) # Weight decay loss_weight = tf.reduce_sum( 1e-7 * tf.square(weights_normed), name='loss_weight') # losses.append(loss_weight) # tf.add_to_collection('watch_list', ('loss_weight', loss_weight)) # Split Softmax # _logits_pos_glob = tf.reshape(logits_pos_glob, [batch_size, -1]) * gamma # _logits_neg_glob = tf.reshape(logits_neg_glob, [batch_size, -1]) * gamma # _logits_pos_glob = tf.log(tf.reduce_sum(tf.exp(_logits_pos_glob) + num_classes-1, axis=1)[:,None]) # _logits_neg_glob = tf.reduce_logsumexp(_logits_neg_glob, axis=1)[:,None] # _t_pos = t_pos * gamma # _t_neg = t_neg * gamma # loss_pos = tf.reduce_mean(tf.nn.softplus(_t_pos - _logits_pos_glob), name='loss_pos') # loss_neg = tf.reduce_mean(tf.nn.softplus(_logits_neg_glob - _t_neg), name='loss_neg') # losses.extend([loss_pos, loss_neg]) # Batch Center loss # centers_batch = tf.gather(centers, center_idx) centers_batch = tf.gather(weights_normed, label) dist_center = tf.reduce_sum(tf.square(prelogits_normed - centers_batch), axis=1) loss_center = tf.reduce_mean(1.0*dist_center, name='loss_center') # losses.append(loss_center) # tf.add_to_collection('watch_list', ('loss_center', loss_center)) # Update threshold if not threshold_pos in tf.trainable_variables(): # -- Mean threshold mean_pos, var_pos = tf.nn.moments(dist_pos, axes=[0]) mean_neg, var_neg = tf.nn.moments(dist_neg, axes=[0]) std_pos = tf.sqrt(var_pos) std_neg = tf.sqrt(var_neg) threshold_batch = std_neg*mean_pos / (std_pos+std_neg) + std_pos*mean_neg / (std_pos+std_neg) threshold_pos_batch = threshold_neg_batch = threshold_batch # -- Logits # threshold_pos_batch = tf.reduce_logsumexp(_logits_neg) # threshold_neg_batch = -tf.reduce_logsumexp(-_logits_pos) # -- Quantile # diff_pos_sorted, _ = tf.nn.top_k(logits_pos, 2) # diff_neg_sorted, _ = tf.nn.top_k(logits_neg, 2704237) # threshold_pos_batch = diff_neg_sorted[-1] # threshold_neg_batch = diff_pos_sorted[-1] threshold_neg_batch = tf.reduce_min(_logits_pos) threshold_pos_batch = tf.reduce_max(_logits_neg) # -- Update diff_threshold_pos = threshold_pos - threshold_pos_batch diff_threshold_neg = threshold_neg - threshold_neg_batch diff_threshold_pos = 0.1 * diff_threshold_pos diff_threshold_neg = 0.1 * diff_threshold_neg threshold_pos_update_op = tf.assign_sub(threshold_pos, diff_threshold_pos) threshold_neg_update_op = tf.assign_sub(threshold_neg, diff_threshold_neg) threshold_update_op = tf.group(threshold_pos_update_op, threshold_neg_update_op) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, threshold_update_op) # Update centers if not weights in tf.trainable_variables(): weights_batch = tf.gather(weights, label) diff_centers = weights_batch - prelogits unique_label, unique_idx, unique_count = tf.unique_with_counts(label) appear_times = tf.gather(unique_count, unique_idx) appear_times = tf.reshape(appear_times, [-1, 1]) diff_centers = diff_centers / tf.cast((1 + appear_times), tf.float32) diff_centers = 0.5 * diff_centers centers_update_op = tf.scatter_sub(weights, label, diff_centers) # centers_decay_op = tf.assign_sub(weights, 2*weight_decay*weights)# weight decay centers_update_op = tf.group(centers_update_op) 
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, centers_update_op) # if not sigma in tf.trainable_variables(): # weights_batch = tf.gather(weights, label) # diff_centers = weights_batch - prelogits # _, var_pos = tf.nn.moments(diff_centers, axes=[0]) # sigma_batch = tf.reduce_mean(tf.sqrt(var_pos)) # diff_sigma = sigma - sigma_batch # diff_sigma = 0.01 * diff_sigma # sigma_update_op = tf.assign_sub(sigma, diff_sigma) # tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, sigma_update_op) # Analysis mean_dist_pos = tf.reduce_mean(dist_pos, name='mean_dist_pos') mean_dist_neg = tf.reduce_mean(dist_neg, name='mean_dist_neg') acc_pos = tf.reduce_mean(tf.cast(tf.greater_equal(logits_pos, t_pos), tf.float32), name='acc_pos') acc_neg = tf.reduce_mean(tf.cast(tf.less(logits_neg, t_neg), tf.float32), name='acc_neg') tf.summary.scalar('threshold_pos', threshold_pos) tf.summary.scalar('mean_dist_pos', mean_dist_pos) tf.summary.scalar('mean_dist_neg', mean_dist_neg) tf.summary.scalar('acc_pos', acc_pos) tf.summary.scalar('acc_neg', acc_neg) tf.summary.scalar('gamma', gamma) tf.summary.scalar('alpha', alpha) tf.summary.scalar('beta', beta) tf.summary.histogram('dist_pos', dist_pos) tf.summary.histogram('dist_neg', dist_neg) # tf.summary.histogram('dist_neg_min', _logits_neg / coef) # tf.summary.histogram('sigma', sigma) # tf.add_to_collection('watch_list', ('alpha', alpha)) tf.add_to_collection('watch_list', ('gamma', gamma)) tf.add_to_collection('watch_list', ('alpha', alpha)) tf.add_to_collection('watch_list', ('beta', beta)) # tf.add_to_collection('watch_list', ('t_pos', t_pos)) # tf.add_to_collection('watch_list', ('t_neg', tf.reduce_mean(t_neg))) # tf.add_to_collection('watch_list', ('dpos', mean_dist_pos)) # tf.add_to_collection('watch_list', ('dneg', mean_dist_neg)) # tf.add_to_collection('watch_list', ('loss_pos', loss_pos)) # tf.add_to_collection('watch_list', ('loss_neg', loss_neg)) # tf.add_to_collection('watch_list', ('sigma', sigma)) # tf.add_to_collection('watch_list', ('logits_pos', tf.reduce_mean(_logits_pos))) # tf.add_to_collection('watch_list', ('logits_neg', tf.reduce_mean(_logits_neg))) # tf.add_to_collection('watch_list', ('acc_pos', acc_pos)) # tf.add_to_collection('watch_list', ('acc_neg', acc_neg)) return losses
def __init__(self, data, placeholder, FLAGS):
    self.optimizer = FLAGS.optimizer
    self.opti_epsilon = FLAGS.epsilon
    self.lr = FLAGS.learning_rate
    self.vocab_size = data.vocab_size
    self.measure = FLAGS.measure
    self.embed_dim = FLAGS.embed_dim
    self.batch_size = FLAGS.batch_size
    self.rel_size = FLAGS.rel_size
    self.tuple_model = FLAGS.tuple_model
    self.init_embedding = FLAGS.init_embedding
    self.rang = tf.range(0, FLAGS.batch_size, 1)
    self.temperature = tf.Variable(FLAGS.temperature, trainable=False)
    self.decay_rate = FLAGS.decay_rate
    self.log_space = FLAGS.log_space
    # LSTM params.
    self.term = FLAGS.term
    self.hidden_dim = FLAGS.hidden_dim
    self.peephole = FLAGS.peephole
    self.freeze_grad = FLAGS.freeze_grad
    self.regularization_method = FLAGS.regularization_method
    self.marginal_method = FLAGS.marginal_method

    self.t1x = placeholder['t1_idx_placeholder']
    self.t1mask = placeholder['t1_msk_placeholder']
    self.t1length = placeholder['t1_length_placeholder']
    self.t2x = placeholder['t2_idx_placeholder']
    self.t2mask = placeholder['t2_msk_placeholder']
    self.t2length = placeholder['t2_length_placeholder']
    self.rel = placeholder['rel_placeholder']
    self.relmsk = placeholder['rel_msk_placeholder']
    self.label = placeholder['label_placeholder']

    # Initialize box embeddings.
    self.min_embed, self.delta_embed = self.init_word_embedding(data)
    self.projector = unit_cube.MinMaxHyperCubeProjectorDeltaParam(
        self.min_embed, self.delta_embed, 0.0, 1e-10)
    self.project_op = self.projector.project_op

    # Get unit-box representations for both terms, whether they are phrases
    # or single words.
    (self.t1_min_embed, self.t1_max_embed,
     self.t2_min_embed, self.t2_max_embed) = self.get_word_embedding(
         self.t1x, self.t2x)

    # Get negative-example unit-box representations when negatives are
    # sampled uniformly at random during training.
    if FLAGS.neg == 'uniform':
        neg_num = 1
        self.nt1x = tf.random_uniform([self.batch_size * neg_num, 1], 0,
                                      self.vocab_size, dtype=tf.int32)
        self.nt2x = tf.random_uniform([self.batch_size * neg_num, 1], 0,
                                      self.vocab_size, dtype=tf.int32)
        (self.nt1_min_embed, self.nt1_max_embed,
         self.nt2_min_embed, self.nt2_max_embed) = self.get_word_embedding(
             self.nt1x, self.nt2x)
        # Combine the original word embeddings with the new embeddings.
        self.nt1_min_embed = tf.concat(
            [tf.tile(self.t1_min_embed, [neg_num, 1]), self.nt1_min_embed],
            axis=0)
        self.nt1_max_embed = tf.concat(
            [tf.tile(self.t1_max_embed, [neg_num, 1]), self.nt1_max_embed],
            axis=0)
        self.nt2_min_embed = tf.concat(
            [self.nt2_min_embed, tf.tile(self.t2_min_embed, [neg_num, 1])],
            axis=0)
        self.nt2_max_embed = tf.concat(
            [self.nt2_max_embed, tf.tile(self.t2_max_embed, [neg_num, 1])],
            axis=0)
        self.label = tf.concat(
            [self.label, tf.zeros([self.batch_size * neg_num * 2])], 0)
        self.t1_uniform_min_embed = tf.concat(
            [self.t1_min_embed, self.nt1_min_embed], axis=0)
        self.t1_uniform_max_embed = tf.concat(
            [self.t1_max_embed, self.nt1_max_embed], axis=0)
        self.t2_uniform_min_embed = tf.concat(
            [self.t2_min_embed, self.nt2_min_embed], axis=0)
        self.t2_uniform_max_embed = tf.concat(
            [self.t2_max_embed, self.nt2_max_embed], axis=0)
        (conditional_logits, self.meet_min, self.meet_max, self.disjoint,
         self.nested, self.overlap_volume,
         self.rhs_volume) = self.get_conditional_probability(
             self.t1_uniform_min_embed, self.t1_uniform_max_embed,
             self.t2_uniform_min_embed, self.t2_uniform_max_embed)
    else:
        (conditional_logits, self.meet_min, self.meet_max, self.disjoint,
         self.nested, self.overlap_volume,
         self.rhs_volume) = self.get_conditional_probability(
             self.t1_min_embed, self.t1_max_embed, self.t2_min_embed,
             self.t2_max_embed)

    evaluation_logits, _, _, _, _, _, _ = self.get_conditional_probability(
        self.t1_min_embed, self.t1_max_embed, self.t2_min_embed,
        self.t2_max_embed)
    self.eval_prob = -evaluation_logits

    # Conditional-probability loss.
    # self.cond_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
    #     labels=self.label, logits=conditional_logits))
    self.cond_loss = -tf.reduce_mean(
        tf.multiply(conditional_logits, self.label) +
        tf.multiply(tf.log(1 - tf.exp(conditional_logits) + 1e-10),
                    1 - self.label))
    self.cond_loss = FLAGS.w1 * self.cond_loss

    # Marginal-probability loss.
    if FLAGS.w2 > 0.0:
        if self.log_space:
            self.max_embed = self.min_embed + tf.exp(self.delta_embed)
        else:
            self.max_embed = self.min_embed + self.delta_embed
        if self.marginal_method == 'universe':
            self.universe_min = tf.reduce_min(self.min_embed, axis=0,
                                              keep_dims=True)
            self.universe_max = tf.reduce_max(self.max_embed, axis=0,
                                              keep_dims=True)
            self.universe_volume = tf.reduce_prod(
                tf.nn.softplus((self.universe_max - self.universe_min) /
                               self.temperature) * self.temperature,
                axis=-1)
            self.box_volume = tf.reduce_prod(
                tf.nn.softplus((self.max_embed - self.min_embed) /
                               self.temperature) * self.temperature,
                axis=-1)
            self.predicted_marginal_logits = tf.log(
                self.box_volume) - tf.log(self.universe_volume)
        elif self.marginal_method == 'softplus':
            self.box_volume = tf.reduce_prod(
                unit_cube.normalized_softplus(self.delta_embed,
                                              self.temperature),
                axis=-1)
            self.predicted_marginal_logits = tf.log(self.box_volume)
        elif self.marginal_method == 'sigmoid':
            self.box_volume = tf.reduce_prod(
                unit_cube.sigmoid_normalized_softplus(self.delta_embed,
                                                      self.temperature),
                axis=-1)
            self.predicted_marginal_logits = tf.log(self.box_volume)
        else:
            raise ValueError(
                "Expected universe, softplus, or sigmoid but received",
                self.marginal_method)
        self.marginal_probability = tf.constant(data.margina_prob)
        self.marginal_probability = tf.reshape(self.marginal_probability,
                                               [self.vocab_size])
        self.marg_loss = FLAGS.w2 * tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.marginal_probability,
                logits=self.predicted_marginal_logits))
    else:
        self.marg_loss = tf.constant(0.0)

    self.debug = tf.constant(0.0)
    self.temperature_update = tf.assign_sub(self.temperature,
                                            FLAGS.decay_rate)

    if FLAGS.debug:
        # Diagnostics for the conditional-probability loss.
        self.pos_disjoint = tf.logical_and(tf.cast(self.label, tf.bool),
                                           self.disjoint)
        self.pos_overlap = tf.logical_and(tf.cast(self.label, tf.bool),
                                          tf.logical_not(self.disjoint))
        self.neg_disjoint = tf.logical_and(
            tf.logical_not(tf.cast(self.label, tf.bool)), self.disjoint)
        self.neg_overlap = tf.logical_and(
            tf.logical_not(tf.cast(self.label, tf.bool)),
            tf.logical_not(self.disjoint))
        self.pos_nested = tf.logical_and(tf.cast(self.label, tf.bool),
                                         self.nested)
        self.neg_nested = tf.logical_and(
            tf.logical_not(tf.cast(self.label, tf.bool)), self.nested)
        self.pos_disjoint.set_shape([None])
        self.neg_disjoint.set_shape([None])
        self.pos_overlap.set_shape([None])
        self.neg_overlap.set_shape([None])
        self.pos_nested.set_shape([None])
        self.neg_nested.set_shape([None])

        if self.marginal_method == 'universe':
            lhs_volume = tf.reduce_prod(
                tf.nn.softplus((self.t2_max_embed - self.t2_min_embed) /
                               self.temperature) * self.temperature,
                axis=-1)
            logx = tf.log(self.rhs_volume) - tf.log(self.universe_volume)
            logy = tf.log(lhs_volume) - tf.log(self.universe_volume)
            logxy = tf.log(self.overlap_volume) - tf.log(self.universe_volume)
        elif self.marginal_method == 'softplus':
            logx = tf.log(
                tf.reduce_prod(
                    unit_cube.normalized_softplus(
                        (self.t1_max_embed - self.t1_min_embed),
                        self.temperature),
                    axis=-1))
            logy = tf.log(
                tf.reduce_prod(
                    unit_cube.normalized_softplus(
                        (self.t2_max_embed - self.t2_min_embed),
                        self.temperature),
                    axis=-1))
            logxy = tf.log(
                tf.reduce_prod(
                    unit_cube.normalized_softplus(
                        (self.meet_max - self.meet_min), self.temperature),
                    axis=-1))
        elif self.marginal_method == 'sigmoid':
            logx = tf.log(
                tf.reduce_prod(
                    unit_cube.sigmoid_normalized_softplus(
                        (self.t1_max_embed - self.t1_min_embed),
                        self.temperature),
                    axis=-1))
            logy = tf.log(
                tf.reduce_prod(
                    unit_cube.sigmoid_normalized_softplus(
                        (self.t2_max_embed - self.t2_min_embed),
                        self.temperature),
                    axis=-1))
            logxy = tf.log(
                tf.reduce_prod(
                    unit_cube.sigmoid_normalized_softplus(
                        (self.meet_max - self.meet_min), self.temperature),
                    axis=-1))
        else:
            raise ValueError(
                "Expected universe, softplus, or sigmoid but received",
                self.marginal_method)
        lognume1 = logxy
        lognume2 = logx + logy
        logdomi = 0.5 * (logx + logy + tf_utils.log1mexp(-logx) +
                         tf_utils.log1mexp(-logy))
        correlation = tf.exp(lognume1 - logdomi) - tf.exp(lognume2 - logdomi)

        self.marg_loss = tf.Print(self.marg_loss, [
            tf.exp(self.predicted_marginal_logits),
            self.marginal_probability, self.box_volume
        ], 'marginal prediction and label')
        self.cond_loss = tf.Print(self.cond_loss,
                                  [tf.exp(conditional_logits), self.label],
                                  'conditional prediction and label')
        self.cond_loss = tf.Print(self.cond_loss, [
            tf.reduce_sum(tf.cast(self.pos_nested, tf.int32)),
            tf.boolean_mask(tf.exp(conditional_logits), self.pos_nested)
        ], 'pos nested number')
        self.cond_loss = tf.Print(self.cond_loss, [
            tf.reduce_sum(tf.cast(self.neg_nested, tf.int32)),
            tf.boolean_mask(tf.exp(conditional_logits), self.neg_nested)
        ], 'neg nested number')
        self.cond_loss = tf.Print(self.cond_loss, [
            tf.reduce_mean(
                tf.boolean_mask(tf.exp(conditional_logits),
                                self.pos_disjoint)),
            tf.reduce_sum(tf.cast(self.pos_disjoint, tf.int32)),
            tf.count_nonzero(
                tf.less_equal(
                    tf.boolean_mask(correlation, self.pos_disjoint), 0)),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logxy), self.pos_disjoint)),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logx), self.pos_disjoint)),
            tf.boolean_mask(self.t2_max_embed, self.pos_disjoint),
            tf.boolean_mask(self.t2_min_embed, self.pos_disjoint)
        ], 'pos disjoint loss')
        self.cond_loss = tf.Print(self.cond_loss, [
            tf.reduce_mean(
                tf.boolean_mask(tf.exp(conditional_logits),
                                self.pos_overlap)),
            tf.reduce_sum(tf.cast(self.pos_overlap, tf.int32)),
            tf.count_nonzero(
                tf.less_equal(
                    tf.boolean_mask(correlation, self.pos_overlap), 0)),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logxy), self.pos_overlap)),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logx), self.pos_overlap))
        ], 'pos overlap loss')
        self.cond_loss = tf.Print(self.cond_loss, [
            tf.reduce_mean(
                tf.boolean_mask(tf.exp(conditional_logits),
                                self.neg_disjoint)),
            tf.reduce_sum(tf.cast(self.neg_disjoint, tf.int32)),
            tf.count_nonzero(
                tf.less_equal(
                    tf.boolean_mask(correlation, self.neg_disjoint), 0)),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logxy), self.neg_disjoint)),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logx), self.neg_disjoint))
        ], 'neg disjoint loss')
        self.cond_loss = tf.Print(self.cond_loss, [
            tf.reduce_mean(
                tf.boolean_mask(tf.exp(conditional_logits),
                                self.neg_overlap)),
            tf.reduce_sum(tf.cast(self.neg_overlap, tf.int32)),
            tf.count_nonzero(
                tf.less_equal(
                    tf.boolean_mask(correlation, self.neg_overlap), 0)),
            tf.boolean_mask(self.t1x, self.neg_overlap),
            tf.boolean_mask(self.t2x, self.neg_overlap),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logxy), self.neg_overlap)),
            tf.reduce_mean(tf.boolean_mask(tf.exp(logx), self.neg_overlap))
        ], 'neg overlap loss')

    # Model regularization.
    if self.regularization_method == 'universe_edge' and FLAGS.r1 > 0.0:
        self.regularization = FLAGS.r1 * tf.reduce_mean(
            tf.nn.softplus(self.universe_max - self.universe_min))
    elif self.regularization_method == 'delta' and FLAGS.r1 > 0.0:
        if self.log_space:
            self.regularization = FLAGS.r1 * tf.reduce_mean(
                tf.square(tf.exp(self.delta_embed)))
        else:
            self.regularization = FLAGS.r1 * tf.reduce_mean(
                tf.square(self.delta_embed))
    else:
        self.regularization = tf.constant(0.0)

    # Model final loss.
    self.loss = self.cond_loss + self.marg_loss + self.regularization

    # Loss-gradient norm. Embedding gradients come back as IndexedSlices,
    # hence the .values access; dense gradients are handled as well.
    grads = tf.gradients(self.loss, tf.trainable_variables())
    grad_norm = 0.0
    for g in grads:
        g_values = g.values if isinstance(g, tf.IndexedSlices) else g
        grad_norm += tf.reduce_sum(g_values * g_values)
    grad_norm = tf.sqrt(grad_norm)
    self.grad_norm = grad_norm
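The conditional logit above is a ratio of soft box volumes, log Vol(meet) minus log Vol(rhs box), where every side length passes through a temperature-scaled softplus so that nearly disjoint boxes still receive gradients. A minimal NumPy sketch under that reading (hypothetical standalone code; `soft_size` stands in for the `unit_cube` helpers):

import numpy as np

def soft_size(side_len, temperature=1.0):
    # Temperature-scaled softplus: a smooth version of max(side_len, 0).
    return np.log1p(np.exp(side_len / temperature)) * temperature

def conditional_logit(t1_min, t1_max, t2_min, t2_max, temperature=1.0):
    # Intersection ("meet") of the two boxes.
    meet_min = np.maximum(t1_min, t2_min)
    meet_max = np.minimum(t1_max, t2_max)
    log_overlap = np.sum(np.log(soft_size(meet_max - meet_min, temperature)))
    log_rhs = np.sum(np.log(soft_size(t1_max - t1_min, temperature)))
    return log_overlap - log_rhs  # log of a soft conditional probability

print(conditional_logit(np.array([0.1, 0.1]), np.array([0.6, 0.6]),
                        np.array([0.2, 0.2]), np.array([0.9, 0.9])))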
def assign_moving_mean_variance(mean_var, variance_var, value, decay,
                                name=None):
    """Compute exponentially weighted moving {mean,variance} of a streaming value.

    The `value` updated exponentially weighted moving `mean_var` and
    `variance_var` are given by the following recurrence relations:

    ```python
    variance_var = decay * (variance_var + (1 - decay) * (value - mean_var)**2)
    mean_var     = decay * mean_var + (1 - decay) * value
    ```

    Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var`
    uses the lag-1 mean. For derivation justification, see
    [Finch (2009; Eq. 143)][1].

    Args:
      mean_var: `float`-like `Variable` representing the exponentially weighted
        moving mean. Same shape as `variance_var` and `value`.
      variance_var: `float`-like `Variable` representing the exponentially
        weighted moving variance. Same shape as `mean_var` and `value`.
      value: `float`-like `Tensor`. Same shape as `mean_var` and `variance_var`.
      decay: A `float`-like `Tensor`. The moving mean decay. Typically close to
        `1.`, e.g., `0.999`.
      name: Optional name of the returned operation.

    Returns:
      mean_var: `Variable` representing the `value`-updated exponentially
        weighted moving mean.
      variance_var: `Variable` representing the `value`-updated exponentially
        weighted moving variance.

    Raises:
      TypeError: if `mean_var` does not have float type `dtype`.
      TypeError: if `mean_var`, `variance_var`, `value`, `decay` have different
        `base_dtype`.

    #### References

    [1]: Tony Finch. Incremental calculation of weighted mean and variance.
         _Technical Report_, 2009.
         http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
    """
    with tf.name_scope(name, "assign_moving_mean_variance",
                       [variance_var, mean_var, value, decay]):
        with tf.colocate_with(variance_var):
            with tf.colocate_with(mean_var):
                base_dtype = mean_var.dtype.base_dtype
                if not base_dtype.is_floating:
                    raise TypeError(
                        "mean_var.base_dtype({}) does not have float type "
                        "`dtype`.".format(base_dtype.name))
                if base_dtype != variance_var.dtype.base_dtype:
                    raise TypeError(
                        "mean_var.base_dtype({}) != "
                        "variance_var.base_dtype({})".format(
                            base_dtype.name,
                            variance_var.dtype.base_dtype.name))
                value = tf.convert_to_tensor(value, dtype=base_dtype,
                                             name="value")
                decay = tf.convert_to_tensor(decay, dtype=base_dtype,
                                             name="decay")
                delta = value - mean_var
                with tf.control_dependencies([delta]):
                    mean_var = tf.assign_add(mean_var, (1. - decay) * delta)
                    variance_var = tf.assign_sub(
                        variance_var,
                        (1. - decay) * (variance_var -
                                        decay * tf.square(delta)))
                return mean_var, variance_var
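A quick NumPy check of the recurrence this function implements (hypothetical standalone code, not part of the library): `delta` is computed from the lag-1 mean, and both statistics converge to the streaming value's moments.

import numpy as np

def update(mean, var, value, decay):
    delta = value - mean
    # Variance update uses the lag-1 mean via delta.
    var = var - (1. - decay) * (var - decay * delta**2)
    mean = mean + (1. - decay) * delta
    return mean, var

mean, var, decay = 0.0, 1.0, 0.99
for value in np.random.RandomState(0).normal(5.0, 2.0, size=10000):
    mean, var = update(mean, var, value, decay)
print(mean, var)  # approaches the true mean 5 and variance 4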
def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
    """Construct training op to update the registered variables based on their gradients."""
    tfutil.assert_tf_initialized()
    assert not self._updates_applied
    self._updates_applied = True
    all_ops = []

    # Check for no-op.
    if allow_no_op and len(self._devices) == 0:
        with tfutil.absolute_name_scope(self.scope):
            return tf.no_op(name='TrainingOp')

    # Clean up gradients.
    for device_idx, device in enumerate(self._devices.values()):
        with tfutil.absolute_name_scope(self.scope + "/Clean%d" % device_idx), \
                tf.device(device.name):
            for var, grad in device.grad_raw.items():
                # Filter out disconnected gradients and convert to float32.
                grad = [g for g in grad if g is not None]
                grad = [tf.cast(g, tf.float32) for g in grad]

                # Sum within the device.
                if len(grad) == 0:
                    grad = tf.zeros(var.shape)  # No gradients => zero.
                elif len(grad) == 1:
                    grad = grad[0]  # Single gradient => use as is.
                else:
                    grad = tf.add_n(grad)  # Multiple gradients => sum.

                # Scale as needed.
                scale = 1.0 / len(device.grad_raw[var]) / len(self._devices)
                scale = tf.constant(scale, dtype=tf.float32, name="scale")
                if self.minibatch_multiplier is not None:
                    scale /= tf.cast(self.minibatch_multiplier, tf.float32)
                scale = self.undo_loss_scaling(scale)
                device.grad_clean[var] = grad * scale

    # Sum gradients across devices.
    if len(self._devices) > 1:
        with tfutil.absolute_name_scope(self.scope + "/Broadcast"), \
                tf.device(None):
            if platform.system() == "Windows":
                # Windows => NCCL ops are not available.
                self._broadcast_fallback()
            elif tf.VERSION.startswith("1.15."):
                # TF 1.15 => NCCL ops are broken:
                # https://github.com/tensorflow/tensorflow/issues/41539
                self._broadcast_fallback()
            else:
                # Otherwise => NCCL ops are safe to use.
                self._broadcast_nccl()

    # Apply updates separately on each device.
    for device_idx, device in enumerate(self._devices.values()):
        with tfutil.absolute_name_scope(self.scope + "/Apply%d" % device_idx), \
                tf.device(device.name):
            # pylint: disable=cell-var-from-loop

            # Accumulate gradients over time.
            if self.minibatch_multiplier is None:
                acc_ok = tf.constant(True, name='acc_ok')
                device.grad_acc = OrderedDict(device.grad_clean)
            else:
                # Create variables.
                with tf.control_dependencies(None):
                    for var in device.grad_clean.keys():
                        device.grad_acc_vars[var] = tf.Variable(
                            tf.zeros(var.shape), trainable=False,
                            name="grad_acc_var")
                    device.grad_acc_count = tf.Variable(
                        tf.zeros([]), trainable=False, name="grad_acc_count")

                # Track counter.
                count_cur = device.grad_acc_count + 1.0
                count_inc_op = lambda: tf.assign(device.grad_acc_count,
                                                 count_cur)
                count_reset_op = lambda: tf.assign(device.grad_acc_count,
                                                   tf.zeros([]))
                acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier,
                                               tf.float32))
                all_ops.append(tf.cond(acc_ok, count_reset_op, count_inc_op))

                # Track gradients.
                for var, grad in device.grad_clean.items():
                    acc_var = device.grad_acc_vars[var]
                    acc_cur = acc_var + grad
                    device.grad_acc[var] = acc_cur
                    with tf.control_dependencies([acc_cur]):
                        acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
                        acc_reset_op = lambda: tf.assign(acc_var,
                                                         tf.zeros(var.shape))
                        all_ops.append(
                            tf.cond(acc_ok, acc_reset_op, acc_inc_op))

            # No overflow => apply gradients.
            all_ok = tf.reduce_all(
                tf.stack([acc_ok] + [
                    tf.reduce_all(tf.is_finite(g))
                    for g in device.grad_acc.values()
                ]))
            apply_op = lambda: device.optimizer.apply_gradients(
                [(tf.cast(grad, var.dtype), var)
                 for var, grad in device.grad_acc.items()])
            all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))

            # Adjust loss scaling.
            if self.use_loss_scaling:
                ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var,
                                                  self.loss_scaling_inc)
                ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var,
                                                  self.loss_scaling_dec)
                ls_update_op = lambda: tf.group(
                    tf.cond(all_ok, ls_inc_op, ls_dec_op))
                all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))

            # Last device => report statistics.
            if device_idx == len(self._devices) - 1:
                all_ops.append(
                    autosummary.autosummary(
                        self.id + "/learning_rate",
                        tf.convert_to_tensor(self.learning_rate)))
                all_ops.append(
                    autosummary.autosummary(self.id + "/overflow_frequency",
                                            tf.where(all_ok, 0, 1),
                                            condition=acc_ok))
                if self.use_loss_scaling:
                    all_ops.append(
                        autosummary.autosummary(
                            self.id + "/loss_scaling_log2",
                            device.loss_scaling_var))

    # Initialize variables.
    self.reset_optimizer_state()
    if self.use_loss_scaling:
        tfutil.init_uninitialized_vars(
            [device.loss_scaling_var for device in self._devices.values()])
    if self.minibatch_multiplier is not None:
        tfutil.run([
            var.initializer for device in self._devices.values()
            for var in list(device.grad_acc_vars.values()) +
            [device.grad_acc_count]
        ])

    # Group everything into a single op.
    with tfutil.absolute_name_scope(self.scope):
        return tf.group(*all_ops, name="TrainingOp")
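The accumulate-then-apply pattern above (buffer gradients in non-trainable variables, count minibatches, and only apply once the count reaches the multiplier) can be sketched in isolation. A minimal, hypothetical version outside the Optimizer class, with plain SGD standing in for `device.optimizer`:

import tensorflow as tf

w = tf.Variable(1.0)
loss = tf.square(w)
(grad,) = tf.gradients(loss, [w])

multiplier = 4                              # minibatch_multiplier stand-in
acc = tf.Variable(0.0, trainable=False)     # gradient accumulator
count = tf.Variable(0.0, trainable=False)   # accumulation counter

acc_cur = acc + grad
count_cur = count + 1.0
acc_ok = count_cur >= float(multiplier)

def apply_branch():
    # Apply the averaged accumulated gradient, then reset the buffers.
    upd = tf.assign_sub(w, 0.1 * acc_cur / multiplier)
    with tf.control_dependencies([upd]):
        return tf.group(tf.assign(acc, 0.0), tf.assign(count, 0.0))

def wait_branch():
    # Keep accumulating.
    return tf.group(tf.assign(acc, acc_cur), tf.assign(count, count_cur))

train_op = tf.cond(acc_ok, apply_branch, wait_branch)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(8):
        sess.run(train_op)
    print(sess.run(w))  # 0.64: only two real updates over eight steps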
def testAssignUpdateNoVarShape(self):
    var = state_ops.variable_op([1, 2], tf.float32, set_shape=False)
    added = tf.assign_add(var, [[2.0, 3.0]])
    self.assertEqual([1, 2], added.get_shape())
    subbed = tf.assign_sub(var, [[12.0, 13.0]])
    self.assertEqual([1, 2], subbed.get_shape())
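For contrast, a regular `tf.Variable` carries a static shape from the start, so the update ops infer `(1, 2)` from the variable itself rather than from the assigned value (a small hypothetical companion check):

import tensorflow as tf

var = tf.Variable([[1.0, 2.0]])
added = tf.assign_add(var, [[2.0, 3.0]])     # shape (1, 2) from the variable
subbed = tf.assign_sub(var, [[12.0, 13.0]])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(added))   # [[3., 5.]]
    print(sess.run(subbed))  # [[-9., -8.]]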
def build_trainer(self, child_model):
    # actor
    child_model.build_valid_rl()
    self.valid_acc = (tf.to_float(child_model.valid_shuffle_acc) /
                      tf.to_float(child_model.batch_size))
    self.reward = self.valid_acc

    if self.use_critic:
        # critic
        all_h = tf.concat(self.all_h, axis=0)
        value_function = tf.matmul(all_h, self.w_critic)
        advantage = value_function - self.reward
        critic_loss = tf.reduce_sum(advantage**2)
        self.baseline = tf.reduce_mean(value_function)
        self.loss = -tf.reduce_mean(self.sample_log_probs * advantage)
        critic_train_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                                        name="critic_train_step")
        critic_train_op, _, _, _ = get_train_ops(critic_loss,
                                                 [self.w_critic],
                                                 critic_train_step,
                                                 clip_mode=None,
                                                 lr_init=1e-3,
                                                 lr_dec_start=0,
                                                 lr_dec_every=int(1e9),
                                                 optim_algo="adam",
                                                 sync_replicas=False)
    else:
        # or a moving-average baseline
        self.sample_log_probs = tf.reduce_sum(self.sample_log_probs)
        self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
        baseline_update = tf.assign_sub(self.baseline, (1 - self.bl_dec) *
                                        (self.baseline - self.reward))
        with tf.control_dependencies([baseline_update]):
            self.reward = tf.identity(self.reward)
        self.loss = self.sample_log_probs * (self.reward - self.baseline)

    self.train_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                                  name="train_step")
    tf_variables = [
        var for var in tf.trainable_variables()
        if var.name.startswith(self.name) and "w_critic" not in var.name
    ]
    print("-" * 80)
    for var in tf_variables:
        print(var)

    self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops(
        self.loss,
        tf_variables,
        self.train_step,
        clip_mode=self.clip_mode,
        grad_bound=self.grad_bound,
        l2_reg=self.l2_reg,
        lr_init=self.lr_init,
        lr_dec_start=self.lr_dec_start,
        lr_dec_every=self.lr_dec_every,
        lr_dec_rate=self.lr_dec_rate,
        optim_algo=self.optim_algo,
        sync_replicas=self.sync_replicas,
        num_aggregate=self.num_aggregate,
        num_replicas=self.num_replicas)

    if self.use_critic:
        self.train_op = tf.group(self.train_op, critic_train_op)
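The `baseline_update` above is an exponential moving average of the reward, `baseline -= (1 - bl_dec) * (baseline - reward)`, used as a variance reducer for the REINFORCE gradient. A hypothetical NumPy sketch of the update:

import numpy as np

baseline, bl_dec = 0.0, 0.95
rewards = np.random.RandomState(1).uniform(0.5, 0.7, size=200)
for reward in rewards:
    # Same update as the tf.assign_sub above.
    baseline -= (1 - bl_dec) * (baseline - reward)
print(baseline)  # tracks the mean reward, roughly 0.6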
def optimize(loss, global_step, max_grad_norm, lr, lr_decay,
             sync_replicas=False, replicas_to_aggregate=1, task_id=0):
    """Builds optimization graph.

    * Creates an optimizer, and optionally wraps with SyncReplicasOptimizer
    * Computes, clips, and applies gradients
    * Maintains moving averages for all trainable variables
    * Summarizes variables and gradients

    Args:
      loss: scalar loss to minimize.
      global_step: integer scalar Variable.
      max_grad_norm: float scalar. Grads will be clipped to this value.
      lr: float scalar, learning rate.
      lr_decay: float scalar, learning rate decay rate.
      sync_replicas: bool, whether to use SyncReplicasOptimizer.
      replicas_to_aggregate: int, number of replicas to aggregate when using
        SyncReplicasOptimizer.
      task_id: int, id of the current task; used to ensure proper
        initialization of SyncReplicasOptimizer.

    Returns:
      train_op
    """
    with tf.name_scope('optimization'):
        # Compute gradients.
        tvars = tf.trainable_variables()
        grads = tf.gradients(
            loss,
            tvars,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

        # Clip non-embedding grads.
        non_embedding_grads_and_vars = [(g, v)
                                        for (g, v) in zip(grads, tvars)
                                        if 'embedding' not in v.op.name]
        embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                    if 'embedding' in v.op.name]
        ne_grads, ne_vars = zip(*non_embedding_grads_and_vars)
        ne_grads, _ = tf.clip_by_global_norm(ne_grads, max_grad_norm)
        non_embedding_grads_and_vars = list(zip(ne_grads, ne_vars))
        grads_and_vars = (embedding_grads_and_vars +
                          non_embedding_grads_and_vars)

        if not global_step:
            opt = tf.train.AdamOptimizer(lr)
            apply_gradient_op = opt.apply_gradients(grads_and_vars)
            return apply_gradient_op

        # Summarize.
        _summarize_vars_and_grads(grads_and_vars)

        # Decaying learning rate.
        lr = tf.train.exponential_decay(lr, global_step, 1, lr_decay,
                                        staircase=True)
        tf.summary.scalar('learning_rate', lr)
        opt = tf.train.AdamOptimizer(lr)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            0.999, global_step)
        global_step = tf.assign_sub(global_step, 1)

        # Apply gradients.
        if sync_replicas:
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=tvars,
                total_num_replicas=replicas_to_aggregate)
            apply_gradient_op = opt.apply_gradients(grads_and_vars)
            with tf.control_dependencies([apply_gradient_op]):
                train_op = tf.no_op(name='train_op')

            # Initialization ops.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 opt.get_chief_queue_runner())
            if task_id == 0:
                local_init_op = opt.chief_init_op
                tf.add_to_collection('chief_init_op',
                                     opt.get_init_tokens_op())
            else:
                local_init_op = opt.local_step_init_op
            tf.add_to_collection('local_init_op', local_init_op)
            tf.add_to_collection('ready_for_local_init_op',
                                 opt.ready_for_local_init_op)
        else:
            # Non-sync optimizer.
            apply_gradient_op = opt.apply_gradients(grads_and_vars)
            with tf.control_dependencies([apply_gradient_op]):
                train_op = variable_averages.apply(tvars)

        return train_op
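The split-then-clip step above leaves embedding gradients (typically sparse `IndexedSlices`) unclipped and applies `tf.clip_by_global_norm` only to the dense, non-embedding gradients. A minimal sketch of the same pattern with hypothetical variable names:

import tensorflow as tf

emb = tf.Variable(tf.ones([10, 4]), name='embedding_table')
dense = tf.Variable(tf.ones([4, 2]), name='dense_kernel')
loss = tf.reduce_sum(tf.matmul(tf.nn.embedding_lookup(emb, [0, 1]), dense))

tvars = [emb, dense]
grads = tf.gradients(loss, tvars)
pairs = list(zip(grads, tvars))
emb_pairs = [(g, v) for g, v in pairs if 'embedding' in v.op.name]
ne_grads, ne_vars = zip(*[(g, v) for g, v in pairs
                          if 'embedding' not in v.op.name])
# Clip only the dense, non-embedding gradients by global norm.
ne_grads, _ = tf.clip_by_global_norm(ne_grads, clip_norm=1.0)
grads_and_vars = emb_pairs + list(zip(ne_grads, ne_vars))

train_op = tf.train.AdamOptimizer(1e-3).apply_gradients(grads_and_vars)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)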
def finite_differences(self, grads_and_vars, global_step, name, d_vars,
                       g_vars, d_grads, g_grads):
    all_vars = [v for _, v in grads_and_vars]
    all_grads = [g for g, _ in grads_and_vars]
    d_grads = all_grads[:len(d_vars)]
    g_grads = all_grads[len(d_vars):]
    d_vars = []
    g_vars = []
    for grad, var in grads_and_vars:
        if var in self.gan.d_vars():
            d_vars += [var]
        elif var in self.gan.g_vars():
            g_vars += [var]
        else:
            raise ValueError("Couldn't find var in g_vars or d_vars")

    with ops.init_scope():
        [self._zeros_slot(v, "orig", self._name) for _, v in grads_and_vars]
    slots_list = []
    if self.config.include_slots:
        for name in self.optimizer.get_slot_names():
            for var in self.optimizer.variables():
                slots_list.append(
                    self.optimizer._zeros_slot(var, "orig", "orig"))
    v1 = [self.get_slot(v, "orig") for _, v in grads_and_vars]
    slots_list = []
    slots_vars = []
    restored_vars = all_vars + slots_vars
    tmp_vars = v1 + slots_list
    e1 = 0.0001
    e2 = 0.0001

    # gamma12
    save = tf.group(*[tf.assign(w, v) for w, v in
                      zip(tmp_vars, restored_vars)])  # save variables
    restore = tf.group(*[tf.assign(w, v) for w, v in
                         zip(restored_vars, tmp_vars)])  # restore variables

    def curl():
        grads = (tf.gradients(self.gan.trainer.d_loss, d_vars) +
                 tf.gradients(self.gan.trainer.g_loss, g_vars))
        op3 = tf.group(*[tf.assign_sub(v, self._lr_t * grad)
                         for grad, v in zip(grads, all_vars)])
        with tf.get_default_graph().control_dependencies([op3]):
            def curlcombine(g1, g2):
                stepsize = self._lr_t
                return g1 - (g2 - g1) / stepsize
            new_grads = (tf.gradients(self.gan.trainer.d_loss, d_vars) +
                         tf.gradients(self.gan.trainer.g_loss, g_vars))
            g3s = [curlcombine(g1, g2) for g1, g2 in zip(grads, new_grads)]
            return g3s

    # gamma12
    with tf.get_default_graph().control_dependencies([save]):
        #opboth = self.optimizer.apply_gradients(grads_and_vars, global_step=global_step, name=name)
        #opdp = self.optimizer.apply_gradients(grads_and_vars[:len(d_vars)], global_step=global_step, name=name)
        #opgp = self.optimizer.apply_gradients(grads_and_vars[len(d_vars):], global_step=global_step, name=name)
        opboth = tf.group(*[tf.assign_sub(w, self._lr_t * v)
                            for w, v in zip(all_vars, all_grads)])  # trial step, all vars
        opd = tf.group(*[tf.assign_sub(w, self._lr_t * v)
                         for w, v in zip(d_vars, d_grads)])  # trial step, D vars
        opg = tf.group(*[tf.assign_sub(w, self._lr_t * v)
                         for w, v in zip(g_vars, g_grads)])  # trial step, G vars
        with tf.get_default_graph().control_dependencies([opboth]):
            gboth = curl()
            with tf.get_default_graph().control_dependencies([restore]):
                with tf.get_default_graph().control_dependencies([opd]):
                    #new_d_grads = [tf.zeros_like(_d) for _d in d_vars]+tf.gradients(self.gan.trainer.g_loss, g_vars)
                    new_d_grads = curl()
                    with tf.get_default_graph().control_dependencies([restore]):
                        with tf.get_default_graph().control_dependencies([opg]):
                            #new_g_grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + [tf.zeros_like(_g) for _g in g_vars]
                            new_g_grads = curl()
                            with tf.get_default_graph().control_dependencies([restore]):
                                new_grads = []
                                for _gboth, _gd, _gg, _g in zip(
                                        gboth, new_d_grads, new_g_grads,
                                        d_grads):
                                    det = tf.square(_gboth) - (_gg * _gd) + 1e-8
                                    h_1 = 1.0 / det * (2 * _gboth - _gd - _gg)
                                    if self.config.hessian:
                                        # v = (g(x + hjej) - g(x))/(2hj) +
                                        #     (g(x + hiei) - g(x))/(2hi)
                                        a = (_gboth - _g) / self._lr_t  # d2f/dx2i
                                        c = (_gboth - _g) / self._lr_t  # d2f/dx2j
                                        b = ((_gg - _g) / (2 * self._lr_t) +
                                             (_gd - _g) / (2 * self._lr_t))  # d2f/dx1dx2
                                        d = b  # d2f/dx2dx1
                                        det = a * d - b * c + 1e-8
                                        #h_1 = 1.0/det * (b+d-a-c)
                                        h_1_a = d / det
                                        h_1_b = -b / det
                                        h_1_c = -c / det
                                        h_1_d = a / det
                                        h_1 = h_1_a * h_1_d - h_1_b * h_1_c
                                    new_grads.append(_g * h_1)
                                for _gboth, _gd, _gg, _g in zip(
                                        gboth[len(d_vars):],
                                        new_d_grads[len(d_vars):],
                                        new_g_grads[len(d_vars):], g_grads):
                                    det = tf.square(_gboth) - (_gg * _gd) + 1e-8
                                    h_1 = 1.0 / det * (2 * _gboth - _gd - _gg)
                                    if self.config.hessian:
                                        a = (_gboth - _g) / self._lr_t  # d2f/dx2i
                                        c = (_gboth - _g) / self._lr_t  # d2f/dx2j
                                        b = ((_gg - _g) / (2 * self._lr_t) +
                                             (_gd - _g) / (2 * self._lr_t))  # d2f/dx1dx2
                                        d = b  # d2f/dx2dx1
                                        det = a * d - b * c + 1e-8
                                        h_1_a = d / det
                                        h_1_b = -b / det
                                        h_1_c = -c / det
                                        h_1_d = a / det
                                        h_1 = h_1_a * h_1_d - h_1_b * h_1_c
                                    new_grads.append(_g * h_1)
                                new_grads_and_vars = list(zip(new_grads,
                                                              all_vars))
                                return self.optimizer.apply_gradients(
                                    new_grads_and_vars,
                                    global_step=global_step, name=name)
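The `self.config.hessian` branch estimates second derivatives from finite differences of gradients taken after trial steps. In one dimension the idea reduces to probing the gradient once and rescaling the step by the estimated curvature, as in this hypothetical NumPy sketch:

import numpy as np

def grad(x):
    # Gradient of f(x) = x**2.
    return 2.0 * x

x, lr = 3.0, 0.1
g = grad(x)
g_probe = grad(x - lr * g)       # gradient after a trial step
h = (g - g_probe) / (lr * g)     # finite-difference curvature estimate
x -= g / h                       # Newton-like step with estimated curvature
print(x)  # lands at the minimum 0 in one step (h == 2 exactly here)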
# relu
Relu1 = tf.nn.relu(X)  # 1,1,1
Relu0 = tf.nn.relu(P)

r_add = rtt.SecureReveal(Add)
r_sub = rtt.SecureReveal(Sub)
r_mul = rtt.SecureReveal(Mul)
r_matmul = rtt.SecureReveal(Matmul)
r_bias_add = rtt.SecureReveal(BiasAdd)
r_relu1 = rtt.SecureReveal(Relu1)
r_relu0 = rtt.SecureReveal(Relu0)
r_AB3 = rtt.SecureReveal(AB3)
r_AB4 = rtt.SecureReveal(AB4)
r_AB5 = rtt.SecureReveal(AB5)
r_assign_sub = rtt.SecureReveal(tf.assign_sub(Y, X))

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print("add reveal: ", sess.run(r_add))
    print("add_multiple_dimension reveal ab3(add): ", sess.run(r_AB3))
    print("add_multiple_dimension reveal ab4(add): ", sess.run(r_AB4))
    print("add_multiple_dimension reveal ab5(add): ", sess.run(r_AB5))
    print("sub reveal: ", sess.run(r_sub))
    print("mul reveal: ", sess.run(r_mul))
    print("matmul reveal: ", sess.run(r_matmul))
    print("bias_add reveal: ", sess.run(r_bias_add))
    print("relu(expect-0) reveal: ", sess.run(r_relu0))
    print("relu(expect-1) reveal: ", sess.run(r_relu1))
    print("assign_sub(expect-1) reveal: ", sess.run(r_assign_sub))
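For reference, the plaintext TensorFlow analogue of the final `assign_sub` check (no secure ops; the constants below are hypothetical stand-ins for the shared `X` and `Y`, chosen so the revealed result is all ones):

import tensorflow as tf

X = tf.Variable([[2.0, 2.0]])
Y = tf.Variable([[3.0, 3.0]])
assign_sub = tf.assign_sub(Y, X)  # Y - X, expected all ones

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print("assign_sub(expect-1): ", sess.run(assign_sub))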
def tf_store(self, states, internals, actions, terminal, reward):
    # Memory indices to overwrite.
    num_instances = tf.shape(input=terminal)[0]
    with tf.control_dependencies(
            [tf.assert_less_equal(num_instances, self.capacity)]):
        indices = tf.range(self.memory_index,
                           self.memory_index + num_instances) % self.capacity

    # Remove episode indices.
    num_episodes = tf.count_nonzero(
        input_tensor=tf.gather(params=self.terminal_memory, indices=indices),
        axis=0,
        dtype=util.tf_dtype('int'))
    num_episodes = tf.minimum(x=num_episodes, y=self.episode_count)
    assignment = tf.assign(
        ref=self.episode_indices[:self.episode_count - num_episodes],
        value=self.episode_indices[num_episodes:self.episode_count])

    # Decrement episode count.
    with tf.control_dependencies(control_inputs=(assignment,)):
        assignment = tf.assign_sub(ref=self.episode_count,
                                   value=num_episodes)

    # Assign new observations.
    with tf.control_dependencies(control_inputs=(assignment,)):
        assignments = list()
        for name in sorted(states):
            assignments.append(
                tf.scatter_update(ref=self.states_memory[name],
                                  indices=indices,
                                  updates=states[name]))
        for name in sorted(internals):
            assignments.append(
                tf.scatter_update(ref=self.internals_memory[name],
                                  indices=indices,
                                  updates=internals[name]))
        for name in sorted(actions):
            assignments.append(
                tf.scatter_update(ref=self.actions_memory[name],
                                  indices=indices,
                                  updates=actions[name]))
        assignments.append(
            tf.scatter_update(ref=self.terminal_memory,
                              indices=indices,
                              updates=terminal))
        assignments.append(
            tf.scatter_update(ref=self.reward_memory,
                              indices=indices,
                              updates=reward))

    # Add episode indices.
    with tf.control_dependencies(control_inputs=assignments):
        num_episodes = tf.count_nonzero(input_tensor=terminal,
                                        axis=0,
                                        dtype=util.tf_dtype('int'))
        assignment = tf.assign(
            ref=self.episode_indices[self.episode_count:self.episode_count +
                                     num_episodes],
            value=tf.boolean_mask(tensor=indices, mask=terminal))

    # Increment episode count.
    with tf.control_dependencies(control_inputs=(assignment,)):
        assignment = tf.assign_add(ref=self.episode_count,
                                   value=num_episodes)

    # Increment memory index.
    with tf.control_dependencies(control_inputs=(assignment,)):
        assignment = tf.assign(
            ref=self.episode_indices[-1],
            value=tf.where(self.memory_index + num_instances > self.capacity,
                           self.episode_indices[self.episode_count - 1],
                           self.capacity - 1))
    with tf.control_dependencies(control_inputs=(assignment,)):
        assignment = tf.assign(ref=self.memory_index,
                               value=((self.memory_index + num_instances) %
                                      self.capacity))
    with tf.control_dependencies(control_inputs=(assignment,)):
        return tf.no_op()
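The index arithmetic above implements a circular buffer: `tf.range(memory_index, memory_index + num_instances) % capacity` wraps new entries around so the oldest observations are overwritten first. A hypothetical NumPy sketch of the same indexing:

import numpy as np

capacity = 5
memory = np.zeros(capacity)
memory_index = 0

def store(values):
    global memory_index
    n = len(values)
    # Wrap new entries modulo capacity, overwriting the oldest slots.
    indices = (memory_index + np.arange(n)) % capacity
    memory[indices] = values
    memory_index = (memory_index + n) % capacity

store([1, 2, 3, 4])
store([5, 6, 7])   # wraps: overwrites slots 4, 0, 1
print(memory)      # [6. 7. 3. 4. 5.]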