def _build_train_op(self): """Builds the training op for Rainbow. Returns: train_op: An op performing one step of training. """ target_distribution = tf.stop_gradient(self._build_target_distribution()) # size of indices: batch_size x 1. indices = tf.range(tf.shape(self._replay_logits)[0])[:, None] # size of reshaped_actions: batch_size x 2. reshaped_actions = tf.concat([indices, self._replay.actions[:, None]], 1) # For each element of the batch, fetch the logits for its selected action. chosen_action_logits = tf.gather_nd(self._replay_logits, reshaped_actions) loss = tf.nn.softmax_cross_entropy_with_logits( labels=target_distribution, logits=chosen_action_logits) optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate, epsilon=self.optimizer_epsilon) update_priorities_op = self._replay.tf_set_priority( self._replay.indices, tf.sqrt(loss + 1e-10)) target_priorities = self._replay.tf_get_priority(self._replay.indices) target_priorities = tf.math.add(target_priorities, 1e-10) target_priorities = 1.0 / tf.sqrt(target_priorities) target_priorities /= tf.reduce_max(target_priorities) weighted_loss = target_priorities * loss with tf.control_dependencies([update_priorities_op]): return optimizer.minimize(tf.reduce_mean(weighted_loss)), weighted_loss
def _build_train_op(self): """Builds the training op for Rainbow. Returns: train_op: An op performing one step of training. """ replay_action_one_hot = tf.one_hot(self._replay.actions, self.num_actions, 1., 0., name='action_one_hot') replay_chosen_q = tf.reduce_sum(self._replay_qs * replay_action_one_hot, reduction_indices=1, name='replay_chosen_q') target = tf.stop_gradient(self._build_target_q_op()) loss = tf.losses.huber_loss(target, replay_chosen_q, reduction=tf.losses.Reduction.NONE) update_priorities_op = self._replay.tf_set_priority( self._replay.indices, tf.sqrt(loss + 1e-10)) target_priorities = self._replay.tf_get_priority(self._replay.indices) target_priorities = tf.math.add(target_priorities, 1e-10) target_priorities = 1.0 / tf.sqrt(target_priorities) target_priorities /= tf.reduce_max(target_priorities) weighted_loss = target_priorities * loss with tf.control_dependencies([update_priorities_op]): return self.optimizer.minimize( tf.reduce_mean(weighted_loss)), weighted_loss
def historgram_loss(y, y_hat, k=100, sigma=1 / 2):
  raise NotImplementedError()
  # NOTE: everything below the raise is unreachable as written.
  ps = 0.
  w = 1 / k
  y = tf.squeeze(y, axis=2)
  # y_hat = tf.layers.flatten(y_hat)
  k = np.linspace(0., 1., k)  # np.linspace needs an integer count.
  s = (tf.erf((1. - y) / (tf.sqrt(2.) * sigma))
       - tf.erf((0. - y) / (tf.sqrt(2.) * sigma)))
  for idx, j in enumerate(k):
    u = tf.erf((j + w - y) / (tf.sqrt(2.) * sigma))
    l = tf.erf((j - y) / (tf.sqrt(2.) * sigma))
    p = (u - l) / (2 * s + 1e-6)
    f_x = tf.log(y_hat[:, :, idx])
    ps += p * tf.where(tf.is_nan(f_x), tf.zeros_like(f_x), f_x)
  return tf.reduce_mean(-ps)

def batched_euclidean_distance(y_hat, y, squared=True):
  """Pairwise (squared) Euclidean distances between two batches of points.

  y_hat: [batch, m, d], y: [batch, n, d] -> output: [batch, n, m].
  """
  assert y_hat.get_shape().ndims == 3 and y.get_shape().ndims == 3
  # ||y - y_hat||^2 = ||y||^2 + ||y_hat||^2 - 2 <y, y_hat>.
  a = tf.reduce_sum(tf.square(y), axis=2)[:, :, None]
  b = tf.reduce_sum(tf.square(y_hat), axis=2)[:, None, :]
  D = tf.matmul(y, y_hat, transpose_b=True)
  d = a + b - 2 * D
  # Clamp small negatives from floating-point error before the sqrt.
  return d if squared else tf.sqrt(tf.maximum(d, 0.))

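# Minimal usage sketch for batched_euclidean_distance (illustrative shapes;
# assumes eager execution or evaluation inside a session).
import numpy as np
import tensorflow as tf

y = tf.constant(np.random.rand(2, 5, 3), dtype=tf.float32)      # [batch, n, d]
y_hat = tf.constant(np.random.rand(2, 7, 3), dtype=tf.float32)  # [batch, m, d]
dists = batched_euclidean_distance(y_hat, y, squared=False)     # [batch, n, m]
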
def regr_metrics(y, y_hat):
  regr_ops = {
      'mse': mse(y, y_hat),
      'mae': mae(y, y_hat),
      'smape': smape(y, y_hat),
      'rmse': tf.sqrt(mse(y, y_hat)),
  }
  return regr_ops

def _reg(cls, batch_size, d, x, x_fake, beta=1e-1):
  # Gradient penalty evaluated on random interpolations between real samples
  # `x` and generated samples `x_fake`.
  alpha = tf.random_uniform(shape=[batch_size, 1], minval=0., maxval=1.)
  interpolates = alpha * x + (1 - alpha) * x_fake
  int_d = d(interpolates)
  gradients = tf.gradients(int_d, [interpolates])[0]
  slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=[1]))
  return beta * tf.reduce_mean((slopes - 1) ** 2)

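# Contextual note (not from the original snippet): this term has the form of a
# WGAN-GP style gradient penalty. The critic `d` is evaluated at random
# interpolations between real and generated samples, and the gradient norm at
# those points is pushed towards 1, scaled by `beta`.
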
def update(i, grad, state):
  i = tf.cast(i, dtype=tf.float32)
  x, m, v = state
  m = (1. - b1) * grad + b1 * m          # First moment estimate.
  v = (1. - b2) * (grad ** 2.) + b2 * v  # Second moment estimate.
  mhat = m / (1. - b1 ** (i + 1.))       # Bias correction.
  vhat = v / (1. - b2 ** (i + 1.))
  x = x - learning_rate * mhat / (tf.sqrt(vhat) + eps)
  return x, m, v

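# Hypothetical standalone sketch of a single Adam step with `update` above.
# The original closes over b1, b2, learning_rate and eps, so illustrative
# values are defined here; they are not taken from the source.
import tensorflow as tf

b1, b2, learning_rate, eps = 0.9, 0.999, 1e-3, 1e-8
x = tf.zeros([3])
state = (x, tf.zeros_like(x), tf.zeros_like(x))  # (params, first moment, second moment)
grad = tf.ones([3])
state = update(0, grad, state)  # each coordinate moves by roughly -learning_rate
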
def normalized_dist(states):
  inner = tf.multiply(states - starting_states, goals - starting_states)
  upper = tf.reduce_sum(inner, -1)
  sign = tf.sign(upper)
  result = sign * tf.square(
      tf.math.divide(upper, tf.norm(goals - starting_states, ord=2)))
  term_1 = tf.square(tf.norm(states - starting_states, 2))
  term_2 = tf.square(
      tf.math.divide(upper, tf.norm(goals - starting_states, ord=2)))
  return tf.sqrt(epsilon + tf.abs(result - alpha * (term_1 - term_2)))

def _variable_summaries(var):
  """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
  with tf.name_scope('summaries'):
    mean = tf.reduce_mean(var)
    tf.summary.scalar('mean', mean)
    with tf.name_scope('stddev'):
      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
    tf.summary.scalar('stddev', stddev)
    tf.summary.scalar('max', tf.reduce_max(var))
    tf.summary.scalar('min', tf.reduce_min(var))
    tf.summary.histogram('histogram', var)

def summarize_stats(stats):
  """Summarize a dictionary of variables.

  Args:
    stats: a dictionary of {name: tensor} to compute stats over.
  """
  for name, stat in stats.items():
    mean = tf.reduce_mean(stat)
    tf.summary.scalar('mean_%s' % name, mean)
    tf.summary.scalar('max_%s' % name, tf.reduce_max(stat))
    tf.summary.scalar('min_%s' % name, tf.reduce_min(stat))
    std = tf.sqrt(tf.reduce_mean(tf.square(stat)) - tf.square(mean) + 1e-10)
    tf.summary.scalar('std_%s' % name, std)
    tf.summary.histogram(name, stat)

def weight_standardization_replacements(model):
  """Weight-standardize non-output kernels of `model`."""
  if not isinstance(model, ReparameterizableBackbone):
    raise ValueError(
        '`model` must be an instance of `ReparameterizableBackbone`.')
  kernels = filter(lambda v: 'kernel' in v.name and 'output' not in v.name,
                   model.reparameterizables())
  replacements = []
  for v in kernels:
    # Wrap a standardization around the kernel.
    # Kernel has shape HWIO; normalize over HWI.
    mean, var = tf.nn.moments(v, axes=[0, 1, 2], keepdims=True)
    # Author code uses std + 1e-5.
    replacements.append((v.ref(), (v - mean) / tf.sqrt(var + 1e-10)))
  return dict(replacements)

def _do_data_dependent_init():
  """Returns ops for the data-dependent init of g and maybe b_fc."""
  w_fc_normalized = tf.nn.l2_normalize(w_fc.read_value(), [0])
  output_init = tf.matmul(embeddings, w_fc_normalized)
  mean_init, var_init = tf.nn.moments(output_init, [0])
  # Data-dependent init values.
  g_init_value = 1. / tf.sqrt(var_init + 1e-10)
  ops = [tf.assign(g, g_init_value)]
  if not cosine_classifier:
    # Also initialize a bias in a data-dependent way.
    b_fc_init_value = -mean_init * g_init_value
    ops.append(tf.assign(b_fc, b_fc_init_value))
  # Mark that the data-dependent initialization is done to prevent it from
  # happening again in the future.
  ops.append(tf.assign(data_dependent_init_done, 1))
  return tf.group(*ops)

def sample(self, mean, log_b2, training=False):
  """Samples z from Z ~ Laplacian(μ, b).

  With Y ~ N(0, 1) and V ~ Exponential(1) = Gamma(1, 1),
  z = μ + b * y * sqrt(2 * v) is Laplacian(μ, b) distributed.
  """
  if not training:
    return mean
  # Exponential is a special case of Gamma: Exponential(λ) = Gamma(1, λ).
  exponential = tf.random.gamma(tf.shape(mean), alpha=1, beta=1)
  gaussian = tf.random.normal(tf.shape(mean), mean=0.0, stddev=1.0)
  # exp(0.5 * log_b2) = b.
  return mean + tf.exp(0.5 * log_b2) * tf.sqrt(2 * exponential) * gaussian

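# Background for the sampler above (standard identity, stated as a reading aid
# rather than taken from the original source): if Y ~ N(0, 1) and
# V ~ Exponential(1), then μ + b * Y * sqrt(2 * V) ~ Laplace(μ, b), because a
# Laplace variable is a zero-mean Gaussian whose variance is exponentially
# distributed with mean 2 * b**2. Here b = exp(0.5 * log_b2).
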
def binary_indicator(states,
                     actions,
                     rewards,
                     next_states,
                     contexts,
                     termination_epsilon=1e-4,
                     offset=0,
                     epsilon=1e-10,
                     state_indices=None,
                     summarize=False):
  """Returns 0/1 by checking if next_states and contexts overlap.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    termination_epsilon: terminate if dist is less than this quantity.
    offset: Offset the rewards.
    epsilon: small offset to ensure non-negative/zero distance.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and tf.float32 [batch_size]
      discounts tensor.
  """
  del states, actions  # Unused args.
  next_states = index_states(next_states, state_indices)
  dist = tf.reduce_sum(tf.squared_difference(next_states, contexts[0]), -1)
  dist = tf.sqrt(dist + epsilon)
  discounts = dist > termination_epsilon
  rewards = tf.logical_not(discounts)
  rewards = tf.to_float(rewards) + offset
  return tf.to_float(rewards), tf.ones_like(tf.to_float(discounts))  # tf.to_float(discounts)

def _normalize_advantages(advantages, axes=(0,), variance_epsilon=1e-8):
  adv_mean, adv_var = tf.nn.moments(x=advantages, axes=axes, keepdims=True)
  normalized_advantages = (
      (advantages - adv_mean) / (tf.sqrt(adv_var) + variance_epsilon))
  return normalized_advantages

def diff_distance(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  state_scales=1.0,
                  goal_scales=1.0,
                  reward_scales=1.0,
                  weight_index=None,
                  weight_vector=None,
                  summarize=False,
                  termination_epsilon=1e-4,
                  state_indices=None,
                  goal_indices=None,
                  norm='L2',
                  epsilon=1e-10):
  """Returns the difference in euclidean distance between states/next_states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D
      tensor, must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    termination_epsilon: terminate if dist is less than this quantity.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    norm: L1 or L2.
    epsilon: small offset to ensure non-negative/zero distance.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and tf.float32 [batch_size]
      discounts tensor.
  """
  del actions, rewards  # Unused
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  next_states = index_states(next_states, state_indices)
  states = index_states(states, state_indices)
  goals = index_states(contexts[0], goal_indices)
  next_sq_dists = tf.squared_difference(next_states * state_scales,
                                        goals * goal_scales)
  sq_dists = tf.squared_difference(states * state_scales,
                                   goals * goal_scales)
  record_tensor(sq_dists, None, stats, 'sq_dists')
  if weight_vector is not None:
    next_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
    sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
  if weight_index is not None:
    next_sq_dists *= contexts[weight_index]
    sq_dists *= contexts[weight_index]
  if norm == 'L1':
    next_dist = tf.sqrt(next_sq_dists + epsilon)
    dist = tf.sqrt(sq_dists + epsilon)
    next_dist = tf.reduce_sum(next_dist, -1)
    dist = tf.reduce_sum(dist, -1)
  elif norm == 'L2':
    next_dist = tf.reduce_sum(next_sq_dists, -1)
    next_dist = tf.sqrt(next_dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0).
    dist = tf.reduce_sum(sq_dists, -1)
    dist = tf.sqrt(dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0).
  else:
    raise NotImplementedError(norm)
  discounts = next_dist > termination_epsilon
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
      tf.summary.histogram('dist', dist)
      summarize_stats(stats)
  diff = dist - next_dist
  diff *= reward_scales
  return tf.to_float(diff), tf.to_float(discounts)

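# Reading note (not part of the original code): the reward returned by
# diff_distance is dist(states, goals) - dist(next_states, goals), i.e. the
# per-step reduction in distance to the goal, scaled by reward_scales; the
# returned discount flags transitions that are still farther than
# termination_epsilon from the goal.
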
def projection_distance(states,
                        starting_states,
                        actions,
                        rewards,
                        next_states,
                        contexts,
                        alpha=0,
                        state_scales=1.0,
                        goal_scales=1.0,
                        reward_scales=1.0,
                        weight_index=None,
                        weight_vector=None,
                        summarize=False,
                        termination_epsilon=1e-4,
                        state_indices=None,
                        goal_indices=None,
                        vectorize=False,
                        relative_context=False,
                        diff=False,
                        norm='L2',
                        epsilon=1e-10,
                        bonus_epsilon=0.,  # 5.
                        offset=0.0):
  """Returns a projection-based distance between next_states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
      of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D
      tensor, must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    termination_epsilon: terminate if dist is less than this quantity.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    vectorize: Return a vectorized form.
    norm: L1 or L2.
    epsilon: small offset to ensure non-negative/zero distance.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and tf.float32 [batch_size]
      discounts tensor.
  """
  del actions, rewards  # Unused
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  states = index_states(states, state_indices)
  starting_states = index_states(starting_states, state_indices)
  next_states = index_states(next_states, state_indices)
  goals = index_states(contexts[0], goal_indices)
  if relative_context:
    goals = states + goals
  sq_dists = tf.squared_difference(next_states * state_scales,
                                   goals * goal_scales)
  dist = tf.reduce_sum(sq_dists, -1)

  # def normalized_dist(states):
  #   dot_product = tf.matmul(states - starting_states,
  #                           tf.transpose(goals - starting_states))
  #   return goals - starting_states - dot_product

  def projection_dist(states):
    # Signed projection of (states - starting_states) onto the start-to-goal
    # direction, minus the norm of the displacement from the starting state.
    inner = tf.multiply(states - starting_states, goals - starting_states)
    upper = tf.reduce_sum(inner, -1)
    sign = tf.sign(upper)
    result = tf.math.divide(upper, tf.norm(goals - starting_states, ord=2))
    term_1 = tf.norm(states - starting_states, 2)
    return -1 * term_1 + result

  dist_s = projection_dist(states)
  dist_s = tf.sqrt(tf.square(dist_s) + epsilon)
  dist_ns = projection_dist(next_states)
  ret = dist_ns, tf.to_float(dist > termination_epsilon)
  return ret

def stability_loss(h, beta):
  if beta == 0.0:
    return 0.0
  else:
    l2 = tf.sqrt(tf.reduce_sum(tf.square(h), axis=-1))
    return beta * tf.reduce_mean(tf.square(l2[1:] - l2[:-1]))

def l2_norm(x, axis=2):
  squared = tf.reduce_sum(tf.square(x), axis=axis, keepdims=True)
  norm = tf.sqrt(tf.maximum(squared, 1e-6))
  return norm

def linear_classifier(embeddings, num_classes, cosine_classifier,
                      cosine_logits_multiplier, use_weight_norm, weight_decay):
  """Forward pass through a linear classifier, or possibly a cosine classifier.

  Args:
    embeddings: A Tensor of size [batch size, embedding dim].
    num_classes: An integer; the dimension of the classification.
    cosine_classifier: A bool. If true, a cosine classifier is used, which does
      not require a bias.
    cosine_logits_multiplier: A float. Only used if cosine_classifier is True,
      and multiplies the resulting logits.
    use_weight_norm: A bool. Whether weight norm was used. If so, then if using
      cosine classifier, normalize only the embeddings but not the weights.
    weight_decay: A float; the scalar multiple on the L2 regularization of the
      weight matrix.

  Returns:
    logits: A Tensor of size [batch size, num outputs].
  """
  embedding_dims = embeddings.get_shape().as_list()[-1]

  if use_weight_norm:
    # A variable to keep track of whether the initialization has already
    # happened.
    data_dependent_init_done = tf.get_variable(
        'data_dependent_init_done',
        initializer=0,
        dtype=tf.int32,
        trainable=False)

    w_fc = tf.get_variable(
        'w_fc', [embedding_dims, num_classes],
        initializer=tf.random_normal_initializer(0, 0.05),
        trainable=True)
    # This init is temporary as it needs to be done in a data-dependent way.
    # It will be overwritten during the first forward pass through this layer.
    g = tf.get_variable(
        'g',
        dtype=tf.float32,
        initializer=tf.ones([num_classes]),
        trainable=True)
    b_fc = None
    if not cosine_classifier:
      # Also initialize a bias.
      b_fc = tf.get_variable(
          'b_fc', initializer=tf.zeros([num_classes]), trainable=True)

    def _do_data_dependent_init():
      """Returns ops for the data-dependent init of g and maybe b_fc."""
      w_fc_normalized = tf.nn.l2_normalize(w_fc.read_value(), [0])
      output_init = tf.matmul(embeddings, w_fc_normalized)
      mean_init, var_init = tf.nn.moments(output_init, [0])
      # Data-dependent init values.
      g_init_value = 1. / tf.sqrt(var_init + 1e-10)
      ops = [tf.assign(g, g_init_value)]
      if not cosine_classifier:
        # Also initialize a bias in a data-dependent way.
        b_fc_init_value = -mean_init * g_init_value
        ops.append(tf.assign(b_fc, b_fc_init_value))
      # Mark that the data-dependent initialization is done to prevent it from
      # happening again in the future.
      ops.append(tf.assign(data_dependent_init_done, 1))
      return tf.group(*ops)

    # Possibly perform data-dependent init (if it hasn't been done already).
    init_op = tf.cond(
        tf.equal(data_dependent_init_done, 0), _do_data_dependent_init,
        tf.no_op)

    with tf.control_dependencies([init_op]):
      # Apply weight normalization.
      w_fc *= g / tf.sqrt(tf.reduce_sum(tf.square(w_fc), [0]))

      # Forward pass through the layer defined by w_fc and b_fc.
      logits = linear_classifier_forward_pass(embeddings, w_fc, b_fc,
                                              cosine_classifier,
                                              cosine_logits_multiplier, True)
  else:
    # No weight norm.
    w_fc = functional_backbones.weight_variable([embedding_dims, num_classes],
                                                weight_decay=weight_decay)
    b_fc = None
    if not cosine_classifier:
      # Also initialize a bias.
      b_fc = functional_backbones.bias_variable([num_classes])
    # Forward pass through the layer defined by w_fc and b_fc.
    logits = linear_classifier_forward_pass(embeddings, w_fc, b_fc,
                                            cosine_classifier,
                                            cosine_logits_multiplier, False)
  return logits

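# Minimal usage sketch for linear_classifier (TF1-style graph mode). The scope
# name, embedding size, and hyperparameter values are illustrative only, and
# the helpers linear_classifier_forward_pass / functional_backbones are assumed
# to be importable from the surrounding codebase.
import tensorflow as tf

embeddings = tf.placeholder(tf.float32, [None, 512])
with tf.variable_scope('fc', reuse=tf.AUTO_REUSE):
  logits = linear_classifier(
      embeddings,
      num_classes=5,
      cosine_classifier=True,
      cosine_logits_multiplier=10.0,
      use_weight_norm=True,
      weight_decay=1e-4)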