def parametrize(self, *, x, conditions):
    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    shape = (-1,) + self.action_spec.shape

    # Logit
    logit = self.logit.apply(x=x)
    if len(self.input_spec.shape) == 1:
        logit = tf.reshape(tensor=logit, shape=shape)

    # States value
    state_value = logit

    # Sigmoid for corresponding probability
    probability = tf.sigmoid(x=logit)

    # Clip probability for numerical stability
    probability = tf.clip_by_value(
        t=probability, clip_value_min=epsilon, clip_value_max=(one - epsilon)
    )

    # "Normalized" logits
    true_logit = tf.math.log(x=probability)
    false_logit = tf.math.log(x=(one - probability))

    return TensorDict(
        true_logit=true_logit, false_logit=false_logit, probability=probability,
        state_value=state_value
    )
def mode(self, *, parameters):
    alpha, beta, alpha_beta = parameters.get(('alpha', 'beta', 'alpha_beta'))

    # Distribution parameter tracking
    def fn_tracking():
        return tf.math.reduce_mean(input_tensor=alpha, axis=0)

    dependencies = self.track(label='distribution', name='alpha', data=fn_tracking)

    def fn_tracking():
        return tf.math.reduce_mean(input_tensor=beta, axis=0)

    dependencies.extend(self.track(label='distribution', name='beta', data=fn_tracking))

    with tf.control_dependencies(control_inputs=dependencies):
        action = beta / alpha_beta
        min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
        max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
        return min_value + (max_value - min_value) * action
def start(self, *, arguments, x_init, base_value, zero_value, estimated):
    """
    Initialization step preparing the arguments for the first iteration of the loop body.

    Args:
        x_init: Initial solution guess $x_0$.
        base_value: Value $f(x')$ at $x = x'$.
        zero_value: Value $f(x_0)$ at $x = x_0$.
        estimated: Estimated value at $x = x_0$.

    Returns:
        Initial arguments for step.
    """
    dependencies = list()
    if self.config.create_tf_assertions:
        zero_float = tf_util.constant(value=0.0, dtype='float')
        dependencies.append(tf.debugging.assert_greater_equal(x=estimated, y=zero_float))

    with tf.control_dependencies(control_inputs=dependencies):
        zeros_x = x_init.fmap(function=tf.zeros_like)
        improvement = zero_value - base_value
        last_improvement = tf_util.constant(value=-1.0, dtype='float')

        return arguments, zeros_x, x_init, improvement, last_improvement, base_value, estimated
def retrieve_timesteps(self, *, n, past_horizon, future_horizon):
    one = tf_util.constant(value=1, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')

    # Check whether memory contains at least one valid timestep
    num_timesteps = tf.math.minimum(x=self.buffer_index, y=capacity)
    num_timesteps -= (past_horizon + future_horizon)
    num_timesteps = tf.math.maximum(x=num_timesteps, y=self.episode_count)

    # Check whether memory contains at least one timestep
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.append(tf.debugging.assert_greater_equal(x=num_timesteps, y=one))

    # Randomly sampled timestep indices
    with tf.control_dependencies(control_inputs=assertions):
        n = tf.math.minimum(x=n, y=num_timesteps)
        indices = tf.random.uniform(
            shape=(n,), maxval=num_timesteps, dtype=tf_util.get_dtype(type='int')
        )
        indices = tf.math.mod(x=(self.buffer_index - one - indices - future_horizon), y=capacity)

    return indices
def retrieve_episodes(self, *, n):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')

    # Check whether memory contains at least one episode
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.append(tf.debugging.assert_greater_equal(x=self.episode_count, y=one))

    # Get start and limit indices for randomly sampled n episodes
    with tf.control_dependencies(control_inputs=assertions):
        n = tf.math.minimum(x=n, y=self.episode_count)
        random_indices = tf.random.uniform(
            shape=(n,), maxval=self.episode_count, dtype=tf_util.get_dtype(type='int')
        )

        # (Increment terminal of previous episode)
        starts = tf.gather(params=self.terminal_indices, indices=random_indices) + one
        limits = tf.gather(params=self.terminal_indices, indices=(random_indices + one)) + one

        # Correct limit index if smaller than start index
        limits = limits + tf.where(condition=(limits < starts), x=capacity, y=zero)

        # Random episode indices ranges
        indices = tf.ragged.range(starts=starts, limits=limits).values
        indices = tf.math.mod(x=indices, y=capacity)

    return indices
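# Illustrative aside (not part of the memory module above): episode index ranges are built by
# letting `limits` run past `capacity` and folding the range back with a modulo, so episodes
# that wrap around the end of the circular buffer still yield contiguous indices. A minimal
# NumPy sketch of that wrap-around, with made-up numbers:
import numpy as np

capacity = 8
start, limit = 6, 6 + 4  # episode of length 4 that wraps past the buffer end
indices = np.arange(start, limit) % capacity
print(indices)  # [6 7 0 1]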
def sample(self, *, parameters, temperature):
    logits, probabilities, action_values = parameters.get(
        ('logits', 'probabilities', 'action_values')
    )

    # Distribution parameter summaries
    def fn_summary():
        axis = range(self.action_spec.rank + 1)
        probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis)
        return [probs[n] for n in range(self.action_spec.num_values)]

    prefix = 'distributions/' + self.name + '-probability'
    names = [prefix + str(n) for n in range(self.action_spec.num_values)]
    dependencies = self.summary(
        label='distribution', name=names, data=fn_summary, step='timesteps'
    )

    # Entropy summary
    def fn_summary():
        entropy = -tf.reduce_sum(input_tensor=(probabilities * logits), axis=-1)
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps')
    )

    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Deterministic: maximum likelihood action
    definite = tf.argmax(input=action_values, axis=-1)
    definite = tf_util.cast(x=definite, dtype='int')

    # Set logits to minimal value
    min_float = tf.fill(dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min)
    logits = logits / temperature
    logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=logits)

    # Non-deterministic: sample action using Gumbel distribution
    uniform_distribution = tf.random.uniform(
        shape=tf.shape(input=logits), minval=epsilon, maxval=(one - epsilon),
        dtype=tf_util.get_dtype(type='float')
    )
    gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution))
    sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1)
    sampled = tf_util.cast(x=sampled, dtype='int')

    with tf.control_dependencies(control_inputs=dependencies):
        return tf.where(condition=(temperature < epsilon), x=definite, y=sampled)
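# Illustrative aside (not part of the distribution class above): sample() relies on the
# Gumbel-max trick, i.e. adding independent Gumbel(0, 1) noise to the (temperature-scaled)
# logits and taking the argmax draws an index with probability softmax(logits). A minimal
# self-contained NumPy sketch of that equivalence; names here are illustrative only:
import numpy as np

def gumbel_max_sample(logits, rng):
    # Gumbel(0, 1) noise via the inverse-CDF transform -log(-log(U))
    uniform = rng.uniform(low=1e-20, high=1.0, size=logits.shape)
    gumbel = -np.log(-np.log(uniform))
    return int(np.argmax(logits + gumbel))

rng = np.random.default_rng(0)
logits = np.array([2.0, 0.5, -1.0])
counts = np.bincount([gumbel_max_sample(logits, rng) for _ in range(100_000)], minlength=3)
softmax = np.exp(logits) / np.exp(logits).sum()
print(counts / counts.sum(), softmax)  # empirical frequencies approximate the softmax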
def parametrize(self, *, x, conditions):
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    shape = (-1,) + self.action_spec.shape

    # Mean
    mean = self.mean.apply(x=x)
    if len(self.input_spec.shape) == 1:
        mean = tf.reshape(tensor=mean, shape=shape)

    # Log standard deviation
    if self.global_stddev:
        multiples = (tf.shape(input=x)[0],) + tuple(1 for _ in range(self.action_spec.rank))
        log_stddev = tf.tile(input=self.log_stddev, multiples=multiples)
    else:
        log_stddev = self.log_stddev.apply(x=x)
        if len(self.input_spec.shape) == 1:
            log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

    # Shift log stddev to reduce zero value (TODO: 0.1 random choice)
    if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
        log_stddev += tf_util.constant(value=np.log(0.1), dtype='float')

    # Clip log_stddev for numerical stability (epsilon < 1.0, hence negative)
    log_stddev = tf.clip_by_value(
        t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
    )

    # Standard deviation
    stddev = tf.math.exp(x=log_stddev)

    return TensorDict(mean=mean, stddev=stddev, log_stddev=log_stddev)
def parametrize(self, *, x, conditions):
    # Softplus to ensure alpha and beta >= 1
    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    shape = (-1,) + self.action_spec.shape

    # Alpha
    alpha = self.alpha.apply(x=x)
    # epsilon < 1.0, hence negative
    alpha = tf.clip_by_value(t=alpha, clip_value_min=log_epsilon, clip_value_max=-log_epsilon)
    alpha = tf.math.softplus(features=alpha) + one
    if len(self.input_spec.shape) == 1:
        alpha = tf.reshape(tensor=alpha, shape=shape)

    # Beta
    beta = self.beta.apply(x=x)
    # epsilon < 1.0, hence negative
    beta = tf.clip_by_value(t=beta, clip_value_min=log_epsilon, clip_value_max=-log_epsilon)
    beta = tf.math.softplus(features=beta) + one
    if len(self.input_spec.shape) == 1:
        beta = tf.reshape(tensor=beta, shape=shape)

    # Alpha + Beta
    alpha_beta = tf.maximum(x=(alpha + beta), y=epsilon)

    # Log norm
    log_norm = tf.math.lgamma(x=alpha) + tf.math.lgamma(x=beta) - tf.math.lgamma(x=alpha_beta)

    return TensorDict(alpha=alpha, beta=beta, alpha_beta=alpha_beta, log_norm=log_norm)
def next_step(
    self, *, arguments, x, deltas, improvement, last_improvement, base_value, estimated
):
    """
    Termination condition: max number of iterations, or no improvement for last step, or
    improvement less than acceptable ratio, or estimated value not positive.

    Args:
        x: Current solution estimate $x_{t-1}$.
        deltas: Current difference $x_t - x_{t-1}$.
        improvement: Current improvement $(f(x_t) - f(x'))$.
        last_improvement: Last improvement $(f(x_{t-1}) - f(x'))$.
        base_value: Value $f(x')$ at $x = x'$.
        estimated: Current estimated value at $x_t$.

    Returns:
        True if another iteration should be performed.
    """
    # Continue while current step is an improvement over last step
    zero_float = tf_util.constant(value=0.0, dtype='float')
    last_improvement = tf.math.maximum(x=last_improvement, y=zero_float)
    next_step = (improvement >= last_improvement)

    # Continue while estimated improvement is positive
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    next_step = tf.math.logical_and(x=next_step, y=(estimated > epsilon))

    # Continue while improvement ratio is below accept ratio, so not yet sufficient
    accept_ratio = self.accept_ratio.value()
    improvement_ratio = improvement / tf.math.maximum(x=estimated, y=epsilon)

    return tf.math.logical_and(x=next_step, y=(improvement_ratio < accept_ratio))
def retrieve_episodes(self, *, n):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')

    # Check whether memory contains at least one episode
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.append(tf.debugging.assert_greater_equal(x=self.episode_count, y=one))

    # Get start and limit index for most recent n episodes
    with tf.control_dependencies(control_inputs=assertions):
        n = tf.math.minimum(x=n, y=self.episode_count)

        # (Increment terminal of previous episode)
        start = self.terminal_indices[self.episode_count - n] + one
        limit = self.terminal_indices[self.episode_count] + one

        # Correct limit index if smaller than start index
        limit = limit + tf.where(condition=(limit < start), x=capacity, y=zero)

        # Most recent episode indices range
        indices = tf.range(start=start, limit=limit)
        indices = tf.math.mod(x=indices, y=capacity)

    return indices
def mode(self, *, parameters):
    beta, alpha_beta = parameters.get(('beta', 'alpha_beta'))

    action = beta / alpha_beta

    min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
    max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')

    return min_value + (max_value - min_value) * action
def action_value(self, *, parameters, action):
    mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev'))

    # Inverse bounded transformation
    if self.bounded_transform is not None:
        if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
            one = tf_util.constant(value=1.0, dtype='float')
            two = tf_util.constant(value=2.0, dtype='float')
            min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
            max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
            action = two * (action - min_value) / (max_value - min_value) - one

        if self.bounded_transform == 'tanh':
            clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float')
            action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip)
            action = tf.math.atanh(x=action)

    half = tf_util.constant(value=0.5, dtype='float')
    two = tf_util.constant(value=2.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    log_two_pi = tf_util.constant(value=(np.log(2.0 * np.pi)), dtype='float')

    # TODO: why no e here, but for entropy?
    sq_mean_distance = tf.square(x=(action - mean))
    sq_stddev = tf.maximum(x=tf.square(x=stddev), y=epsilon)

    action_value = -half * sq_mean_distance / sq_stddev - two * log_stddev - log_two_pi

    # Probably not needed?
    # if self.bounded_transform == 'tanh':
    #     log_two = tf_util.constant(value=np.log(2.0), dtype='float')
    #     action_value -= two * (log_two - action - tf.math.softplus(features=(-two * action)))

    return action_value
def sample(self, *, parameters, temperature):
    alpha, beta, alpha_beta, log_norm = parameters.get(
        ('alpha', 'beta', 'alpha_beta', 'log_norm')
    )

    # Distribution parameter summaries
    def fn_summary():
        return tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)), \
            tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1))

    prefix = 'distributions/' + self.name
    names = (prefix + '-alpha', prefix + '-beta')
    dependencies = self.summary(
        label='distribution', name=names, data=fn_summary, step='timesteps'
    )

    # Distribution parameter tracking
    def fn_tracking():
        return tf.math.reduce_mean(input_tensor=alpha, axis=0)

    dependencies.extend(self.track(label='distribution', name='alpha', data=fn_tracking))

    def fn_tracking():
        return tf.math.reduce_mean(input_tensor=beta, axis=0)

    dependencies.extend(self.track(label='distribution', name='beta', data=fn_tracking))

    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    def fn_mode():
        # Deterministic: mean as action
        return beta / alpha_beta

    def fn_sample():
        # Non-deterministic: sample action using gamma distribution
        alpha_sample = tf.random.gamma(shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
        beta_sample = tf.random.gamma(shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))
        return beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=epsilon)

    action = tf.cond(pred=(temperature < epsilon), true_fn=fn_mode, false_fn=fn_sample)

    min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
    max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')

    with tf.control_dependencies(control_inputs=dependencies):
        return min_value + (max_value - min_value) * action
def log_probability(self, *, parameters, action):
    mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev'))

    # Inverse bounded transformation
    if self.bounded_transform is not None:
        if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
            one = tf_util.constant(value=1.0, dtype='float')
            two = tf_util.constant(value=2.0, dtype='float')
            min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
            max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
            action = two * (action - min_value) / (max_value - min_value) - one

        if self.bounded_transform == 'tanh':
            clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float')
            action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip)
            action = tf_util.cast(x=tf.math.atanh(x=tf_util.float32(x=action)), dtype='float')

    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    half = tf_util.constant(value=0.5, dtype='float')
    half_log_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float')

    sq_mean_distance = tf.square(x=(action - mean))
    sq_stddev = tf.maximum(x=tf.square(x=stddev), y=epsilon)

    log_prob = -half * sq_mean_distance / sq_stddev - log_stddev - half_log_two_pi

    if self.bounded_transform == 'tanh':
        log_two = tf_util.constant(value=np.log(2.0), dtype='float')
        log_prob -= two * (log_two - action - tf.math.softplus(features=(-two * action)))

    return log_prob
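# Illustrative aside (not part of the distribution class above): the term subtracted from
# log_prob for the tanh transform is the change-of-variables correction log(1 - tanh(u)^2),
# written in the numerically stable form 2 * (log 2 - u - softplus(-2u)). A minimal NumPy
# check of that identity:
import numpy as np

def softplus(z):
    return np.logaddexp(0.0, z)

u = np.linspace(-5.0, 5.0, 11)
direct = np.log(1.0 - np.tanh(u) ** 2)
stable = 2.0 * (np.log(2.0) - u - softplus(-2.0 * u))
print(np.allclose(direct, stable))  # True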
def reset(self):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    three = tf_util.constant(value=3, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')
    last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity)

    def correct_terminal():
        # Replace last observation terminal marker with abort terminal
        dependencies = list()
        two = tf_util.constant(value=2, dtype='int')
        sparse_delta = tf.IndexedSlices(values=two, indices=last_index)
        dependencies.append(self.buffers['terminal'].scatter_update(sparse_delta=sparse_delta))
        sparse_delta = tf.IndexedSlices(values=last_index, indices=(self.episode_count + one))
        dependencies.append(self.terminal_indices.scatter_update(sparse_delta=sparse_delta))
        with tf.control_dependencies(control_inputs=dependencies):
            return self.episode_count.assign_add(delta=one, read_value=False)

    last_terminal = tf.gather(params=self.buffers['terminal'], indices=last_index)
    is_incorrect = tf.math.equal(x=last_terminal, y=three)
    corrected = tf.cond(pred=is_incorrect, true_fn=correct_terminal, false_fn=tf.no_op)

    with tf.control_dependencies(control_inputs=(corrected,)):
        assertions = [corrected]
        if self.config.create_tf_assertions:
            # general check: all terminal indices true
            assertions.append(tf.debugging.assert_equal(
                x=tf.reduce_all(input_tensor=tf.gather(
                    params=tf.math.greater(x=self.buffers['terminal'], y=zero),
                    indices=self.terminal_indices[:self.episode_count + one]
                )),
                y=tf_util.constant(value=True, dtype='bool'),
                message="Memory consistency check."
            ))
            # general check: only terminal indices true
            assertions.append(tf.debugging.assert_equal(
                x=tf.math.count_nonzero(
                    input=self.buffers['terminal'], dtype=tf_util.get_dtype(type='int')
                ),
                y=(self.episode_count + one),
                message="Memory consistency check."
            ))

    with tf.control_dependencies(control_inputs=assertions):
        return one < zero
def apply(self, *, x):
    is_inf = np.logical_or(np.isinf(self.min_value), np.isinf(self.max_value))
    is_inf = tf_util.constant(value=is_inf, dtype='bool')
    min_value = tf_util.constant(value=self.min_value, dtype='float')
    max_value = tf_util.constant(value=self.max_value, dtype='float')

    return tf.where(
        condition=is_inf, x=x, y=(4.0 * (x - min_value) / (max_value - min_value) - 2.0)
    )
def sample(self, *, parameters, temperature):
    alpha, beta, alpha_beta, log_norm = parameters.get(
        ('alpha', 'beta', 'alpha_beta', 'log_norm')
    )

    # Distribution parameter summaries
    def fn_summary():
        return tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)), \
            tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1))

    prefix = 'distributions/' + self.name
    dependencies = self.summary(
        label='distribution', name=(prefix + '-alpha', prefix + '-beta'), data=fn_summary,
        step='timesteps'
    )

    # Entropy summary
    def fn_summary():
        one = tf_util.constant(value=1.0, dtype='float')
        digamma_alpha = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float')
        digamma_beta = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
        digamma_alpha_beta = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float'
        )
        entropy = log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
            (alpha_beta - one - one) * digamma_alpha_beta
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps')
    )

    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Deterministic: mean as action
    definite = beta / alpha_beta

    # Non-deterministic: sample action using gamma distribution
    alpha_sample = tf.random.gamma(shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
    beta_sample = tf.random.gamma(shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))
    sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=epsilon)

    action = tf.where(condition=(temperature < epsilon), x=definite, y=sampled)

    min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
    max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')

    with tf.control_dependencies(control_inputs=dependencies):
        return min_value + (max_value - min_value) * action
def apply(self, *, x, independent):
    dependencies = list()

    if independent:
        mean = self.moving_mean
        variance = self.moving_variance

    else:
        one = tf_util.constant(value=1.0, dtype='float')
        axes = (0,) + tuple(1 + axis for axis in self.axes)

        decay = self.decay.value()
        batch_size = tf_util.cast(x=tf.shape(input=x)[0], dtype='float')
        decay = tf.math.pow(x=decay, y=batch_size)
        condition = tf.math.logical_or(x=self.after_first_call, y=tf.math.equal(x=batch_size, y=0))

        mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True)
        mean = tf.where(
            condition=condition, x=(decay * self.moving_mean + (one - decay) * mean), y=mean
        )

        variance = tf.reduce_mean(
            input_tensor=tf.math.squared_difference(x=x, y=mean), axis=axes, keepdims=True
        )
        variance = tf.where(
            condition=condition, x=(decay * self.moving_variance + (one - decay) * variance),
            y=variance
        )

        with tf.control_dependencies(control_inputs=(mean, variance)):
            value = tf.math.logical_or(x=self.after_first_call, y=(batch_size > 0))
            dependencies.append(self.after_first_call.assign(value=value, read_value=False))
            mean = self.moving_mean.assign(value=mean)
            variance = self.moving_variance.assign(value=variance)

    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=epsilon))

    with tf.control_dependencies(control_inputs=dependencies):
        x = (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(input=reciprocal_stddev)

    return x
def kl_divergence(self, *, parameters1, parameters2):
    mean1, stddev1, log_stddev1 = parameters1.get(('mean', 'stddev', 'log_stddev'))
    mean2, stddev2, log_stddev2 = parameters2.get(('mean', 'stddev', 'log_stddev'))

    half = tf_util.constant(value=0.5, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    log_stddev_ratio = log_stddev2 - log_stddev1
    sq_mean_distance = tf.square(x=(mean1 - mean2))
    sq_stddev1 = tf.square(x=stddev1)
    sq_stddev2 = tf.maximum(x=tf.square(x=stddev2), y=epsilon)

    return log_stddev_ratio + half * (sq_stddev1 + sq_mean_distance) / sq_stddev2 - half
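# Illustrative aside (not part of the distribution class above): kl_divergence() implements the
# closed form KL(N(m1, s1^2) || N(m2, s2^2)) = log(s2 / s1) + (s1^2 + (m1 - m2)^2) / (2 s2^2) - 1/2.
# A minimal NumPy sanity check against a Monte Carlo estimate, with made-up parameters:
import numpy as np

def gaussian_kl(m1, s1, m2, s2):
    return np.log(s2 / s1) + (s1 ** 2 + (m1 - m2) ** 2) / (2.0 * s2 ** 2) - 0.5

rng = np.random.default_rng(0)
m1, s1, m2, s2 = 0.3, 0.8, -0.2, 1.5
x = rng.normal(loc=m1, scale=s1, size=1_000_000)
log_p = -0.5 * ((x - m1) / s1) ** 2 - np.log(s1) - 0.5 * np.log(2.0 * np.pi)
log_q = -0.5 * ((x - m2) / s2) ** 2 - np.log(s2) - 0.5 * np.log(2.0 * np.pi)
print(gaussian_kl(m1, s1, m2, s2), np.mean(log_p - log_q))  # should agree to ~2-3 decimals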
def loss(self, *, states, horizons, internals, auxiliaries, actions, reward, policy, reference):
    if not self.early_reduce:
        reward = tf.expand_dims(input=reward, axis=1)

    if self.value == 'state':
        value = policy.states_value(
            states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries,
            reduced=self.early_reduce, return_per_action=False
        )
    elif self.value == 'action':
        value = policy.actions_value(
            states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries,
            actions=actions, reduced=self.early_reduce, return_per_action=False
        )

    difference = value - reward

    zero = tf_util.constant(value=0.0, dtype='float')
    half = tf_util.constant(value=0.5, dtype='float')

    huber_loss = self.huber_loss.value()
    skip_huber_loss = tf.math.equal(x=huber_loss, y=zero)

    def no_huber_loss():
        return half * tf.math.square(x=difference)

    def apply_huber_loss():
        inside_huber_bounds = tf.math.less_equal(x=tf.math.abs(x=difference), y=huber_loss)
        quadratic = half * tf.math.square(x=difference)
        linear = huber_loss * (tf.math.abs(x=difference) - half * huber_loss)
        return tf.where(condition=inside_huber_bounds, x=quadratic, y=linear)

    loss = tf.cond(pred=skip_huber_loss, true_fn=no_huber_loss, false_fn=apply_huber_loss)

    if not self.early_reduce:
        loss = tf.math.reduce_mean(input_tensor=loss, axis=1)

    return loss
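# Illustrative aside (not part of the objective above): apply_huber_loss() uses the standard
# piecewise form, 0.5 * d^2 while |d| <= delta and delta * (|d| - 0.5 * delta) beyond it.
# A minimal NumPy sketch with made-up values:
import numpy as np

def huber(difference, delta):
    abs_diff = np.abs(difference)
    quadratic = 0.5 * np.square(difference)
    linear = delta * (abs_diff - 0.5 * delta)
    return np.where(abs_diff <= delta, quadratic, linear)

print(huber(np.array([0.1, 1.0, 3.0]), delta=1.0))  # [0.005 0.5 2.5]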
def log_probability(self, *, parameters, action):
    alpha, beta, log_norm = parameters.get(('alpha', 'beta', 'log_norm'))

    min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
    max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
    action = (action - min_value) / (max_value - min_value)

    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    return tf.math.xlogy(x=(beta - one), y=(action + epsilon)) + \
        (alpha - one) * tf.math.log1p(x=(-action + epsilon)) - log_norm
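# Illustrative aside (not part of the distribution class above): on the action rescaled to
# [0, 1], this log_probability() matches the log-density of a Beta distribution whose first
# shape parameter is `beta` and second is `alpha`, consistent with the gamma-ratio sampling
# beta_sample / (alpha_sample + beta_sample) used elsewhere in this section. A minimal SciPy
# cross-check, ignoring the epsilon terms:
import numpy as np
from scipy import stats
from scipy.special import betaln

alpha, beta, x = 2.5, 4.0, 0.3
log_norm = betaln(alpha, beta)  # lgamma(alpha) + lgamma(beta) - lgamma(alpha + beta)
log_prob = (beta - 1.0) * np.log(x) + (alpha - 1.0) * np.log1p(-x) - log_norm
print(log_prob, stats.beta(a=beta, b=alpha).logpdf(x))  # the two values should agree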
def parametrize(self, *, x, conditions):
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    shape = (-1,) + self.action_spec.shape

    # Mean
    mean = self.mean.apply(x=x)
    if len(self.input_spec.shape) == 1:
        mean = tf.reshape(tensor=mean, shape=shape)

    # Log standard deviation
    if self.global_stddev:
        log_stddev = self.log_stddev
    else:
        log_stddev = self.log_stddev.apply(x=x)
        if len(self.input_spec.shape) == 1:
            log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

    # Clip log_stddev for numerical stability (epsilon < 1.0, hence negative)
    log_stddev = tf.clip_by_value(
        t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
    )

    # Standard deviation
    stddev = tf.exp(x=log_stddev)

    return TensorDict(mean=mean, stddev=stddev, log_stddev=log_stddev)
def mode(self, *, parameters, independent):
    probability = parameters['probability']

    # Distribution parameter summaries
    dependencies = list()
    if not independent:
        def fn_summary():
            axis = range(self.action_spec.rank + 1)
            return tf.math.reduce_mean(input_tensor=probability, axis=axis)

        name = 'distributions/' + self.name + '-probability'
        dependencies.extend(
            self.summary(label='distribution', name=name, data=fn_summary, step='timesteps')
        )

    # Distribution parameter tracking
    def fn_tracking():
        return tf.math.reduce_mean(input_tensor=probability, axis=0)

    dependencies.extend(self.track(label='distribution', name='probability', data=fn_tracking))

    with tf.control_dependencies(control_inputs=dependencies):
        return tf.greater_equal(x=probability, y=tf_util.constant(value=0.5, dtype='float'))
def sample(self, *, states, horizons, internals, auxiliaries, temperature, independent):
    deterministic = tf_util.constant(value=False, dtype='bool')
    embedding, internals = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=independent
    )

    def function(name, distribution, temp):
        conditions = auxiliaries.get(name, default=TensorDict())
        parameters = distribution.parametrize(x=embedding, conditions=conditions)
        return distribution.sample(parameters=parameters, temperature=temp)

    if isinstance(self.temperature, dict):
        actions = self.distributions.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=(temperature,)
        )
    else:
        actions = self.distributions.fmap(
            function=partial(function, temp=temperature), cls=TensorDict, with_names=True
        )

    return actions, internals
def fn_terminal():
    operations = list()

    # Reset internals
    def function(spec, initial):
        return tf_util.constant(value=initial, dtype=spec.type)

    initials = self.internals_spec.fmap(
        function=function, cls=TensorDict, zip_values=self.internals_init
    )
    for name, previous, initial in self.previous_internals.zip_items(initials):
        sparse_delta = tf.IndexedSlices(values=initial, indices=parallel)
        operations.append(previous.scatter_update(sparse_delta=sparse_delta))

    # Episode reward summaries (before episode reward reset / episodes increment)
    if self.summary_labels == 'all' or 'reward' in self.summary_labels:
        with self.summarizer.as_default():
            x = tf.gather(params=self.episode_reward, indices=parallel)
            tf.summary.scalar(name='episode-reward', data=x, step=self.episodes)

    # Reset episode reward
    zero_float = tf_util.constant(value=0.0, dtype='float')
    sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel)
    operations.append(self.episode_reward.scatter_update(sparse_delta=sparse_delta))

    # Increment episodes counter
    operations.append(self.episodes.assign_add(delta=one, read_value=False))

    return tf.group(*operations)
def regularize(self):
    zero = tf_util.constant(value=0.0, dtype='float')

    module = self
    while module.l2_regularization is None:
        module = module.parent

    if len(self.this_trainable_variables) == 0 or module.l2_regularization.is_constant(value=0.0):
        regularization_loss = zero

    else:
        l2_regularization = module.l2_regularization.value()

        def no_l2_regularization():
            return zero

        def apply_l2_regularization():
            l2_variables = list()
            for variable in self.this_trainable_variables:
                variable = tf_util.cast(x=variable, dtype='float')
                l2_variables.append(tf.reduce_sum(input_tensor=tf.square(x=variable)))
            return l2_regularization * tf.math.add_n(inputs=l2_variables)

        skip_l2_regularization = tf.math.equal(x=l2_regularization, y=zero)
        regularization_loss = tf.cond(
            pred=skip_l2_regularization, true_fn=no_l2_regularization,
            false_fn=apply_l2_regularization
        )

    for module in self.this_submodules:
        if isinstance(module, Module) and module.is_trainable:
            regularization_loss += module.regularize()

    return regularization_loss
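# Illustrative aside (not part of the module above): the quantity built in
# apply_l2_regularization() is coefficient * sum over this module's trainable variables of the
# summed squared entries. A minimal NumPy sketch with made-up weights:
import numpy as np

l2_coefficient = 0.01
variables = [np.array([[1.0, -2.0], [0.5, 0.0]]), np.array([3.0, -1.0])]
l2_loss = l2_coefficient * sum(np.sum(np.square(v)) for v in variables)
print(l2_loss)  # 0.01 * (5.25 + 10.0) = 0.1525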
def entropy(self, *, parameters):
    true_logit, false_logit, probability = parameters.get(
        ('true_logit', 'false_logit', 'probability')
    )

    one = tf_util.constant(value=1.0, dtype='float')

    return -probability * true_logit - (one - probability) * false_logit
def state_value(self, *, parameters):
    log_stddev = parameters['log_stddev']

    half_log_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float')

    # TODO: why no e here, but for entropy?
    return -log_stddev - half_log_two_pi
def action_values(self, *, states, horizons, internals, auxiliaries, actions):
    deterministic = tf_util.constant(value=True, dtype='bool')
    embedding, _ = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=True
    )
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    def function(name, distribution, action):
        if name is None:
            x = embedding.get('action-embedding', embedding['embedding'])
        else:
            x = embedding.get(name + '-embedding', embedding['embedding'])
        conditions = auxiliaries.get(name, default=TensorDict())
        parameters = distribution.parametrize(x=x, conditions=conditions)
        return distribution.action_value(parameters=parameters, action=action)

    return self.distributions.fmap(
        function=function, cls=TensorDict, with_names=True, zip_values=actions
    )
def sample(self, *, parameters, temperature):
    true_logit, false_logit, probability = parameters.get(
        ('true_logit', 'false_logit', 'probability')
    )

    # Distribution parameter summaries
    def fn_summary():
        axis = range(self.action_spec.rank + 1)
        return tf.math.reduce_mean(input_tensor=probability, axis=axis)

    name = 'distributions/' + self.name + '-probability'
    dependencies = self.summary(label='distribution', name=name, data=fn_summary, step='timesteps')

    # Entropy summary
    def fn_summary():
        one = tf_util.constant(value=1.0, dtype='float')
        entropy = -probability * true_logit - (one - probability) * false_logit
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps')
    )

    half = tf_util.constant(value=0.5, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Deterministic: true if >= 0.5
    definite = tf.greater_equal(x=probability, y=half)

    # Non-deterministic: sample true if >= uniform distribution
    e_true_logit = tf.math.exp(x=(true_logit / temperature))
    e_false_logit = tf.math.exp(x=(false_logit / temperature))
    probability = e_true_logit / (e_true_logit + e_false_logit)
    uniform = tf.random.uniform(
        shape=tf.shape(input=probability), dtype=tf_util.get_dtype(type='float')
    )
    sampled = tf.greater_equal(x=probability, y=uniform)

    with tf.control_dependencies(control_inputs=dependencies):
        return tf.where(condition=(temperature < epsilon), x=definite, y=sampled)
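# Illustrative aside (not part of the distribution class above): with temperature T the sampled
# probability is e^(l_true/T) / (e^(l_true/T) + e^(l_false/T)) = sigmoid((l_true - l_false) / T),
# so lowering T sharpens the decision and T below epsilon falls back to the hard 0.5 threshold.
# A minimal NumPy sketch with made-up logits:
import numpy as np

def tempered_probability(true_logit, false_logit, temperature):
    exp_true = np.exp(true_logit / temperature)
    exp_false = np.exp(false_logit / temperature)
    return exp_true / (exp_true + exp_false)

p = 0.7
true_logit, false_logit = np.log(p), np.log(1.0 - p)
print(tempered_probability(true_logit, false_logit, 1.0))  # ~0.7
print(tempered_probability(true_logit, false_logit, 0.1))  # close to 1.0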