# Imports required by the classes below (assuming the usual TensorForce module layout).
from math import e, log, pi

import tensorflow as tf

from tensorforce import util
from tensorforce.core.baselines import Baseline
from tensorforce.core.distributions import Distribution
from tensorforce.core.networks import Linear, Network


class Categorical(Distribution):
    """
    Categorical distribution, for discrete actions.
    """

    def __init__(self, shape, num_actions, probabilities=None, scope='categorical', summary_labels=()):
        """
        Categorical distribution.

        Args:
            shape: Action shape.
            num_actions: Number of discrete action alternatives.
            probabilities: Optional distribution bias.
        """
        self.num_actions = num_actions

        action_size = util.prod(shape) * self.num_actions
        if probabilities is None:
            logits = 0.0
        else:
            logits = [log(prob) for _ in range(util.prod(shape)) for prob in probabilities]
        self.logits = Linear(size=action_size, bias=logits, scope='logits')

        super(Categorical, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)

    def tf_parameterize(self, x):
        # Flat logits
        logits = self.logits.apply(x=x)

        # Reshape logits to action shape
        shape = (-1,) + self.shape + (self.num_actions,)
        logits = tf.reshape(tensor=logits, shape=shape)

        # Implied state value: log-partition function (logsumexp over action logits)
        state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=logits, axis=-1)

        # Min epsilon probability for numerical stability
        probabilities = tf.maximum(x=probabilities, y=util.epsilon)

        # "Normalized" logits
        logits = tf.log(x=probabilities)

        return logits, probabilities, state_value

    def state_value(self, distr_params):
        _, _, state_value = distr_params
        return state_value

    def state_action_value(self, distr_params, action=None):
        logits, _, state_value = distr_params
        if action is None:
            state_value = tf.expand_dims(input=state_value, axis=-1)
        else:
            one_hot = tf.one_hot(indices=action, depth=self.num_actions)
            logits = tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1)
        return state_value + logits

    def tf_sample(self, distr_params, deterministic):
        logits, _, _ = distr_params

        # Deterministic: maximum-likelihood action
        definite = tf.argmax(input=logits, axis=-1, output_type=util.tf_dtype('int'))

        # Non-deterministic: sample action via the Gumbel-max trick
        uniform_distribution = tf.random_uniform(
            shape=tf.shape(input=logits),
            minval=util.epsilon,
            maxval=(1.0 - util.epsilon)
        )
        gumbel_distribution = -tf.log(x=-tf.log(x=uniform_distribution))
        sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1, output_type=util.tf_dtype('int'))

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        logits, _, _ = distr_params
        one_hot = tf.one_hot(indices=action, depth=self.num_actions)
        return tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1)

    def tf_entropy(self, distr_params):
        logits, probabilities, _ = distr_params
        return -tf.reduce_sum(input_tensor=(probabilities * logits), axis=-1)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        logits1, probabilities1, _ = distr_params1
        logits2, _, _ = distr_params2
        log_prob_ratio = logits1 - logits2
        return tf.reduce_sum(input_tensor=(probabilities1 * log_prob_ratio), axis=-1)

    def tf_regularization_loss(self):
        regularization_loss = super(Categorical, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.logits.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(Categorical, self).get_variables(include_nontrainable=include_nontrainable)
        logits_variables = self.logits.get_variables(include_nontrainable=include_nontrainable)
        return distribution_variables + logits_variables

    def get_summaries(self):
        distribution_summaries = super(Categorical, self).get_summaries()
        logits_summaries = self.logits.get_summaries()
        return distribution_summaries + logits_summaries
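
# A note on Categorical.tf_sample above: adding i.i.d. Gumbel noise to the logits
# and taking the argmax yields an exact sample from the softmax distribution
# (the Gumbel-max trick). A minimal standalone NumPy check, illustrative only and
# not part of the library (the name `gumbel_max_demo` is made up):

import numpy as np

def gumbel_max_demo(logits, n_samples=100000, seed=0):
    rng = np.random.default_rng(seed)
    # Gumbel noise via inverse transform of uniform samples, mirroring tf_sample
    uniform = rng.uniform(low=1e-6, high=1.0 - 1e-6, size=(n_samples, len(logits)))
    gumbel = -np.log(-np.log(uniform))
    samples = np.argmax(logits + gumbel, axis=-1)
    empirical = np.bincount(samples, minlength=len(logits)) / n_samples
    # Reference softmax probabilities
    softmax = np.exp(logits - logits.max())
    softmax /= softmax.sum()
    return empirical, softmax

# gumbel_max_demo(np.array([1.0, 0.0, -1.0])) returns two near-identical vectors,
# differing only by sampling noise.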
class Beta(Distribution):
    """
    Beta distribution, for bounded continuous actions.
    """

    def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()):
        """
        Beta distribution used for continuous actions. In particular, the Beta distribution
        allows action values to be bounded between given min and max values.

        Args:
            shape: Shape of actions.
            min_value: Min value of all actions for the given shape.
            max_value: Max value of all actions for the given shape.
            alpha: Concentration parameter of the Beta distribution.
            beta: Concentration parameter of the Beta distribution.
        """
        assert min_value is None or max_value > min_value
        self.shape = shape
        self.min_value = min_value
        self.max_value = max_value

        action_size = util.prod(self.shape)
        self.alpha = Linear(size=action_size, bias=alpha, scope='alpha')
        self.beta = Linear(size=action_size, bias=beta, scope='beta')

        super(Beta, self).__init__(scope, summary_labels)

    def tf_parameterize(self, x):
        # Softplus to ensure alpha and beta >= 1
        # epsilon < 1.0, hence log(epsilon) is negative
        log_eps = log(util.epsilon)

        alpha = self.alpha.apply(x=x)
        alpha = tf.clip_by_value(t=alpha, clip_value_min=log_eps, clip_value_max=-log_eps)
        alpha = tf.log(x=(tf.exp(x=alpha) + 1.0)) + 1.0

        beta = self.beta.apply(x=x)
        beta = tf.clip_by_value(t=beta, clip_value_min=log_eps, clip_value_max=-log_eps)
        beta = tf.log(x=(tf.exp(x=beta) + 1.0)) + 1.0

        # Reshape alpha and beta to action shape
        shape = (-1,) + self.shape
        alpha = tf.reshape(tensor=alpha, shape=shape)
        beta = tf.reshape(tensor=beta, shape=shape)

        alpha_beta = tf.maximum(x=(alpha + beta), y=util.epsilon)
        # Log of the Beta function, the distribution's normalization constant
        log_norm = tf.lgamma(x=alpha) + tf.lgamma(x=beta) - tf.lgamma(x=alpha_beta)

        return alpha, beta, alpha_beta, log_norm

    def tf_sample(self, distr_params, deterministic):
        alpha, beta, alpha_beta, _ = distr_params

        # Deterministic: mean as action
        definite = beta / alpha_beta

        # Non-deterministic: sample action as a ratio of gamma samples
        alpha_sample = tf.random_gamma(shape=(), alpha=alpha)
        beta_sample = tf.random_gamma(shape=(), alpha=beta)
        sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=util.epsilon)

        return self.min_value + (self.max_value - self.min_value) * \
            tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        alpha, beta, _, log_norm = distr_params
        # Rescale the action to [0, 1]
        action = (action - self.min_value) / (self.max_value - self.min_value)
        action = tf.minimum(x=action, y=(1.0 - util.epsilon))
        return (beta - 1.0) * tf.log(x=tf.maximum(x=action, y=util.epsilon)) + \
            (alpha - 1.0) * tf.log1p(x=-action) - log_norm

    def tf_entropy(self, distr_params):
        alpha, beta, alpha_beta, log_norm = distr_params
        return log_norm - (beta - 1.0) * tf.digamma(x=beta) - (alpha - 1.0) * tf.digamma(x=alpha) + \
            (alpha_beta - 2.0) * tf.digamma(x=alpha_beta)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        alpha1, beta1, alpha_beta1, log_norm1 = distr_params1
        alpha2, beta2, alpha_beta2, log_norm2 = distr_params2
        return log_norm2 - log_norm1 - tf.digamma(x=beta1) * (beta2 - beta1) - \
            tf.digamma(x=alpha1) * (alpha2 - alpha1) + tf.digamma(x=alpha_beta1) * (alpha_beta2 - alpha_beta1)

    def tf_regularization_loss(self):
        regularization_loss = super(Beta, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.alpha.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.beta.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(Beta, self).get_variables(include_nontrainable=include_nontrainable)
        alpha_variables = self.alpha.get_variables(include_nontrainable=include_nontrainable)
        beta_variables = self.beta.get_variables(include_nontrainable=include_nontrainable)
        return distribution_variables + alpha_variables + beta_variables

    def get_summaries(self):
        distribution_summaries = super(Beta, self).get_summaries()
        alpha_summaries = self.alpha.get_summaries()
        beta_summaries = self.beta.get_summaries()
        return distribution_summaries + alpha_summaries + beta_summaries
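
# Note on the parameterization above: Beta.tf_log_probability computes
# (beta - 1) * log(x) + (alpha - 1) * log(1 - x) - log B(alpha, beta), i.e. the
# roles of the two concentration parameters are swapped relative to the standard
# Beta(a, b) density, consistent with tf_sample's mean beta / (alpha + beta).
# A standalone plain-float sketch (`beta_log_prob_check` is illustrative, not
# part of the library):

import math

def beta_log_prob_check(x, alpha, beta):
    # Same algebra as Beta.tf_log_probability, on a float x in (0, 1)
    log_norm = math.lgamma(alpha) + math.lgamma(beta) - math.lgamma(alpha + beta)
    return (beta - 1.0) * math.log(x) + (alpha - 1.0) * math.log1p(-x) - log_norm

# Agrees with scipy.stats.beta.logpdf(x, a=beta, b=alpha) up to floating-point
# error, e.g. beta_log_prob_check(0.3, alpha=2.0, beta=5.0).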
class NetworkBaseline(Baseline):
    """
    Baseline based on a TensorForce network, used when parameters are shared between
    the value function and the baseline.
    """

    def __init__(self, network_spec, scope='network-baseline', summary_labels=()):
        """
        Network baseline.

        Args:
            network_spec: Network specification dict.
        """
        self.network = Network.from_spec(
            spec=network_spec,
            kwargs=dict(summary_labels=summary_labels)
        )
        assert len(self.network.internals_input()) == 0

        self.linear = Linear(size=1, bias=0.0, scope='prediction')

        super(NetworkBaseline, self).__init__(scope, summary_labels)

    def tf_predict(self, states, update):
        embedding = self.network.apply(x=states, internals=(), update=update)
        prediction = self.linear.apply(x=embedding)
        return tf.squeeze(input=prediction, axis=1)

    def tf_regularization_loss(self):
        """
        Creates the TensorFlow operations for the baseline regularization loss.

        Returns:
            Regularization loss tensor.
        """
        regularization_loss = super(NetworkBaseline, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.network.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.linear.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        baseline_variables = super(NetworkBaseline, self).get_variables(include_nontrainable=include_nontrainable)
        network_variables = self.network.get_variables(include_nontrainable=include_nontrainable)
        layer_variables = self.linear.get_variables(include_nontrainable=include_nontrainable)
        return baseline_variables + network_variables + layer_variables

    def get_summaries(self):
        baseline_summaries = super(NetworkBaseline, self).get_summaries()
        network_summaries = self.network.get_summaries()
        layer_summaries = self.linear.get_summaries()
        return baseline_summaries + network_summaries + layer_summaries
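
# Usage sketch for NetworkBaseline (hypothetical layer sizes; the layer-dict
# format follows the usual TensorForce network specification convention):
#
#     baseline = NetworkBaseline(
#         network_spec=[
#             dict(type='dense', size=32),
#             dict(type='dense', size=32),
#         ]
#     )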
class Bernoulli(Distribution):
    """
    Bernoulli distribution, for binary actions.
    """

    def __init__(self, shape, probability=0.5, scope='bernoulli', summary_labels=()):
        self.shape = shape
        action_size = util.prod(self.shape)

        with tf.name_scope(name=scope):
            self.logit = Linear(size=action_size, bias=log(probability), scope='logit')

        super(Bernoulli, self).__init__(scope, summary_labels)

    def tf_parameterize(self, x):
        # Flat logit
        logit = self.logit.apply(x=x)

        # Reshape logit to action shape
        shape = (-1,) + self.shape
        logit = tf.reshape(tensor=logit, shape=shape)

        # TODO: rename; the raw logit doubles as the state value here
        state_value = logit

        # Sigmoid for corresponding probability
        probability = tf.sigmoid(x=logit)

        # Clip to epsilon probability range for numerical stability
        probability = tf.clip_by_value(t=probability, clip_value_min=util.epsilon, clip_value_max=(1.0 - util.epsilon))

        # "Normalized" logits
        true_logit = tf.log(x=probability)
        false_logit = tf.log(x=(1.0 - probability))

        return true_logit, false_logit, probability, state_value

    def state_value(self, distr_params):
        _, _, _, state_value = distr_params
        return state_value

    def state_action_value(self, distr_params, action):
        true_logit, false_logit, _, state_value = distr_params
        logit = tf.where(condition=action, x=true_logit, y=false_logit)
        return logit + state_value

    def tf_sample(self, distr_params, deterministic):
        _, _, probability, _ = distr_params

        # Deterministic: true if probability >= 0.5
        definite = tf.greater_equal(x=probability, y=0.5)

        # Non-deterministic: true if probability >= uniform sample
        uniform = tf.random_uniform(shape=tf.shape(input=probability))
        sampled = tf.greater_equal(x=probability, y=uniform)

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        true_logit, false_logit, _, _ = distr_params
        return tf.where(condition=action, x=true_logit, y=false_logit)

    def tf_entropy(self, distr_params):
        true_logit, false_logit, probability, _ = distr_params
        return -probability * true_logit - (1.0 - probability) * false_logit

    def tf_kl_divergence(self, distr_params1, distr_params2):
        true_logit1, false_logit1, probability1, _ = distr_params1
        true_logit2, false_logit2, _, _ = distr_params2
        true_log_prob_ratio = true_logit1 - true_logit2
        false_log_prob_ratio = false_logit1 - false_logit2
        return probability1 * true_log_prob_ratio + (1.0 - probability1) * false_log_prob_ratio

    def tf_regularization_loss(self):
        regularization_loss = super(Bernoulli, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.logit.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(Bernoulli, self).get_variables(include_nontrainable=include_nontrainable)
        logit_variables = self.logit.get_variables(include_nontrainable=include_nontrainable)
        return distribution_variables + logit_variables

    def get_summaries(self):
        distribution_summaries = super(Bernoulli, self).get_summaries()
        logit_summaries = self.logit.get_summaries()
        return distribution_summaries + logit_summaries
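
# Bernoulli.tf_entropy above is the familiar binary entropy
# -p log p - (1 - p) log(1 - p), expressed via the clipped "normalized" logits.
# A plain-float sketch (`binary_entropy` is illustrative, not part of the library):

import math

def binary_entropy(p):
    # Mirrors tf_entropy: -p * true_logit - (1 - p) * false_logit
    return -p * math.log(p) - (1.0 - p) * math.log(1.0 - p)

# binary_entropy(0.5) == math.log(2.0), the maximum-entropy case.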
class Gaussian(Distribution):
    """
    Gaussian distribution, for unbounded continuous actions.
    """

    def __init__(self, shape, mean=0.0, log_stddev=0.0, scope='gaussian', summary_labels=()):
        """
        Gaussian distribution.

        Args:
            shape: Action shape.
            mean: Optional distribution bias for the mean.
            log_stddev: Optional distribution bias for the standard deviation.
        """
        self.shape = shape
        action_size = util.prod(self.shape)

        self.mean = Linear(size=action_size, bias=mean, scope='mean')
        self.log_stddev = Linear(size=action_size, bias=log_stddev, scope='log-stddev')

        super(Gaussian, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)

    def tf_parameterize(self, x):
        # Flat mean and log standard deviation
        mean = self.mean.apply(x=x)
        log_stddev = self.log_stddev.apply(x=x)

        # Reshape mean and log stddev to action shape
        shape = (-1,) + self.shape
        mean = tf.reshape(tensor=mean, shape=shape)
        log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

        # Clip log stddev for numerical stability
        log_eps = log(util.epsilon)  # epsilon < 1.0, hence negative
        log_stddev = tf.clip_by_value(t=log_stddev, clip_value_min=log_eps, clip_value_max=-log_eps)

        # Standard deviation
        stddev = tf.exp(x=log_stddev)

        return mean, stddev, log_stddev

    def state_value(self, distr_params):
        _, _, log_stddev = distr_params
        return -log_stddev - 0.5 * log(2.0 * pi)

    def state_action_value(self, distr_params, action):
        mean, stddev, log_stddev = distr_params
        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon)
        return -0.5 * sq_mean_distance / sq_stddev - 2.0 * log_stddev - log(2.0 * pi)

    def tf_sample(self, distr_params, deterministic):
        mean, stddev, _ = distr_params

        # Deterministic: mean as action
        definite = mean

        # Non-deterministic: sample action using default normal distribution
        normal_distribution = tf.random_normal(shape=tf.shape(input=mean))
        sampled = mean + stddev * normal_distribution

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        mean, stddev, log_stddev = distr_params
        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon)
        return -0.5 * sq_mean_distance / sq_stddev - log_stddev - 0.5 * log(2.0 * pi)

    def tf_entropy(self, distr_params):
        _, _, log_stddev = distr_params
        return log_stddev + 0.5 * log(2.0 * pi * e)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        mean1, stddev1, log_stddev1 = distr_params1
        mean2, stddev2, log_stddev2 = distr_params2
        log_stddev_ratio = log_stddev2 - log_stddev1
        sq_mean_distance = tf.square(x=(mean1 - mean2))
        sq_stddev1 = tf.square(x=stddev1)
        sq_stddev2 = tf.maximum(x=tf.square(x=stddev2), y=util.epsilon)
        return log_stddev_ratio + 0.5 * (sq_stddev1 + sq_mean_distance) / sq_stddev2 - 0.5

    def tf_regularization_loss(self):
        regularization_loss = super(Gaussian, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.mean.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.log_stddev.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(Gaussian, self).get_variables(include_nontrainable=include_nontrainable)
        mean_variables = self.mean.get_variables(include_nontrainable=include_nontrainable)
        log_stddev_variables = self.log_stddev.get_variables(include_nontrainable=include_nontrainable)
        return distribution_variables + mean_variables + log_stddev_variables

    def get_summaries(self):
        distribution_summaries = super(Gaussian, self).get_summaries()
        mean_summaries = self.mean.get_summaries()
        log_stddev_summaries = self.log_stddev.get_summaries()
        return distribution_summaries + mean_summaries + log_stddev_summaries
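
# Gaussian.tf_kl_divergence above implements the closed form
# KL(N(m1, s1) || N(m2, s2)) = log(s2 / s1) + (s1^2 + (m1 - m2)^2) / (2 s2^2) - 1/2.
# A plain-float sketch for sanity checking (`gaussian_kl` is illustrative only):

import math

def gaussian_kl(mean1, stddev1, mean2, stddev2):
    log_stddev_ratio = math.log(stddev2) - math.log(stddev1)
    sq_mean_distance = (mean1 - mean2) ** 2
    return log_stddev_ratio + 0.5 * (stddev1 ** 2 + sq_mean_distance) / stddev2 ** 2 - 0.5

# gaussian_kl(0.0, 1.0, 0.0, 1.0) == 0.0, as expected for identical distributions.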
class AggregatedBaseline(Baseline):
    """
    Baseline which aggregates per-state baselines.
    """

    def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()):
        """
        Aggregated baseline.

        Args:
            baselines: Dict of per-state baseline specification dicts.
        """
        with tf.name_scope(name=scope):
            self.baselines = dict()
            for name, baseline_spec in baselines.items():
                with tf.name_scope(name=(name + '-baseline')):
                    self.baselines[name] = Baseline.from_spec(
                        spec=baseline_spec,
                        kwargs=dict(summary_labels=summary_labels)
                    )

            self.linear = Linear(size=1, bias=0.0, scope='prediction')

        super(AggregatedBaseline, self).__init__(scope, summary_labels)

    def tf_predict(self, states, update):
        predictions = list()
        for name, state in states.items():
            prediction = self.baselines[name].predict(states=state, update=update)
            predictions.append(prediction)
        predictions = tf.stack(values=predictions, axis=1)
        prediction = self.linear.apply(x=predictions)
        return tf.squeeze(input=prediction, axis=1)

    def tf_regularization_loss(self):
        regularization_loss = super(AggregatedBaseline, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        for baseline in self.baselines.values():
            regularization_loss = baseline.regularization_loss()
            if regularization_loss is not None:
                losses.append(regularization_loss)

        regularization_loss = self.linear.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        baseline_variables = super(AggregatedBaseline, self).get_variables(include_nontrainable=include_nontrainable)
        baselines_variables = [
            variable for name in sorted(self.baselines)
            for variable in self.baselines[name].get_variables(include_nontrainable=include_nontrainable)
        ]
        linear_variables = self.linear.get_variables(include_nontrainable=include_nontrainable)
        return baseline_variables + baselines_variables + linear_variables

    def get_summaries(self):
        baseline_summaries = super(AggregatedBaseline, self).get_summaries()
        baselines_summaries = [
            summary for name in sorted(self.baselines)
            for summary in self.baselines[name].get_summaries()
        ]
        linear_summaries = self.linear.get_summaries()
        return baseline_summaries + baselines_summaries + linear_summaries
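
# Usage sketch for AggregatedBaseline (hypothetical state names and baseline
# specs; the 'mlp' type and 'sizes' key follow the usual TensorForce baseline
# specification convention):
#
#     baseline = AggregatedBaseline(baselines=dict(
#         position=dict(type='mlp', sizes=[32, 32]),
#         velocity=dict(type='mlp', sizes=[32, 32]),
#     ))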