Example #1
class Categorical(Distribution):
    """
    Categorical distribution, for discrete actions.
    """
    def __init__(self,
                 shape,
                 num_actions,
                 probabilities=None,
                 scope='categorical',
                 summary_labels=()):
        """
        Categorical distribution.

        Args:
            shape: Action shape.
            num_actions: Number of discrete action alternatives.
            probabilities: Optional distribution bias.
        """
        self.num_actions = num_actions

        action_size = util.prod(shape) * self.num_actions
        if probabilities is None:
            logits = 0.0
        else:
            logits = [
                log(prob) for _ in range(util.prod(shape))
                for prob in probabilities
            ]
        self.logits = Linear(size=action_size, bias=logits, scope='logits')

        super(Categorical, self).__init__(shape=shape,
                                          scope=scope,
                                          summary_labels=summary_labels)

    def tf_parameterize(self, x):
        # Flat logits
        logits = self.logits.apply(x=x)

        # Reshape logits to action shape
        shape = (-1, ) + self.shape + (self.num_actions, )
        logits = tf.reshape(tensor=logits, shape=shape)

        # State value as the log of the partition function (logsumexp over action logits)
        state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=logits, axis=-1)

        # Min epsilon probability for numerical stability
        probabilities = tf.maximum(x=probabilities, y=util.epsilon)

        # "Normalized" logits
        logits = tf.log(x=probabilities)

        return logits, probabilities, state_value

    def state_value(self, distr_params):
        _, _, state_value = distr_params
        return state_value

    def state_action_value(self, distr_params, action=None):
        logits, _, state_value = distr_params
        if action is None:
            state_value = tf.expand_dims(input=state_value, axis=-1)
        else:
            one_hot = tf.one_hot(indices=action, depth=self.num_actions)
            logits = tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1)
        return state_value + logits

    def tf_sample(self, distr_params, deterministic):
        logits, _, _ = distr_params

        # Deterministic: maximum likelihood action
        definite = tf.argmax(input=logits,
                             axis=-1,
                             output_type=util.tf_dtype('int'))

        # Non-deterministic: sample action using Gumbel distribution
        uniform_distribution = tf.random_uniform(shape=tf.shape(input=logits),
                                                 minval=util.epsilon,
                                                 maxval=(1.0 - util.epsilon))
        gumbel_distribution = -tf.log(x=-tf.log(x=uniform_distribution))
        sampled = tf.argmax(input=(logits + gumbel_distribution),
                            axis=-1,
                            output_type=util.tf_dtype('int'))

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        logits, _, _ = distr_params
        one_hot = tf.one_hot(indices=action, depth=self.num_actions)
        return tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1)

    def tf_entropy(self, distr_params):
        logits, probabilities, _ = distr_params
        return -tf.reduce_sum(input_tensor=(probabilities * logits), axis=-1)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        logits1, probabilities1, _ = distr_params1
        logits2, _, _ = distr_params2
        log_prob_ratio = logits1 - logits2
        return tf.reduce_sum(input_tensor=(probabilities1 * log_prob_ratio),
                             axis=-1)

    def tf_regularization_loss(self):
        regularization_loss = super(Categorical, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.logits.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(
            Categorical,
            self).get_variables(include_nontrainable=include_nontrainable)
        logits_variables = self.logits.get_variables(
            include_nontrainable=include_nontrainable)

        return distribution_variables + logits_variables

    def get_summaries(self):
        distribution_summaries = super(Categorical, self).get_summaries()
        logits_summaries = self.logits.get_summaries()

        return distribution_summaries + logits_summaries
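
The non-deterministic branch of tf_sample uses the Gumbel-max trick: adding i.i.d. Gumbel noise to the logits and taking the argmax yields an exact categorical sample. A minimal NumPy sketch of the same construction (independent of the class above; names are illustrative):

import numpy as np

rng = np.random.default_rng(0)
logits = np.log([0.2, 0.3, 0.5])  # log-probabilities of three actions

# Same construction as in tf_sample: -log(-log(U)) with U ~ Uniform(eps, 1 - eps)
uniform = rng.uniform(low=1e-10, high=1.0 - 1e-10, size=(100_000, 3))
gumbel = -np.log(-np.log(uniform))
samples = np.argmax(logits + gumbel, axis=-1)

# Empirical frequencies approximate softmax(logits) = (0.2, 0.3, 0.5)
print(np.bincount(samples) / samples.size)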
Example #2
class Beta(Distribution):
    """
    Beta distribution, for bounded continuous actions.
    """
    def __init__(self,
                 shape,
                 min_value,
                 max_value,
                 alpha=0.0,
                 beta=0.0,
                 scope='beta',
                 summary_labels=()):
        """
        Beta distribution used for continuous actions. In particular, the Beta
        distribution bounds action values between given min and max values.

        Args:
            shape: Shape of actions
            min_value: Min value of all actions for the given shape
            max_value: Max value of all actions for the given shape
            alpha: Concentration parameter of the Beta distribution
            beta: Concentration parameter of the Beta distribution
        """
        assert min_value is None or max_value > min_value
        self.shape = shape
        self.min_value = min_value
        self.max_value = max_value
        action_size = util.prod(self.shape)

        self.alpha = Linear(size=action_size, bias=alpha, scope='alpha')
        self.beta = Linear(size=action_size, bias=beta, scope='beta')

        super(Beta, self).__init__(scope, summary_labels)

    def tf_parameterize(self, x):
        # Softplus to ensure alpha and beta >= 1
        # epsilon < 1.0, hence negative
        log_eps = log(util.epsilon)

        alpha = self.alpha.apply(x=x)
        alpha = tf.clip_by_value(t=alpha,
                                 clip_value_min=log_eps,
                                 clip_value_max=-log_eps)
        alpha = tf.log(x=(tf.exp(x=alpha) + 1.0)) + 1.0

        beta = self.beta.apply(x=x)
        beta = tf.clip_by_value(t=beta,
                                clip_value_min=log_eps,
                                clip_value_max=-log_eps)
        beta = tf.log(x=(tf.exp(x=beta) + 1.0)) + 1.0

        shape = (-1, ) + self.shape
        alpha = tf.reshape(tensor=alpha, shape=shape)
        beta = tf.reshape(tensor=beta, shape=shape)

        alpha_beta = tf.maximum(x=(alpha + beta), y=util.epsilon)
        log_norm = tf.lgamma(x=alpha) + tf.lgamma(x=beta) - tf.lgamma(
            x=alpha_beta)

        return alpha, beta, alpha_beta, log_norm

    def tf_sample(self, distr_params, deterministic):
        alpha, beta, alpha_beta, _ = distr_params

        # Deterministic: mean as action
        definite = beta / alpha_beta

        # Non-deterministic: sample action using gamma distribution
        alpha_sample = tf.random_gamma(shape=(), alpha=alpha)
        beta_sample = tf.random_gamma(shape=(), alpha=beta)

        sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample),
                                           y=util.epsilon)

        return self.min_value + (self.max_value - self.min_value) * \
            tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        alpha, beta, _, log_norm = distr_params
        action = (action - self.min_value) / (self.max_value - self.min_value)
        action = tf.minimum(x=action, y=(1.0 - util.epsilon))
        return (beta - 1.0) * tf.log(x=tf.maximum(x=action, y=util.epsilon)) + \
            (alpha - 1.0) * tf.log1p(x=-action) - log_norm

    def tf_entropy(self, distr_params):
        alpha, beta, alpha_beta, log_norm = distr_params
        return log_norm - (beta - 1.0) * tf.digamma(x=beta) - (alpha - 1.0) * tf.digamma(x=alpha) + \
            (alpha_beta - 2.0) * tf.digamma(x=alpha_beta)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        alpha1, beta1, alpha_beta1, log_norm1 = distr_params1
        alpha2, beta2, alpha_beta2, log_norm2 = distr_params2
        return log_norm2 - log_norm1 - tf.digamma(x=beta1) * (beta2 - beta1) - \
            tf.digamma(x=alpha1) * (alpha2 - alpha1) + tf.digamma(x=alpha_beta1) * (alpha_beta2 - alpha_beta1)

    def tf_regularization_loss(self):
        regularization_loss = super(Beta, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.alpha.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.beta.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        distribution_variables = super(
            Beta,
            self).get_variables(include_non_trainable=include_non_trainable)
        alpha_variables = self.alpha.get_variables(
            include_non_trainable=include_non_trainable)
        beta_variables = self.beta.get_variables(
            include_non_trainable=include_non_trainable)

        return distribution_variables + alpha_variables + beta_variables

    def get_summaries(self):
        distribution_summaries = super(Beta, self).get_summaries()
        alpha_summaries = self.alpha.get_summaries()
        beta_summaries = self.beta.get_summaries()

        return distribution_summaries + alpha_summaries + beta_summaries
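
The gamma-based sampler in tf_sample relies on a standard identity: for independent X ~ Gamma(alpha, 1) and Y ~ Gamma(beta, 1), the ratio Y / (X + Y) is Beta-distributed with mean beta / (alpha + beta), matching the deterministic branch. A quick NumPy check (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
alpha, beta = 2.0, 5.0

x = rng.gamma(shape=alpha, size=1_000_000)
y = rng.gamma(shape=beta, size=1_000_000)
samples = y / (x + y)

# Sample mean should be close to beta / (alpha + beta) = 5/7 ≈ 0.714
print(samples.mean(), beta / (alpha + beta))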
Example #3
class NetworkBaseline(Baseline):
    """
    Baseline based on a TensorForce network, used when parameters are shared between
    the value function and the baseline.
    """
    def __init__(self,
                 network_spec,
                 scope='network-baseline',
                 summary_labels=()):
        """
        Network baseline.

        Args:
            network_spec: Network specification dict
        """
        self.network = Network.from_spec(
            spec=network_spec, kwargs=dict(summary_labels=summary_labels))
        assert len(self.network.internals_input()) == 0

        self.linear = Linear(size=1, bias=0.0, scope='prediction')

        super(NetworkBaseline, self).__init__(scope, summary_labels)

    def tf_predict(self, states, update):
        embedding = self.network.apply(x=states, internals=(), update=update)
        prediction = self.linear.apply(x=embedding)
        return tf.squeeze(input=prediction, axis=1)

    def tf_regularization_loss(self):
        """
        Creates the TensorFlow operations for the baseline regularization loss.

        Returns:
            Regularization loss tensor
        """
        regularization_loss = super(NetworkBaseline,
                                    self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.network.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.linear.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        baseline_variables = super(
            NetworkBaseline,
            self).get_variables(include_non_trainable=include_non_trainable)
        network_variables = self.network.get_variables(
            include_non_trainable=include_non_trainable)
        layer_variables = self.linear.get_variables(
            include_non_trainable=include_non_trainable)

        return baseline_variables + network_variables + layer_variables

    def get_summaries(self):
        baseline_summaries = super(NetworkBaseline, self).get_summaries()
        network_summaries = self.network.get_summaries()
        layer_summaries = self.linear.get_summaries()

        return baseline_summaries + network_summaries + layer_summaries
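
A hypothetical usage sketch; the exact network_spec format depends on the installed TensorForce version, so the layer specs below are assumptions:

# Two hidden dense layers feeding the scalar value prediction (assumed spec format)
network_spec = [
    dict(type='dense', size=64),
    dict(type='dense', size=64),
]
baseline = NetworkBaseline(network_spec=network_spec)
# Inside the TF graph (predict wraps tf_predict):
# state_values = baseline.predict(states=states, update=update)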
Example #4
class Bernoulli(Distribution):
    """
    Bernoulli distribution for binary actions.
    """
    def __init__(self,
                 shape,
                 probability=0.5,
                 scope='bernoulli',
                 summary_labels=()):
        self.shape = shape
        action_size = util.prod(self.shape)

        with tf.name_scope(name=scope):
            self.logit = Linear(size=action_size,
                                bias=log(probability),
                                scope='logit')

        super(Bernoulli, self).__init__(scope, summary_labels)

    def tf_parameterize(self, x):
        # Flat logit
        logit = self.logit.apply(x=x)

        # Reshape logit to action shape
        shape = (-1, ) + self.shape
        logit = tf.reshape(tensor=logit, shape=shape)

        # State value given by the raw logit (analogous to logsumexp in the categorical case)
        state_value = logit

        # Sigmoid for corresponding probability
        probability = tf.sigmoid(x=logit)

        # Min epsilon probability for numerical stability
        probability = tf.clip_by_value(t=probability,
                                       clip_value_min=util.epsilon,
                                       clip_value_max=(1.0 - util.epsilon))

        # "Normalized" logits
        true_logit = tf.log(x=probability)
        false_logit = tf.log(x=(1.0 - probability))

        return true_logit, false_logit, probability, state_value

    def state_value(self, distr_params):
        _, _, _, state_value = distr_params
        return state_value

    def state_action_value(self, distr_params, action):
        true_logit, false_logit, _, state_value = distr_params
        logit = tf.where(condition=action, x=true_logit, y=false_logit)
        return logit + state_value

    def tf_sample(self, distr_params, deterministic):
        _, _, probability, _ = distr_params

        # Deterministic: true if >= 0.5
        definite = tf.greater_equal(x=probability, y=0.5)

        # Non-deterministic: sample true if >= uniform distribution
        uniform = tf.random_uniform(shape=tf.shape(input=probability))
        sampled = tf.greater_equal(x=probability, y=uniform)

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        true_logit, false_logit, _, _ = distr_params
        return tf.where(condition=action, x=true_logit, y=false_logit)

    def tf_entropy(self, distr_params):
        true_logit, false_logit, probability, _ = distr_params
        return -probability * true_logit - (1.0 - probability) * false_logit

    def tf_kl_divergence(self, distr_params1, distr_params2):
        true_logit1, false_logit1, probability1, _ = distr_params1
        true_logit2, false_logit2, _, _ = distr_params2
        true_log_prob_ratio = true_logit1 - true_logit2
        false_log_prob_ratio = false_logit1 - false_logit2
        return probability1 * true_log_prob_ratio + (
            1.0 - probability1) * false_log_prob_ratio

    def tf_regularization_loss(self):
        regularization_loss = super(Bernoulli, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.logit.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        distribution_variables = super(
            Bernoulli,
            self).get_variables(include_non_trainable=include_non_trainable)
        logit_variables = self.logit.get_variables(
            include_non_trainable=include_non_trainable)

        return distribution_variables + logit_variables

    def get_summaries(self):
        distribution_summaries = super(Bernoulli, self).get_summaries()
        logit_summaries = self.logit.get_summaries()

        return distribution_summaries + logit_summaries
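
With p = sigmoid(logit), the parameterization above gives true_logit = log(p) and false_logit = log(1 - p), so the entropy is the usual -p*log(p) - (1 - p)*log(1 - p). A small NumPy check of these formulas (illustrative only):

import numpy as np

logit = 0.4
p = 1.0 / (1.0 + np.exp(-logit))  # sigmoid
true_logit, false_logit = np.log(p), np.log1p(-p)

entropy = -p * true_logit - (1.0 - p) * false_logit
print(p, entropy)  # ≈ 0.599, ≈ 0.673 nats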
Example #5
class Gaussian(Distribution):
    """
    Gaussian distribution, for unbounded continuous actions.
    """

    def __init__(self, shape, mean=0.0, log_stddev=0.0, scope='gaussian', summary_labels=()):
        """
        Gaussian distribution.

        Args:
            shape: Action shape.
            mean: Optional distribution bias for the mean.
            log_stddev: Optional distribution bias for the standard deviation.
        """
        self.shape = shape
        action_size = util.prod(self.shape)

        self.mean = Linear(size=action_size, bias=mean, scope='mean')
        self.log_stddev = Linear(size=action_size, bias=log_stddev, scope='log-stddev')

        super(Gaussian, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)

    def tf_parameterize(self, x):
        # Flat mean and log standard deviation
        mean = self.mean.apply(x=x)
        log_stddev = self.log_stddev.apply(x=x)

        # Reshape mean and log stddev to action shape
        shape = (-1,) + self.shape
        mean = tf.reshape(tensor=mean, shape=shape)
        log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

        # Clip log stddev for numerical stability
        log_eps = log(util.epsilon)  # epsilon < 1.0, hence negative
        log_stddev = tf.clip_by_value(t=log_stddev, clip_value_min=log_eps, clip_value_max=-log_eps)

        # Standard deviation
        stddev = tf.exp(x=log_stddev)

        return mean, stddev, log_stddev

    def state_value(self, distr_params):
        _, _, log_stddev = distr_params
        return -log_stddev - 0.5 * log(2.0 * pi)

    def state_action_value(self, distr_params, action):
        mean, stddev, log_stddev = distr_params
        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon)
        return -0.5 * sq_mean_distance / sq_stddev - 2.0 * log_stddev - log(2.0 * pi)

    def tf_sample(self, distr_params, deterministic):
        mean, stddev, _ = distr_params

        # Deterministic: mean as action
        definite = mean

        # Non-deterministic: sample action using default normal distribution
        normal_distribution = tf.random_normal(shape=tf.shape(input=mean))
        sampled = mean + stddev * normal_distribution

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        mean, stddev, log_stddev = distr_params
        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon)
        return -0.5 * sq_mean_distance / sq_stddev - log_stddev - 0.5 * log(2.0 * pi)

    def tf_entropy(self, distr_params):
        _, _, log_stddev = distr_params
        return log_stddev + 0.5 * log(2.0 * pi * e)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        mean1, stddev1, log_stddev1 = distr_params1
        mean2, stddev2, log_stddev2 = distr_params2

        log_stddev_ratio = log_stddev2 - log_stddev1
        sq_mean_distance = tf.square(x=(mean1 - mean2))
        sq_stddev1 = tf.square(x=stddev1)
        sq_stddev2 = tf.maximum(x=tf.square(x=stddev2), y=util.epsilon)

        return log_stddev_ratio + 0.5 * (sq_stddev1 + sq_mean_distance) / sq_stddev2 - 0.5

    def tf_regularization_loss(self):
        regularization_loss = super(Gaussian, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.mean.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.log_stddev.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(Gaussian, self).get_variables(include_nontrainable=include_nontrainable)
        mean_variables = self.mean.get_variables(include_nontrainable=include_nontrainable)
        log_stddev_variables = self.log_stddev.get_variables(include_nontrainable=include_nontrainable)

        return distribution_variables + mean_variables + log_stddev_variables

    def get_summaries(self):
        distribution_summaries = super(Gaussian, self).get_summaries()
        mean_summaries = self.mean.get_summaries()
        log_stddev_summaries = self.log_stddev.get_summaries()

        return distribution_summaries + mean_summaries + log_stddev_summaries
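
tf_kl_divergence implements the closed-form KL between univariate Gaussians, KL(N1 || N2) = log(s2/s1) + (s1^2 + (m1 - m2)^2) / (2 s2^2) - 1/2. A NumPy sketch comparing it against a Monte Carlo estimate (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
m1, s1, m2, s2 = 0.0, 1.0, 1.0, 2.0

closed_form = np.log(s2 / s1) + (s1**2 + (m1 - m2)**2) / (2.0 * s2**2) - 0.5

# Monte Carlo: E_{x ~ N1}[log p1(x) - log p2(x)]
x = rng.normal(loc=m1, scale=s1, size=1_000_000)
log_p1 = -0.5 * ((x - m1) / s1)**2 - np.log(s1) - 0.5 * np.log(2.0 * np.pi)
log_p2 = -0.5 * ((x - m2) / s2)**2 - np.log(s2) - 0.5 * np.log(2.0 * np.pi)
print(closed_form, (log_p1 - log_p2).mean())  # both ≈ 0.443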
Example #6
class AggregatedBaseline(Baseline):
    """
    Baseline which aggregates per-state baselines.
    """
    def __init__(self,
                 baselines,
                 scope='aggregated-baseline',
                 summary_labels=()):
        """
        Aggregated baseline.

        Args:
            baselines: Dict of per-state baseline specification dicts
        """

        with tf.name_scope(name=scope):
            self.baselines = dict()
            for name, baseline_spec in baselines.items():
                with tf.name_scope(name=(name + '-baseline')):
                    self.baselines[name] = Baseline.from_spec(
                        spec=baseline_spec,
                        kwargs=dict(summary_labels=summary_labels))

            self.linear = Linear(size=1, bias=0.0, scope='prediction')

        super(AggregatedBaseline, self).__init__(scope, summary_labels)

    def tf_predict(self, states, update):
        predictions = list()
        for name, state in states.items():
            prediction = self.baselines[name].predict(states=state,
                                                      update=update)
            predictions.append(prediction)
        predictions = tf.stack(values=predictions, axis=1)
        prediction = self.linear.apply(x=predictions)
        return tf.squeeze(input=prediction, axis=1)

    def tf_regularization_loss(self):
        regularization_loss = super(AggregatedBaseline,
                                    self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        for baseline in self.baselines.values():
            regularization_loss = baseline.regularization_loss()
            if regularization_loss is not None:
                losses.append(regularization_loss)

        regularization_loss = self.linear.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        baseline_variables = super(
            AggregatedBaseline,
            self).get_variables(include_non_trainable=include_non_trainable)
        baselines_variables = [
            variable for name in sorted(self.baselines)
            for variable in self.baselines[name].get_variables(
                include_non_trainable=include_non_trainable)
        ]
        linear_variables = self.linear.get_variables(
            include_non_trainable=include_non_trainable)

        return baseline_variables + baselines_variables + linear_variables

    def get_summaries(self):
        baseline_summaries = super(AggregatedBaseline, self).get_summaries()
        baselines_summaries = [
            variable for name in sorted(self.baselines)
            for variable in self.baselines[name].get_summaries()
        ]
        linear_summaries = self.linear.get_summaries()

        return baseline_summaries + baselines_summaries + linear_summaries
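
A hypothetical usage sketch; the per-state baseline spec format ('mlp' with sizes) is an assumption about the installed TensorForce version:

# One baseline per named state; their scalar predictions are stacked
# and passed through the final linear layer (assumed spec format).
baselines = dict(
    position=dict(type='mlp', sizes=[32, 32]),
    velocity=dict(type='mlp', sizes=[32, 32]),
)
baseline = AggregatedBaseline(baselines=baselines)
# state_values = baseline.predict(states=dict(position=..., velocity=...), update=update)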