Example #1
    def parametrize(self, *, x, conditions):
        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        shape = (-1, ) + self.action_spec.shape

        # Logit
        logit = self.logit.apply(x=x)
        if len(self.input_spec.shape) == 1:
            logit = tf.reshape(tensor=logit, shape=shape)

        # States value
        state_value = logit

        # Sigmoid for corresponding probability
        probability = tf.sigmoid(x=logit)

        # Clip probability for numerical stability
        probability = tf.clip_by_value(t=probability,
                                       clip_value_min=epsilon,
                                       clip_value_max=(one - epsilon))

        # "Normalized" logits
        true_logit = tf.math.log(x=probability)
        false_logit = tf.math.log(x=(one - probability))

        return TensorDict(true_logit=true_logit,
                          false_logit=false_logit,
                          probability=probability,
                          state_value=state_value)
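
A note on the parametrization above: writing $z$ for the raw logit and $p$ for the clipped sigmoid output (both just shorthand here), the returned tensors are

$p = \mathrm{clip}(\sigma(z),\ \epsilon,\ 1 - \epsilon), \qquad \text{true\_logit} = \log p, \qquad \text{false\_logit} = \log(1 - p)$

so both logarithms remain finite even for saturated logits.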
Example #2
    def mode(self, *, parameters):
        alpha, beta, alpha_beta = parameters.get(
            ('alpha', 'beta', 'alpha_beta'))

        # Distribution parameter tracking
        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=alpha, axis=0)

        dependencies = self.track(label='distribution',
                                  name='alpha',
                                  data=fn_tracking)

        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=beta, axis=0)

        dependencies.extend(
            self.track(label='distribution', name='beta', data=fn_tracking))

        with tf.control_dependencies(control_inputs=dependencies):
            action = beta / alpha_beta

            min_value = tf_util.constant(value=self.action_spec.min_value,
                                         dtype='float')
            max_value = tf_util.constant(value=self.action_spec.max_value,
                                         dtype='float')

            return min_value + (max_value - min_value) * action
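
In other words, the deterministic action is the mean of this Beta parametrization, rescaled to the action bounds (with $\alpha$, $\beta$ denoting the tensors of the same name and $a_{\min}$, $a_{\max}$ the action bounds):

$a = a_{\min} + (a_{\max} - a_{\min}) \cdot \frac{\beta}{\alpha + \beta}$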
Example #3
    def start(self, *, arguments, x_init, base_value, zero_value, estimated):
        """
        Initialization step preparing the arguments for the first iteration of the loop body.

        Args:
            x_init: Initial solution guess $x_0$.
            base_value: Value $f(x')$ at $x = x'$.
            zero_value: Value $f(x_0)$ at $x = x_0$.
            estimated: Estimated value at $x = x_0$.

        Returns:
            Initial arguments for step.
        """

        dependencies = list()
        if self.config.create_tf_assertions:
            zero_float = tf_util.constant(value=0.0, dtype='float')
            dependencies.append(tf.debugging.assert_greater_equal(x=estimated, y=zero_float))

        with tf.control_dependencies(control_inputs=dependencies):
            zeros_x = x_init.fmap(function=tf.zeros_like)

            improvement = zero_value - base_value
            last_improvement = tf_util.constant(value=-1.0, dtype='float')

        return arguments, zeros_x, x_init, improvement, last_improvement, base_value, estimated
Example #4
    def retrieve_timesteps(self, *, n, past_horizon, future_horizon):
        one = tf_util.constant(value=1, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')

        # Number of valid timesteps currently in memory
        num_timesteps = tf.math.minimum(x=self.buffer_index, y=capacity)
        num_timesteps -= (past_horizon + future_horizon)
        num_timesteps = tf.math.maximum(x=num_timesteps, y=self.episode_count)

        # Check whether memory contains at least one valid timestep
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(
                tf.debugging.assert_greater_equal(x=num_timesteps, y=one))

        # Randomly sampled timestep indices
        with tf.control_dependencies(control_inputs=assertions):
            n = tf.math.minimum(x=n, y=num_timesteps)
            indices = tf.random.uniform(shape=(n, ),
                                        maxval=num_timesteps,
                                        dtype=tf_util.get_dtype(type='int'))
            indices = tf.math.mod(x=(self.buffer_index - one - indices -
                                     future_horizon),
                                  y=capacity)

        return indices
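
The modular arithmetic at the end maps each sampled offset to a position in the circular buffer. A minimal NumPy sketch of the same indexing, with plain-integer stand-ins for the buffer index, capacity and future horizon (all names illustrative):

    import numpy as np

    capacity, buffer_index, future_horizon = 8, 11, 1
    offsets = np.array([0, 1, 2])  # sampled offsets in [0, num_timesteps)
    # Count backwards from the most recently written slot, skip the future
    # horizon, then wrap around the circular buffer.
    indices = np.mod(buffer_index - 1 - offsets - future_horizon, capacity)
    print(indices)  # [1 0 7]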
Example #5
    def retrieve_episodes(self, *, n):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')

        # Check whether memory contains at least one episode
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(
                tf.debugging.assert_greater_equal(x=self.episode_count, y=one))

        # Get start and limit indices for randomly sampled n episodes
        with tf.control_dependencies(control_inputs=assertions):
            n = tf.math.minimum(x=n, y=self.episode_count)
            random_indices = tf.random.uniform(
                shape=(n, ),
                maxval=self.episode_count,
                dtype=tf_util.get_dtype(type='int'))

            # (Increment terminal of previous episode)
            starts = tf.gather(params=self.terminal_indices,
                               indices=random_indices) + one
            limits = tf.gather(params=self.terminal_indices,
                               indices=(random_indices + one)) + one

            # Correct limit index if smaller than start index
            limits = limits + tf.where(
                condition=(limits < starts), x=capacity, y=zero)

            # Random episode indices ranges
            indices = tf.ragged.range(starts=starts, limits=limits).values
            indices = tf.math.mod(x=indices, y=capacity)

        return indices
Example #6
    def sample(self, *, parameters, temperature):
        logits, probabilities, action_values = parameters.get(
            ('logits', 'probabilities', 'action_values'))

        # Distribution parameter summaries
        def fn_summary():
            axis = range(self.action_spec.rank + 1)
            probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis)
            return [probs[n] for n in range(self.action_spec.num_values)]

        prefix = 'distributions/' + self.name + '-probability'
        names = [prefix + str(n) for n in range(self.action_spec.num_values)]
        dependencies = self.summary(label='distribution',
                                    name=names,
                                    data=fn_summary,
                                    step='timesteps')

        # Entropy summary
        def fn_summary():
            entropy = -tf.reduce_sum(input_tensor=(probabilities * logits),
                                     axis=-1)
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy',
                         name=name,
                         data=fn_summary,
                         step='timesteps'))

        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        # Deterministic: maximum likelihood action
        definite = tf.argmax(input=action_values, axis=-1)
        definite = tf_util.cast(x=definite, dtype='int')

        # Set logits to minimal value
        min_float = tf.fill(dims=tf.shape(input=logits),
                            value=tf_util.get_dtype(type='float').min)
        logits = logits / temperature
        logits = tf.where(condition=(probabilities < epsilon),
                          x=min_float,
                          y=logits)

        # Non-deterministic: sample action using Gumbel distribution
        uniform_distribution = tf.random.uniform(
            shape=tf.shape(input=logits),
            minval=epsilon,
            maxval=(one - epsilon),
            dtype=tf_util.get_dtype(type='float'))
        gumbel_distribution = -tf.math.log(
            x=-tf.math.log(x=uniform_distribution))
        sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1)
        sampled = tf_util.cast(x=sampled, dtype='int')

        with tf.control_dependencies(control_inputs=dependencies):
            return tf.where(condition=(temperature < epsilon),
                            x=definite,
                            y=sampled)
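
The sampling branch is the Gumbel-max trick: adding independent Gumbel noise to the temperature-scaled logits and taking the argmax draws from the corresponding categorical distribution. A standalone NumPy sketch of the same idea (all names illustrative):

    import numpy as np

    logits = np.array([1.0, 0.5, -2.0])
    eps = 1e-6
    uniform = np.random.uniform(low=eps, high=1.0 - eps, size=logits.shape)
    gumbel = -np.log(-np.log(uniform))   # Gumbel(0, 1) noise
    sample = np.argmax(logits + gumbel)  # distributed as softmax(logits)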
Example #7
    def parametrize(self, *, x, conditions):
        log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
        shape = (-1,) + self.action_spec.shape

        # Mean
        mean = self.mean.apply(x=x)
        if len(self.input_spec.shape) == 1:
            mean = tf.reshape(tensor=mean, shape=shape)

        # Log standard deviation
        if self.global_stddev:
            multiples = (tf.shape(input=x)[0],) + tuple(1 for _ in range(self.action_spec.rank))
            log_stddev = tf.tile(input=self.log_stddev, multiples=multiples)
        else:
            log_stddev = self.log_stddev.apply(x=x)
            if len(self.input_spec.shape) == 1:
                log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

        # Shift log stddev so that a zero pre-activation corresponds to a smaller stddev (TODO: factor 0.1 is an arbitrary choice)
        if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
            log_stddev += tf_util.constant(value=np.log(0.1), dtype='float')

        # Clip log_stddev for numerical stability (epsilon < 1.0, hence negative)
        log_stddev = tf.clip_by_value(
            t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
        )

        # Standard deviation
        stddev = tf.math.exp(x=log_stddev)

        return TensorDict(mean=mean, stddev=stddev, log_stddev=log_stddev)
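
Clipping the log standard deviation to $[\log \epsilon, -\log \epsilon]$ bounds the standard deviation itself to

$\sigma = e^{\text{log\_stddev}} \in [\epsilon,\ 1/\epsilon]$

which keeps subsequent divisions by $\sigma^2$ well conditioned.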
Example #8
    def parametrize(self, *, x, conditions):
        # Softplus to ensure alpha and beta >= 1
        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
        shape = (-1,) + self.action_spec.shape

        # Alpha
        alpha = self.alpha.apply(x=x)
        # epsilon < 1.0, hence negative
        alpha = tf.clip_by_value(t=alpha, clip_value_min=log_epsilon, clip_value_max=-log_epsilon)
        alpha = tf.math.softplus(features=alpha) + one
        if len(self.input_spec.shape) == 1:
            alpha = tf.reshape(tensor=alpha, shape=shape)

        # Beta
        beta = self.beta.apply(x=x)
        # epsilon < 1.0, hence negative
        beta = tf.clip_by_value(t=beta, clip_value_min=log_epsilon, clip_value_max=-log_epsilon)
        beta = tf.math.softplus(features=beta) + one
        if len(self.input_spec.shape) == 1:
            beta = tf.reshape(tensor=beta, shape=shape)

        # Alpha + Beta
        alpha_beta = tf.maximum(x=(alpha + beta), y=epsilon)

        # Log norm
        log_norm = tf.math.lgamma(x=alpha) + tf.math.lgamma(x=beta) - tf.math.lgamma(x=alpha_beta)

        return TensorDict(alpha=alpha, beta=beta, alpha_beta=alpha_beta, log_norm=log_norm)
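
The `log_norm` term is the log of the Beta-function normalizer, expressed via log-gamma functions (with $\alpha + \beta$ floored at $\epsilon$ in the code), which is exactly the `lgamma` expression above:

$\log B(\alpha, \beta) = \log\Gamma(\alpha) + \log\Gamma(\beta) - \log\Gamma(\alpha + \beta)$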
Example #9
    def next_step(
        self, *, arguments, x, deltas, improvement, last_improvement, base_value, estimated
    ):
        """
        Termination condition: max number of iterations, or no improvement for last step, or
        improvement less than acceptable ratio, or estimated value not positive.

        Args:
            x: Current solution estimate $x_{t-1}$.
            deltas: Current difference $x_t - x_{t-1}$.
            improvement: Current improvement $(f(x_t) - f(x'))$.
            last_improvement: Last improvement $(f(x_{t-1}) - f(x'))$.
            base_value: Value $f(x')$ at $x = x'$.
            estimated: Current estimated value at $x_t$.

        Returns:
            True if another iteration should be performed.
        """
        # Continue while current step is an improvement over last step
        zero_float = tf_util.constant(value=0.0, dtype='float')
        last_improvement = tf.math.maximum(x=last_improvement, y=zero_float)
        next_step = (improvement >= last_improvement)
        # Continue while estimated improvement is positive
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        next_step = tf.math.logical_and(x=next_step, y=(estimated > epsilon))
        # Continue while improvement ratio is below accept ratio, so not yet sufficient
        accept_ratio = self.accept_ratio.value()
        improvement_ratio = improvement / tf.math.maximum(x=estimated, y=epsilon)
        return tf.math.logical_and(x=next_step, y=(improvement_ratio < accept_ratio))
Example #10
    def retrieve_episodes(self, *, n):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')

        # Check whether memory contains at least one episode
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(
                tf.debugging.assert_greater_equal(x=self.episode_count, y=one))

        # Get start and limit index for most recent n episodes
        with tf.control_dependencies(control_inputs=assertions):
            n = tf.math.minimum(x=n, y=self.episode_count)

            # (Increment terminal of previous episode)
            start = self.terminal_indices[self.episode_count - n] + one
            limit = self.terminal_indices[self.episode_count] + one

            # Correct limit index if smaller than start index
            limit = limit + tf.where(
                condition=(limit < start), x=capacity, y=zero)

            # Most recent episode indices range
            indices = tf.range(start=start, limit=limit)
            indices = tf.math.mod(x=indices, y=capacity)

        return indices
Example #11
    def mode(self, *, parameters):
        beta, alpha_beta = parameters.get(('beta', 'alpha_beta'))

        action = beta / alpha_beta

        min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
        max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')

        return min_value + (max_value - min_value) * action
Example #12
    def action_value(self, *, parameters, action):
        mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev'))

        # Inverse bounded transformation
        if self.bounded_transform is not None:
            if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
                one = tf_util.constant(value=1.0, dtype='float')
                two = tf_util.constant(value=2.0, dtype='float')
                min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
                max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
                action = two * (action - min_value) / (max_value - min_value) - one

            if self.bounded_transform == 'tanh':
                clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float')
                action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip)
                action = tf.math.atanh(x=action)

        half = tf_util.constant(value=0.5, dtype='float')
        two = tf_util.constant(value=2.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        log_two_pi = tf_util.constant(value=(np.log(2.0 * np.pi)), dtype='float')
        # TODO: why no e here, but for entropy?

        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=epsilon)

        action_value = -half * sq_mean_distance / sq_stddev - two * log_stddev - log_two_pi

        # Probably not needed?
        # if self.bounded_transform == 'tanh':
        #     log_two = tf_util.constant(value=np.log(2.0), dtype='float')
        #     action_value -= two * (log_two - action - tf.math.softplus(features=(-two * action)))

        return action_value
Example #13
    def sample(self, *, parameters, temperature):
        alpha, beta, alpha_beta, log_norm = parameters.get(
            ('alpha', 'beta', 'alpha_beta', 'log_norm'))

        # Distribution parameter summaries
        def fn_summary():
            return tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)), \
                tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1))

        prefix = 'distributions/' + self.name
        names = (prefix + '-alpha', prefix + '-beta')
        dependencies = self.summary(label='distribution',
                                    name=names,
                                    data=fn_summary,
                                    step='timesteps')

        # Distribution parameter tracking
        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=alpha, axis=0)

        dependencies.extend(
            self.track(label='distribution', name='alpha', data=fn_tracking))

        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=beta, axis=0)

        dependencies.extend(
            self.track(label='distribution', name='beta', data=fn_tracking))

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        def fn_mode():
            # Deterministic: mean as action
            return beta / alpha_beta

        def fn_sample():
            # Non-deterministic: sample action using gamma distribution
            alpha_sample = tf.random.gamma(
                shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
            beta_sample = tf.random.gamma(
                shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))
            return beta_sample / tf.maximum(x=(alpha_sample + beta_sample),
                                            y=epsilon)

        action = tf.cond(pred=(temperature < epsilon),
                         true_fn=fn_mode,
                         false_fn=fn_sample)

        min_value = tf_util.constant(value=self.action_spec.min_value,
                                     dtype='float')
        max_value = tf_util.constant(value=self.action_spec.max_value,
                                     dtype='float')

        with tf.control_dependencies(control_inputs=dependencies):
            return min_value + (max_value - min_value) * action
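
The sampling branch relies on the standard Gamma-to-Beta construction: for independent $X \sim \mathrm{Gamma}(\alpha, 1)$ and $Y \sim \mathrm{Gamma}(\beta, 1)$,

$\frac{Y}{X + Y} \sim \mathrm{Beta}(\beta, \alpha)$

whose mean $\beta / (\alpha + \beta)$ is exactly the deterministic action in `fn_mode`; the `maximum` with $\epsilon$ only guards against division by zero.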
Example #14
    def log_probability(self, *, parameters, action):
        mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev'))

        # Inverse bounded transformation
        if self.bounded_transform is not None:
            if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
                one = tf_util.constant(value=1.0, dtype='float')
                two = tf_util.constant(value=2.0, dtype='float')
                min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
                max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
                action = two * (action - min_value) / (max_value - min_value) - one

            if self.bounded_transform == 'tanh':
                clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float')
                action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip)
                action = tf_util.cast(x=tf.math.atanh(x=tf_util.float32(x=action)), dtype='float')

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        half = tf_util.constant(value=0.5, dtype='float')
        two = tf_util.constant(value=2.0, dtype='float')
        half_log_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float')

        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=epsilon)

        log_prob = -half * sq_mean_distance / sq_stddev - log_stddev - half_log_two_pi

        if self.bounded_transform == 'tanh':
            log_two = tf_util.constant(value=np.log(2.0), dtype='float')
            log_prob -= two * (log_two - action - tf.math.softplus(features=(-two * action)))

        return log_prob
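
The final correction is the change-of-variables term for a tanh-squashed Gaussian: with pre-squash value $u$ (recovered via `atanh` above) and squashed action $a = \tanh(u)$,

$\log p(a) = \log p(u) - \log\bigl(1 - \tanh^2(u)\bigr) = \log p(u) - 2\bigl(\log 2 - u - \mathrm{softplus}(-2u)\bigr)$

which is the numerically stable form used in the last step.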
Example #15
    def reset(self):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        three = tf_util.constant(value=3, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')
        last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity)

        def correct_terminal():
            # Replace last observation terminal marker with abort terminal
            dependencies = list()
            two = tf_util.constant(value=2, dtype='int')
            sparse_delta = tf.IndexedSlices(values=two, indices=last_index)
            dependencies.append(self.buffers['terminal'].scatter_update(
                sparse_delta=sparse_delta))
            sparse_delta = tf.IndexedSlices(values=last_index,
                                            indices=(self.episode_count + one))
            dependencies.append(
                self.terminal_indices.scatter_update(
                    sparse_delta=sparse_delta))
            with tf.control_dependencies(control_inputs=dependencies):
                return self.episode_count.assign_add(delta=one,
                                                     read_value=False)

        last_terminal = tf.gather(params=self.buffers['terminal'],
                                  indices=last_index)
        is_incorrect = tf.math.equal(x=last_terminal, y=three)
        corrected = tf.cond(pred=is_incorrect,
                            true_fn=correct_terminal,
                            false_fn=tf.no_op)

        with tf.control_dependencies(control_inputs=(corrected, )):
            assertions = [corrected]
            if self.config.create_tf_assertions:
                # general check: all terminal indices true
                assertions.append(
                    tf.debugging.assert_equal(
                        x=tf.reduce_all(input_tensor=tf.gather(
                            params=tf.math.greater(x=self.buffers['terminal'],
                                                   y=zero),
                            indices=self.terminal_indices[:self.episode_count +
                                                          one])),
                        y=tf_util.constant(value=True, dtype='bool'),
                        message="Memory consistency check."))
                # general check: only terminal indices true
                assertions.append(
                    tf.debugging.assert_equal(
                        x=tf.math.count_nonzero(
                            input=self.buffers['terminal'],
                            dtype=tf_util.get_dtype(type='int')),
                        y=(self.episode_count + one),
                        message="Memory consistency check."))

        with tf.control_dependencies(control_inputs=assertions):
            return one < zero
Example #16
    def apply(self, *, x):
        is_inf = np.logical_or(np.isinf(self.min_value),
                               np.isinf(self.max_value))
        is_inf = tf_util.constant(value=is_inf, dtype='bool')
        min_value = tf_util.constant(value=self.min_value, dtype='float')
        max_value = tf_util.constant(value=self.max_value, dtype='float')

        return tf.where(condition=is_inf,
                        x=x,
                        y=(4.0 * (x - min_value) / (max_value - min_value) -
                           2.0))
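
For finite bounds this is simply an affine rescaling of the input range onto $[-2, 2]$, with unbounded inputs passed through unchanged via the `is_inf` mask:

$y = 4 \cdot \frac{x - x_{\min}}{x_{\max} - x_{\min}} - 2$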
Example #17
    def sample(self, *, parameters, temperature):
        alpha, beta, alpha_beta, log_norm = parameters.get(
            ('alpha', 'beta', 'alpha_beta', 'log_norm')
        )

        # Distribution parameter summaries
        def fn_summary():
            return tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)), \
                tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1))

        prefix = 'distributions/' + self.name
        dependencies = self.summary(
            label='distribution', name=(prefix + '-alpha', prefix + '-beta'), data=fn_summary,
            step='timesteps'
        )

        # Entropy summary
        def fn_summary():
            one = tf_util.constant(value=1.0, dtype='float')
            digamma_alpha = tf_util.cast(
                x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float'
            )
            digamma_beta = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
            digamma_alpha_beta = tf_util.cast(
                x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float'
            )
            entropy = log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
                (alpha_beta - one - one) * digamma_alpha_beta
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy', name=name, data=fn_summary, step='timesteps')
        )

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        # Deterministic: mean as action
        definite = beta / alpha_beta

        # Non-deterministic: sample action using gamma distribution
        alpha_sample = tf.random.gamma(shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
        beta_sample = tf.random.gamma(shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))

        sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=epsilon)

        action = tf.where(condition=(temperature < epsilon), x=definite, y=sampled)

        min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
        max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')

        with tf.control_dependencies(control_inputs=dependencies):
            return min_value + (max_value - min_value) * action
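
The entropy computed in the summary is the differential entropy of the Beta distribution, written with the digamma function $\psi$ and the `log_norm` parameter as $\log B(\alpha, \beta)$:

$H = \log B(\alpha, \beta) - (\alpha - 1)\,\psi(\alpha) - (\beta - 1)\,\psi(\beta) + (\alpha + \beta - 2)\,\psi(\alpha + \beta)$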
Example #18
    def apply(self, *, x, independent):
        dependencies = list()

        if independent:
            mean = self.moving_mean
            variance = self.moving_variance

        else:
            one = tf_util.constant(value=1.0, dtype='float')
            axes = (0, ) + tuple(1 + axis for axis in self.axes)

            decay = self.decay.value()
            batch_size = tf_util.cast(x=tf.shape(input=x)[0], dtype='float')
            decay = tf.math.pow(x=decay, y=batch_size)
            condition = tf.math.logical_or(x=self.after_first_call,
                                           y=tf.math.equal(x=batch_size, y=0))

            mean = tf.math.reduce_mean(input_tensor=x,
                                       axis=axes,
                                       keepdims=True)
            mean = tf.where(condition=condition,
                            x=(decay * self.moving_mean +
                               (one - decay) * mean),
                            y=mean)

            variance = tf.reduce_mean(input_tensor=tf.math.squared_difference(
                x=x, y=mean),
                                      axis=axes,
                                      keepdims=True)
            variance = tf.where(condition=condition,
                                x=(decay * self.moving_variance +
                                   (one - decay) * variance),
                                y=variance)

            with tf.control_dependencies(control_inputs=(mean, variance)):
                value = tf.math.logical_or(x=self.after_first_call,
                                           y=(batch_size > 0))
                dependencies.append(
                    self.after_first_call.assign(value=value,
                                                 read_value=False))

            mean = self.moving_mean.assign(value=mean)
            variance = self.moving_variance.assign(value=variance)

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=epsilon))

        with tf.control_dependencies(control_inputs=dependencies):
            x = (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(
                input=reciprocal_stddev)

        return x
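
The running statistics follow an exponential moving average whose effective decay depends on the batch size $B$ (after the first call; the very first batch uses the plain batch statistics):

$m \leftarrow d^{B}\, m + (1 - d^{B})\, \mathrm{mean}(x), \qquad v \leftarrow d^{B}\, v + (1 - d^{B})\, \mathrm{var}(x)$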
Example #19
    def kl_divergence(self, *, parameters1, parameters2):
        mean1, stddev1, log_stddev1 = parameters1.get(('mean', 'stddev', 'log_stddev'))
        mean2, stddev2, log_stddev2 = parameters2.get(('mean', 'stddev', 'log_stddev'))

        half = tf_util.constant(value=0.5, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        log_stddev_ratio = log_stddev2 - log_stddev1
        sq_mean_distance = tf.square(x=(mean1 - mean2))
        sq_stddev1 = tf.square(x=stddev1)
        sq_stddev2 = tf.maximum(x=tf.square(x=stddev2), y=epsilon)

        return log_stddev_ratio + half * (sq_stddev1 + sq_mean_distance) / sq_stddev2 - half
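
This is the closed-form KL divergence between two univariate Gaussians, applied element-wise, with $\sigma_2^2$ floored at $\epsilon$ for stability:

$\mathrm{KL}\bigl(\mathcal{N}(\mu_1, \sigma_1^2)\,\|\,\mathcal{N}(\mu_2, \sigma_2^2)\bigr) = \log\frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2}$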
Example #20
    def loss(self, *, states, horizons, internals, auxiliaries, actions,
             reward, policy, reference):
        if not self.early_reduce:
            reward = tf.expand_dims(input=reward, axis=1)

        if self.value == 'state':
            value = policy.states_value(states=states,
                                        horizons=horizons,
                                        internals=internals,
                                        auxiliaries=auxiliaries,
                                        reduced=self.early_reduce,
                                        return_per_action=False)
        elif self.value == 'action':
            value = policy.actions_value(states=states,
                                         horizons=horizons,
                                         internals=internals,
                                         auxiliaries=auxiliaries,
                                         actions=actions,
                                         reduced=self.early_reduce,
                                         return_per_action=False)

        difference = value - reward

        zero = tf_util.constant(value=0.0, dtype='float')
        half = tf_util.constant(value=0.5, dtype='float')

        huber_loss = self.huber_loss.value()
        skip_huber_loss = tf.math.equal(x=huber_loss, y=zero)

        def no_huber_loss():
            return half * tf.math.square(x=difference)

        def apply_huber_loss():
            inside_huber_bounds = tf.math.less_equal(
                x=tf.math.abs(x=difference), y=huber_loss)
            quadratic = half * tf.math.square(x=difference)
            linear = huber_loss * (tf.math.abs(x=difference) -
                                   half * huber_loss)
            return tf.where(condition=inside_huber_bounds,
                            x=quadratic,
                            y=linear)

        loss = tf.cond(pred=skip_huber_loss,
                       true_fn=no_huber_loss,
                       false_fn=apply_huber_loss)

        if not self.early_reduce:
            loss = tf.math.reduce_mean(input_tensor=loss, axis=1)

        return loss
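
With difference $d$ = value $-$ reward and threshold $\delta$ given by `huber_loss`, the two branches compute the standard Huber loss, falling back to plain squared error when $\delta = 0$:

$L(d) = \tfrac{1}{2} d^2 \ \text{ if } |d| \le \delta, \qquad L(d) = \delta\,\bigl(|d| - \tfrac{1}{2}\delta\bigr) \ \text{ otherwise}$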
Example #21
    def log_probability(self, *, parameters, action):
        alpha, beta, log_norm = parameters.get(('alpha', 'beta', 'log_norm'))

        min_value = tf_util.constant(value=self.action_spec.min_value,
                                     dtype='float')
        max_value = tf_util.constant(value=self.action_spec.max_value,
                                     dtype='float')

        action = (action - min_value) / (max_value - min_value)

        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        return tf.math.xlogy(x=(beta - one), y=(action + epsilon)) + \
            (alpha - one) * tf.math.log1p(x=(-action + epsilon)) - log_norm
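
Up to the $\epsilon$ terms added for numerical stability, this is the Beta log-density in the parametrization used here, with the action rescaled to $[0, 1]$ and $\log B(\alpha, \beta)$ supplied by `log_norm`:

$\log p(a) = (\beta - 1)\log a + (\alpha - 1)\log(1 - a) - \log B(\alpha, \beta)$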
Example #22
    def parametrize(self, *, x, conditions):
        log_epsilon = tf_util.constant(value=np.log(util.epsilon),
                                       dtype='float')
        shape = (-1, ) + self.action_spec.shape

        # Mean
        mean = self.mean.apply(x=x)
        if len(self.input_spec.shape) == 1:
            mean = tf.reshape(tensor=mean, shape=shape)

        # Log standard deviation
        if self.global_stddev:
            log_stddev = self.log_stddev
        else:
            log_stddev = self.log_stddev.apply(x=x)
            if len(self.input_spec.shape) == 1:
                log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

        # Clip log_stddev for numerical stability (epsilon < 1.0, hence negative)
        log_stddev = tf.clip_by_value(t=log_stddev,
                                      clip_value_min=log_epsilon,
                                      clip_value_max=-log_epsilon)

        # Standard deviation
        stddev = tf.exp(x=log_stddev)

        return TensorDict(mean=mean, stddev=stddev, log_stddev=log_stddev)
Example #23
    def mode(self, *, parameters, independent):
        probability = parameters['probability']

        # Distribution parameter summaries
        dependencies = list()
        if not independent:

            def fn_summary():
                axis = range(self.action_spec.rank + 1)
                return tf.math.reduce_mean(input_tensor=probability, axis=axis)

            name = 'distributions/' + self.name + '-probability'
            dependencies.extend(
                self.summary(label='distribution',
                             name=name,
                             data=fn_summary,
                             step='timesteps'))

        # Distribution parameter tracking
        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=probability, axis=0)

        dependencies.extend(
            self.track(label='distribution',
                       name='probability',
                       data=fn_tracking))

        with tf.control_dependencies(control_inputs=dependencies):
            return tf.greater_equal(x=probability,
                                    y=tf_util.constant(value=0.5,
                                                       dtype='float'))
Example #24
    def sample(self, *, states, horizons, internals, auxiliaries, temperature,
               independent):
        deterministic = tf_util.constant(value=False, dtype='bool')
        embedding, internals = self.network.apply(x=states,
                                                  horizons=horizons,
                                                  internals=internals,
                                                  deterministic=deterministic,
                                                  independent=independent)

        def function(name, distribution, temp):
            conditions = auxiliaries.get(name, default=TensorDict())
            parameters = distribution.parametrize(x=embedding,
                                                  conditions=conditions)
            return distribution.sample(parameters=parameters, temperature=temp)

        if isinstance(self.temperature, dict):
            actions = self.distributions.fmap(function=function,
                                              cls=TensorDict,
                                              with_names=True,
                                              zip_values=(temperature, ))
        else:
            actions = self.distributions.fmap(function=partial(
                function, temp=temperature),
                                              cls=TensorDict,
                                              with_names=True)

        return actions, internals
Example #25
            def fn_terminal():
                operations = list()

                # Reset internals
                def function(spec, initial):
                    return tf_util.constant(value=initial, dtype=spec.type)

                initials = self.internals_spec.fmap(
                    function=function, cls=TensorDict, zip_values=self.internals_init
                )
                for name, previous, initial in self.previous_internals.zip_items(initials):
                    sparse_delta = tf.IndexedSlices(values=initial, indices=parallel)
                    operations.append(previous.scatter_update(sparse_delta=sparse_delta))

                # Episode reward summaries (before episode reward reset / episodes increment)
                if self.summary_labels == 'all' or 'reward' in self.summary_labels:
                    with self.summarizer.as_default():
                        x = tf.gather(params=self.episode_reward, indices=parallel)
                        tf.summary.scalar(name='episode-reward', data=x, step=self.episodes)

                # Reset episode reward
                zero_float = tf_util.constant(value=0.0, dtype='float')
                sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel)
                operations.append(self.episode_reward.scatter_update(sparse_delta=sparse_delta))

                # Increment episodes counter
                operations.append(self.episodes.assign_add(delta=one, read_value=False))

                return tf.group(*operations)
Example #26
    def regularize(self):
        zero = tf_util.constant(value=0.0, dtype='float')

        module = self
        while module.l2_regularization is None:
            module = module.parent

        if len(self.this_trainable_variables) == 0 or \
                module.l2_regularization.is_constant(value=0.0):
            regularization_loss = zero

        else:
            l2_regularization = module.l2_regularization.value()

            def no_l2_regularization():
                return zero

            def apply_l2_regularization():
                l2_variables = list()
                for variable in self.this_trainable_variables:
                    variable = tf_util.cast(x=variable, dtype='float')
                    l2_variables.append(
                        tf.reduce_sum(input_tensor=tf.square(x=variable)))
                return l2_regularization * tf.math.add_n(inputs=l2_variables)

            skip_l2_regularization = tf.math.equal(x=l2_regularization, y=zero)
            regularization_loss = tf.cond(pred=skip_l2_regularization,
                                          true_fn=no_l2_regularization,
                                          false_fn=apply_l2_regularization)

        for module in self.this_submodules:
            if isinstance(module, Module) and module.is_trainable:
                regularization_loss += module.regularize()

        return regularization_loss
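
The applied penalty is the usual L2 regularization over this module's trainable variables, with $\lambda$ the (possibly scheduled) `l2_regularization` value, plus the recursively accumulated losses of trainable submodules:

$L_{\text{reg}} = \lambda \sum_{w \in \theta} \lVert w \rVert_2^2$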
Example #27
    def entropy(self, *, parameters):
        true_logit, false_logit, probability = parameters.get(
            ('true_logit', 'false_logit', 'probability'))

        one = tf_util.constant(value=1.0, dtype='float')

        return -probability * true_logit - (one - probability) * false_logit
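
Since `true_logit` and `false_logit` are $\log p$ and $\log(1 - p)$, this is the binary entropy

$H = -p \log p - (1 - p) \log(1 - p)$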
Example #28
    def state_value(self, *, parameters):
        log_stddev = parameters['log_stddev']

        half_log_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float')
        # TODO: why no e here, but for entropy?

        return -log_stddev - half_log_two_pi
Example #29
    def action_values(self, *, states, horizons, internals, auxiliaries,
                      actions):
        deterministic = tf_util.constant(value=True, dtype='bool')
        embedding, _ = self.network.apply(x=states,
                                          horizons=horizons,
                                          internals=internals,
                                          deterministic=deterministic,
                                          independent=True)
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        def function(name, distribution, action):
            if name is None:
                x = embedding.get('action-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-embedding', embedding['embedding'])
            conditions = auxiliaries.get(name, default=TensorDict())
            parameters = distribution.parametrize(x=x, conditions=conditions)
            return distribution.action_value(parameters=parameters,
                                             action=action)

        return self.distributions.fmap(function=function,
                                       cls=TensorDict,
                                       with_names=True,
                                       zip_values=actions)
Example #30
    def sample(self, *, parameters, temperature):
        true_logit, false_logit, probability = parameters.get(
            ('true_logit', 'false_logit', 'probability'))

        # Distribution parameter summaries
        def fn_summary():
            axis = range(self.action_spec.rank + 1)
            return tf.math.reduce_mean(input_tensor=probability, axis=axis)

        name = 'distributions/' + self.name + '-probability'
        dependencies = self.summary(label='distribution',
                                    name=name,
                                    data=fn_summary,
                                    step='timesteps')

        # Entropy summary
        def fn_summary():
            one = tf_util.constant(value=1.0, dtype='float')
            entropy = -probability * true_logit - (one -
                                                   probability) * false_logit
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy',
                         name=name,
                         data=fn_summary,
                         step='timesteps'))

        half = tf_util.constant(value=0.5, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        # Deterministic: true if >= 0.5
        definite = tf.greater_equal(x=probability, y=half)

        # Non-deterministic: sample true if >= uniform distribution
        e_true_logit = tf.math.exp(x=(true_logit / temperature))
        e_false_logit = tf.math.exp(x=(false_logit / temperature))
        probability = e_true_logit / (e_true_logit + e_false_logit)
        uniform = tf.random.uniform(shape=tf.shape(input=probability),
                                    dtype=tf_util.get_dtype(type='float'))
        sampled = tf.greater_equal(x=probability, y=uniform)

        with tf.control_dependencies(control_inputs=dependencies):
            return tf.where(condition=(temperature < epsilon),
                            x=definite,
                            y=sampled)
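
The temperature-scaled probability used for sampling can equivalently be written as a sigmoid of the scaled logit difference,

$p_T = \frac{e^{l_\text{true}/T}}{e^{l_\text{true}/T} + e^{l_\text{false}/T}} = \sigma\!\left(\frac{l_\text{true} - l_\text{false}}{T}\right)$

so as $T \to 0$ the sample approaches the deterministic threshold at $0.5$ used in the `definite` branch.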