Example #1
    def generate_signal(self, key, context):
        if key == 'action_values_all':
            utils = self.agent.get_utils(self, context)
            values = tf.identity(utils, name="{}_{}_values".format(self.agent.name, self.name))
            return values

        elif key == 'action_values':
            action_values = context.get_signal('action_values_all', self, gradient=True)
            actions = context.get_signal('actions')
            action_values = tf.reduce_sum(actions * action_values, axis=-1, keepdims=True)

            mask = context.get_signal('mask')
            label = "{}-estimated_action_value".format(self.display_name)
            context.add_recorded_value(label, masked_mean(action_values, mask))

            return action_values
        elif key == 'one_step_td_errors':
            rewards = context.get_signal('rewards')
            gamma = context.get_signal('gamma')
            action_values = context.get_signal('action_values', self, gradient=True)

            shifted_values = tf_roll(action_values, 1, fill=0.0, reverse=True, axis=0)

            one_step_estimate = rewards + gamma * shifted_values
            td_errors = one_step_estimate - action_values

            mask = context.get_signal('mask')
            label = "{}-one_step_td_error".format(self.display_name)
            context.add_recorded_value(label, masked_mean(td_errors, mask))

            return td_errors
        else:
            raise Exception()
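The examples in this listing all lean on two helpers, masked_mean and tf_roll, whose definitions are not shown. The sketch below illustrates the behaviour the examples appear to assume (time-major tensors of shape (T, batch, 1) and a 0/1 float mask marking valid steps); it is an illustrative sketch under those assumptions, not the library's actual implementation.

import tensorflow as tf


def masked_mean(x, mask, axis=None, keepdims=False):
    # Mean of `x` over the entries where `mask` is 1; padded time steps
    # (mask == 0) contribute to neither the sum nor the count.
    total = tf.reduce_sum(x * mask, axis=axis, keepdims=keepdims)
    count = tf.reduce_sum(mask, axis=axis, keepdims=keepdims)
    return total / tf.maximum(count, 1e-8)


def tf_roll(x, n, fill=0.0, reverse=False, axis=0):
    # Shift `x` by `n` slots along `axis`, filling the vacated slots with `fill`.
    # With reverse=True the shift is toward the start (x[t] <- x[t + n]), which
    # the TD-error examples use to line up V(s_{t+1}) with the reward r_t.
    size = tf.shape(x)[axis]
    pad = fill * tf.ones_like(tf.gather(x, tf.range(n), axis=axis))
    if reverse:
        kept = tf.gather(x, tf.range(n, size), axis=axis)
        return tf.concat([kept, pad], axis=axis)
    kept = tf.gather(x, tf.range(size - n), axis=axis)
    return tf.concat([pad, kept], axis=axis)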
Example #2
    def generate_signal(self, signal_key, context):
        if signal_key == "advantage":
            q = context.get_signal('action_values', self.q_estimator)
            v = context.get_signal('values', self.v_estimator)
            advantage = q - v
            advantage = self.post_process(advantage, context)

            mask = context.get_signal("mask")

            mean_advantage = masked_mean(advantage, mask)
            context.add_recorded_value("advantage", mean_advantage)

            mean_abs_advantage = masked_mean(tf.abs(advantage), mask)
            context.add_recorded_value("abs_advantage", mean_abs_advantage)

            return advantage
        elif signal_key == "advantage_all":
            q = context.get_signal('action_values_all', self.q_estimator)
            v = context.get_signal('values', self.v_estimator)
            v = v[..., None]
            advantage = q - v
            advantage = self.post_process(advantage, context)
            return advantage
        else:
            raise Exception("NotImplemented")
Example #3
    def post_process(self, advantage, context):
        if self.standardize:
            mask = context.get_signal('mask')
            mean = masked_mean(advantage, mask)
            _advantage = advantage - mean
            variance = masked_mean(_advantage**2, mask)
            std = tf.sqrt(variance)
            _advantage = tf.cond(std <= 0, lambda: _advantage, lambda: _advantage/std)
            advantage = mask * _advantage + (1-mask) * advantage
        return advantage
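The standardize branch above recentres and rescales the advantage only where the mask is 1, leaving padded time steps untouched. A small numeric illustration of the same computation (the values below are made up purely for illustration):

import numpy as np

# advantage and mask are (T, batch, 1); mask marks valid (non-padded) steps.
advantage = np.array([[[2.0]], [[4.0]], [[99.0]]])   # the last step is padding
mask = np.array([[[1.0]], [[1.0]], [[0.0]]])

mean = (advantage * mask).sum() / mask.sum()                           # masked mean = 3.0
std = np.sqrt((((advantage - mean) ** 2) * mask).sum() / mask.sum())   # masked std = 1.0
standardized = mask * (advantage - mean) / std + (1 - mask) * advantage
# standardized is [-1, 1, 99]: zero mean and unit variance on the valid
# steps, while the padded entry is passed through unchanged.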
Example #4
def mean_kl(p, q, obs, mask):
    """ `p` and `q` are instances of `Policy`. """
    # from tensorflow.python.ops.rnn import dynamic_rnn
    # kl_cell = KLCell(policy, prev_policy)
    # batch_size = tf.shape(obs)[1]
    # initial_state = kl_cell.zero_state(batch_size, tf.float32)

    # kl, _ = dynamic_rnn(
    #     kl_cell, obs, initial_state=initial_state,
    #     parallel_iterations=1, swap_memory=False,
    #     time_major=True)

    # return tf.reduce_mean(kl)

    # Code below requires that we know T at graph build time; need to do this in a python
    # while loop, because taking hessian_vector_products when the thing we are taking
    # the hessian of includes a tensorflow while loop is not currently supported
    batch_size = tf.shape(obs)[1]
    dtype = tf.float32
    p_state, q_state = p.zero_state(batch_size,
                                    dtype), q.zero_state(batch_size, dtype)

    kl = []
    T = int(obs.shape[0])
    for t in range(T):
        p_utils, p_state = p.build_update(obs[t, :, :], p_state)
        q_utils, q_state = q.build_update(obs[t, :, :], q_state)
        kl.append(p.build_kl(p_utils, q_utils))
    kl = tf.stack(kl)
    return masked_mean(kl, mask)
Example #5
    def build_update(self, context):
        self.delta = build_scheduled_value(self.delta_schedule, "delta")

        tvars = self.trainable_variables(for_opt=True)
        self.gradient = tf.gradients(context.objective, tvars)

        mask = context.get_signal('mask')
        kl = context.get_signal('kl', self.policy)

        mean_kl = masked_mean(kl, mask)
        self.fv_product = HessianVectorProduct(mean_kl, tvars)

        self.grad_norm_pure = tf.placeholder(tf.float32,
                                             shape=(),
                                             name="_grad_norm_pure")
        self.grad_norm_natural = tf.placeholder(tf.float32,
                                                shape=(),
                                                name="_grad_norm_natural")
        self.step_norm = tf.placeholder(tf.float32,
                                        shape=(),
                                        name="_step_norm")

        context.add_recorded_values(grad_norm_pure=self.grad_norm_pure,
                                    grad_norm_natural=self.grad_norm_natural,
                                    step_norm=self.step_norm,
                                    train_only=True)
Example #6
    def build_graph(self, context):
        adv_times_ratio = context.get_signal("adv_times_ratio", self, gradient=True)
        mask = context.get_signal("mask")
        objective = masked_mean(adv_times_ratio, mask)

        label = "{}-policy_gradient_objective".format(self.policy.display_name)
        context.add_recorded_value(label, objective)

        return objective
Example #7
    def build_graph(self, context):
        loss = context.get_signal("loss", self, gradient=True)
        mask = context.get_signal("mask")
        objective = -masked_mean(loss, mask)

        label = "{}-differentiable_objective".format(self.policy.display_name)
        context.add_recorded_value(label, objective)

        return objective
Example #8
    def build_graph(self, context):
        entropy = context.get_signal('entropy', self, gradient=True)
        mask = context.get_signal('mask')
        objective = masked_mean(entropy, mask)

        label = "{}-entropy".format(self.policy.display_name)
        context.add_recorded_value(label, objective)

        return objective
Example #9
    def build_graph(self, context):
        td_error = context.get_signal("td_error", self)
        squared_td_error = context.get_signal("squared_td_error", self, gradient=True)

        mask = context.get_signal('mask')
        weights = context.get_signal('weights')

        if self.use_weights:
            td_error *= weights
            squared_td_error *= weights

        mean_td_error = masked_mean(td_error, mask)

        label = "{}-opt-mean_abs_td_error".format(self.value_function.display_name)
        context.add_recorded_value(label, tf.abs(mean_td_error))

        mean_squared_td_error = masked_mean(squared_td_error, mask)

        label = "{}-opt-mean_squared_td_error".format(self.value_function.display_name)
        context.add_recorded_value(label, mean_squared_td_error)

        return -mean_squared_td_error
Example #10
    def generate_signal(self, key, context, **kwargs):
        if key == 'values':
            utils = self.agent.get_utils(self, context)
            values = tf.identity(utils, name="{}_{}_values".format(self.agent.name, self.name))

            mask = context.get_signal('mask')
            label = "{}-estimated_value".format(self.display_name)
            context.add_recorded_value(label, masked_mean(values, mask))

            return values

        elif key == 'one_step_td_errors':
            rewards = context.get_signal('rewards')
            gamma = context.get_signal('gamma')
            c = kwargs.get('c', None)
            rho = context.get_signal('rho', self.policy, c=c)
            values = context.get_signal('values', self, gradient=True)

            shifted_values = tf_roll(values, 1, fill=0.0, reverse=True, axis=0)

            one_step_estimate = rho * (rewards + gamma * shifted_values)
            td_errors = one_step_estimate - values

            mask = context.get_signal('mask')
            label = "{}-one_step_td_error".format(self.display_name)
            context.add_recorded_value(label, masked_mean(td_errors, mask))

            return td_errors

        elif key == 'monte_carlo_td_errors':
            discounted_returns = context.get_signal('discounted_returns', self.policy)
            values = context.get_signal('values', self, gradient=True)

            return discounted_returns[:-1, ...] - values[:-1, ...]

        else:
            raise Exception()
Example #11
    def generate_signal(self, key, context, **kwargs):
        if key == 'log_probs':
            utils = self.agent.get_utils(self, context)
            actions = context.get_signal('actions')
            return self.action_selection.log_probs(utils, actions,
                                                   self.exploration)
        elif key == 'entropy':
            utils = self.agent.get_utils(self, context)
            return self.action_selection.entropy(utils, self.exploration)
        elif key == 'samples':
            utils = self.agent.get_utils(self, context)
            return self.action_selection.sample(utils, self.exploration)
        elif key == 'kl':
            raise Exception("NotImplemented")
        elif key in ['monte_carlo_values', 'monte_carlo_action_values']:
            c = kwargs.get('c', None)
            rho = context.get_signal('rho', self, c=c)
            rewards = context.get_signal('rewards')

            if key == 'monte_carlo_action_values':
                rho = tf_roll(rho, 1, fill=1.0, reverse=True)

            gamma = context.get_signal('gamma')

            elems = (tf.reverse(rho, axis=[0]), tf.reverse(rewards, axis=[0]))

            initializer = tf.zeros_like(rewards[0, ...])

            if key == 'monte_carlo_action_values':
                func = _DoWeightingActionValue(gamma)
            else:
                func = _DoWeightingValue(gamma)

            returns = tf.scan(
                func,
                elems=elems,
                initializer=initializer,
            )

            returns = tf.reverse(returns, axis=[0])
            return returns
        elif key == 'average_monte_carlo_values':
            values = context.get_signal('monte_carlo_values', self, **kwargs)
            average = tf.reduce_mean(values, axis=1, keepdims=True)
            average += tf.zeros_like(values)
            return average
        elif key == 'importance_weights':
            pi_log_probs = context.get_signal("log_probs", self)
            mu_log_probs = context.get_signal("mu_log_probs")
            importance_weights = tf.exp(pi_log_probs - mu_log_probs)

            label = "{}-mean_importance_weight".format(self.display_name)
            mask = context.get_signal("mask")
            context.add_recorded_value(label,
                                       masked_mean(importance_weights, mask))

            return importance_weights
        elif key == 'rho':
            c = kwargs.get('c', None)
            importance_weights = context.get_signal("importance_weights", self)

            if c is not None:
                if c <= 0:
                    rho = importance_weights
                else:
                    rho = tf.minimum(importance_weights, c)
            else:
                rho = tf.ones_like(importance_weights)

            label = "{}-mean_rho_c_{}".format(self.display_name, c)
            mask = context.get_signal("mask")
            context.add_recorded_value(label, masked_mean(rho, mask))

            return rho
        else:
            raise Exception("NotImplemented")
Example #12
    def generate_signal(self, signal_key, context):
        if signal_key == "action_values" and self.to_action_value:
            pass
        elif signal_key == "values" and not self.to_action_value:
            pass
        else:
            raise Exception("NotImplemented")

        rewards = context.get_signal("rewards")
        rho = context.get_signal("rho", self.policy, c=self.importance_c)

        if self.from_action_value:
            if isinstance(self.policy, DiscretePolicy):
                pi_log_probs_all = context.get_signal("log_probs_all", self.policy)
                pi_probs = tf.exp(pi_log_probs_all)
                action_values = context.get_signal("action_values", self.value_function)
                values = tf.reduce_sum(pi_probs * action_values, axis=-1, keepdims=True)
            else:
                action_values = context.get_signal("action_values", self.value_function)
                values = action_values * rho
        else:
            values = context.get_signal("values", self.value_function)

        R = rewards
        V = tf_roll(values, 1, fill=0.0, reverse=True)
        RHO = rho

        # if context.truncated_rollouts:
        #     R = R[:-1, ...]
        #     V = V[:-1, ...]
        #     RHO = RHO[:-1, ...]

        if self.to_action_value:
            RHO = tf_roll(RHO, 1, fill=1.0, reverse=True)

        gamma = context.get_signal("gamma")
        retrace_cell = RetraceCell(
            rewards.shape[-1], gamma, self.lmbda, self.to_action_value)

        retrace_input = (
            tf.reverse(RHO, axis=[0]),
            tf.reverse(R, axis=[0]),
            tf.reverse(V, axis=[0]),
        )

        (retrace, one_step_estimate, adjustment), _ = dynamic_rnn(
            retrace_cell, retrace_input,
            initial_state=V[-1, ...],
            parallel_iterations=1, swap_memory=False, time_major=True)

        one_step_estimate = tf.reverse(one_step_estimate, axis=[0])
        adjustment = tf.reverse(adjustment, axis=[0])
        retrace = tf.reverse(retrace, axis=[0])

        # if context.truncated_rollouts:
        #     retrace = tf.concat([retrace, V[-1, ...]], axis=0)

        mask = context.get_signal("mask")
        label = "{}-one_step_estimate".format(self.name)
        context.add_recorded_value(label, masked_mean(one_step_estimate, mask))
        label = "{}-adjustment".format(self.name)
        context.add_recorded_value(label, masked_mean(adjustment, mask))
        label = "{}-retrace".format(self.name)
        context.add_recorded_value(label, masked_mean(retrace, mask))

        return retrace
Example #13
    def build_core_signals(self):
        self._signals['mask'] = tf.placeholder(tf.float32,
                                               shape=(cfg.T, None, 1),
                                               name="_mask")
        self._signals['done'] = tf.placeholder(tf.float32,
                                               shape=(cfg.T, None, 1),
                                               name="_done")

        self._signals['all_obs'] = tf.placeholder(
            tf.float32,
            shape=(cfg.T + 1 if cfg.T is not None else None, None) +
            self.obs_shape,
            name="_all_obs")

        # observations that we learn about
        self._signals['obs'] = tf.identity(self._signals['all_obs'][:-1, ...],
                                           name="_obs")

        # observations that we use as targets
        self._signals['target_obs'] = tf.identity(
            self._signals['all_obs'][1:, ...], name="_target_obs")

        self._signals['actions'] = tf.placeholder(tf.float32,
                                                  shape=(cfg.T, None) +
                                                  self.action_shape,
                                                  name="_actions")
        self._signals['gamma'] = tf.constant(self.gamma)
        self._signals['batch_size'] = tf.shape(self._signals['obs'])[1]
        self._signals['batch_size_float'] = tf.cast(
            self._signals['batch_size'], tf.float32)

        self._signals['rewards'] = tf.placeholder(tf.float32,
                                                  shape=(cfg.T, None, 1),
                                                  name="_rewards")
        self._signals['returns'] = tf.cumsum(self._signals['rewards'],
                                             axis=0,
                                             reverse=True,
                                             name="_returns")
        self._signals['reward_per_ep'] = tf.reduce_mean(tf.reduce_sum(
            self._signals['rewards'], axis=0),
                                                        name="_reward_per_ep")

        self.add_recorded_values(reward_per_ep=self._signals['reward_per_ep'])

        self._signals['mode'] = tf.placeholder(tf.string, ())

        self._signals['weights'] = tf.placeholder(tf.float32,
                                                  shape=(cfg.T, None, 1),
                                                  name="_weights")

        T = tf.shape(self._signals['mask'])[0]
        discount_matrix = tf_discount_matrix(self.gamma, T)
        discounted_returns = tf.tensordot(discount_matrix,
                                          self._signals['rewards'],
                                          axes=1,
                                          name="_discounted_returns")
        self._signals['discounted_returns'] = discounted_returns

        mean_returns = masked_mean(discounted_returns,
                                   self._signals['mask'],
                                   axis=1,
                                   keepdims=True)
        mean_returns += tf.zeros_like(discounted_returns)
        self._signals['average_discounted_returns'] = mean_returns

        # off-policy
        self._signals['mu_utils'] = tf.placeholder(tf.float32,
                                                   shape=(
                                                       cfg.T,
                                                       None,
                                                   ) + self.mu.param_shape,
                                                   name="_mu_log_probs")
        self._signals['mu_exploration'] = tf.placeholder(
            tf.float32, shape=(None, ), name="_mu_exploration")
        self._signals['mu_log_probs'] = tf.placeholder(tf.float32,
                                                       shape=(cfg.T, None, 1),
                                                       name="_mu_log_probs")

        for obj in self.rl_objects:
            obj.build_core_signals(self)
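The discounted_returns signal above is produced by tf_discount_matrix, which is not shown here. Given the way it is fed into tf.tensordot with the time-major rewards, the examples appear to assume an upper-triangular matrix of discount factors; the sketch below is an illustration under that assumption, not the library's own code.

def tf_discount_matrix(gamma, T):
    # D[t, s] = gamma ** (s - t) for s >= t and 0 otherwise, so that
    # tf.tensordot(D, rewards, axes=1)[t] = sum_{s >= t} gamma**(s - t) * rewards[s],
    # i.e. the discounted return-to-go from time step t.
    idx = tf.cast(tf.range(T), tf.float32)
    exponents = idx[None, :] - idx[:, None]        # exponents[t, s] = s - t
    upper = tf.cast(exponents >= 0, tf.float32)    # keep only s >= t
    return upper * tf.pow(tf.cast(gamma, tf.float32), tf.maximum(exponents, 0.0))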
Example #14
    def build_graph(self, context):
        prev_values = context.get_signal('prev_values', self)
        values = context.get_signal('values', self.value_function, gradient=True)
        variance = context.get_signal('variance', self)
        targets = context.get_signal('values', self.target_generator)

        if self.direct:
            std = tf.sqrt(variance)
            constrained_values = tf.clip_by_value(
                values, prev_values - std * self.epsilon, prev_values + std * self.epsilon)
            objective = -(constrained_values - targets)**2
            divergence = tf.abs(constrained_values - prev_values)

            if self.use_weights:
                weights = context.get_signal('weights')
                objective *= weights

            mask = context.get_signal("mask")

            mean_divergence = masked_mean(tf.reduce_mean(divergence, axis=-1, keepdims=True), mask)
            label = "{}-opt-mean_ve_divergence".format(self.value_function.display_name)
            context.add_recorded_value(label, mean_divergence)

            objective = masked_mean(tf.reduce_mean(objective, axis=-1, keepdims=True), mask)
            label = "{}-opt-ve_direct_objective".format(self.value_function.display_name)
            context.add_recorded_value(label, objective)

            td_error = context.get_signal("td_error", self)
            squared_td_error = context.get_signal("squared_td_error", self)

            mean_td_error = masked_mean(td_error, mask)
            label = "{}-opt-mean_abs_td_error".format(self.value_function.display_name)
            context.add_recorded_value(label, tf.abs(mean_td_error))

            mean_squared_td_error = masked_mean(squared_td_error, mask)
            label = "{}-opt-mean_squared_td_error".format(self.value_function.display_name)
            context.add_recorded_value(label, mean_squared_td_error)

            return objective

        else:
            clipped_ratio = None
            if self.n_samples == 0:
                ratio = tf.exp(0.5 * (2 * targets - values - prev_values) * (values - prev_values) / variance)

                # prev_advantage = (values - targets) ** 2 + variance
                prev_advantage = (prev_values - targets) ** 2 + variance

                if self.epsilon is None:
                    adv_times_ratio = ratio * prev_advantage
                else:
                    clipped_ratio = tf.clip_by_value(ratio, 1-self.epsilon, 1+self.epsilon)
                    adv_times_ratio = tf.minimum(
                        prev_advantage * ratio,
                        prev_advantage * clipped_ratio
                    )
            else:
                T = tf.shape(values)[0]
                batch_size = tf.shape(values)[1]
                samples = tf.random_normal((T, batch_size, self.n_samples)) * tf.sqrt(variance) + prev_values
                ratio = tf.exp(0.5 * (2 * samples - values - prev_values) * (values - prev_values) / variance)
                prev_advantage = (prev_values - targets) ** 2 + variance - (samples - targets)**2

                if self.epsilon is None:
                    adv_times_ratio = ratio * prev_advantage
                else:
                    clipped_ratio = tf.clip_by_value(ratio, 1-self.epsilon, 1+self.epsilon)
                    adv_times_ratio = tf.minimum(
                        prev_advantage * ratio,
                        prev_advantage * clipped_ratio
                    )

            if self.use_weights:
                weights = context.get_signal('weights')
                adv_times_ratio *= weights

            mask = context.get_signal("mask")

            mean_ratio = masked_mean(tf.reduce_mean(ratio, axis=-1, keepdims=True), mask)
            label = "{}-opt-mean_ve_ratio".format(self.value_function.display_name)
            context.add_recorded_value(label, mean_ratio)

            if clipped_ratio is not None:
                mean_clipped_ratio = masked_mean(tf.reduce_mean(clipped_ratio, axis=-1, keepdims=True), mask)
                label = "{}-opt-mean_ve_clipped_ratio".format(self.value_function.display_name)
                context.add_recorded_value(label, mean_clipped_ratio)

            mean_advantage = masked_mean(tf.reduce_mean(prev_advantage, axis=-1, keepdims=True), mask)
            label = "{}-opt-mean_ve_advantage".format(self.value_function.display_name)
            context.add_recorded_value(label, mean_advantage)

            objective = masked_mean(tf.reduce_mean(adv_times_ratio, axis=-1, keepdims=True), mask)
            label = "{}-opt-ve_objective".format(self.value_function.display_name)
            context.add_recorded_value(label, objective)

            td_error = context.get_signal("td_error", self)
            squared_td_error = context.get_signal("squared_td_error", self)

            mean_td_error = masked_mean(td_error, mask)
            label = "{}-opt-mean_abs_td_error".format(self.value_function.display_name)
            context.add_recorded_value(label, tf.abs(mean_td_error))

            mean_squared_td_error = masked_mean(squared_td_error, mask)
            label = "{}-opt-mean_squared_td_error".format(self.value_function.display_name)
            context.add_recorded_value(label, mean_squared_td_error)

            return objective
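The ratio computed in the else-branch above looks like the closed form of a Gaussian likelihood ratio N(targets | values, variance) / N(targets | prev_values, variance): with a shared variance, the difference of log-densities reduces to 0.5 * (2*t - v - v_prev) * (v - v_prev) / variance. A quick numeric check of that identity (the numbers are arbitrary):

import numpy as np

def gaussian_log_density(x, mean, variance):
    return -0.5 * (x - mean) ** 2 / variance - 0.5 * np.log(2 * np.pi * variance)

t, v, prev_v, var = 1.3, 0.7, 0.4, 2.0
direct = np.exp(gaussian_log_density(t, v, var) - gaussian_log_density(t, prev_v, var))
closed_form = np.exp(0.5 * (2 * t - v - prev_v) * (v - prev_v) / var)
assert np.isclose(direct, closed_form)    # both equal exp(0.1125) here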
Example #15
    def generate_signal(self, signal_key, context, **kwargs):
        if signal_key == "prev_log_probs":
            self.log_probs = context.get_signal('log_probs', self.policy)
            self.prev_log_probs = tf.placeholder(tf.float32, shape=self.log_probs.shape, name="_prev_log_probs")
            return self.prev_log_probs
        elif signal_key == "prev_advantage":
            self.advantage = context.get_signal('advantage', self.advantage_estimator)
            self.prev_advantage = tf.placeholder(tf.float32, shape=self.advantage.shape, name="_prev_advantage")
            return self.prev_advantage
        elif signal_key == 'importance_weights':
            pi_log_probs = context.get_signal("prev_log_probs", self)
            mu_log_probs = context.get_signal("mu_log_probs")
            importance_weights = tf.exp(pi_log_probs - mu_log_probs)

            label = "{}-mean_importance_weight".format(self.name)
            mask = context.get_signal("mask")
            context.add_recorded_value(label, masked_mean(importance_weights, mask))

            return importance_weights
        elif signal_key == "rho":
            c = kwargs.get('c', None)
            importance_weights = context.get_signal("importance_weights", self)

            if c is not None:
                if c <= 0:
                    rho = importance_weights
                else:
                    rho = tf.minimum(importance_weights, c)
            else:
                rho = tf.ones_like(importance_weights)

            label = "{}-mean_rho_c_{}".format(self.name, c)
            mask = context.get_signal("mask")
            context.add_recorded_value(label, masked_mean(rho, mask))

            return rho

        elif signal_key == "adv_times_ratio":
            log_probs = context.get_signal('log_probs', self.policy, gradient=True)
            prev_log_probs = context.get_signal('prev_log_probs', self)

            ratio = tf.exp(log_probs - prev_log_probs)

            prev_advantage = context.get_signal('prev_advantage', self)

            if self.epsilon is None or self.epsilon <= 0:
                adv_times_ratio = ratio * prev_advantage
            else:
                adv_times_ratio = tf.minimum(
                    prev_advantage * ratio,
                    prev_advantage * tf.clip_by_value(ratio, 1-self.epsilon, 1+self.epsilon))

            if self.use_weights:
                weights = context.get_signal('weights')
                adv_times_ratio *= weights

            rho = context.get_signal('rho', self, c=self.importance_c)
            adv_times_ratio *= rho

            return adv_times_ratio
        else:
            raise Exception("NotImplemented")