Example #1
    def build_graph(self, input_type, model):
        # pylint: disable=W0201
        self.state_ph = tf.placeholder(input_type,
                                       name='state',
                                       shape=(None, *self.state_dim))
        self.old_logp_ph = tf.placeholder(tf.float32,
                                          name='old_log_p',
                                          shape=(None, 1))
        self.adv_ph = tf.placeholder(tf.float32,
                                     name='advantage',
                                     shape=(None, 1))
        self.old_v_ph = tf.placeholder(tf.float32,
                                       name='old_v',
                                       shape=(None, 1))
        self.target_v_ph = tf.placeholder(tf.float32,
                                          name='target_value',
                                          shape=(None, 1))

        pi_latent, self.out_v = model(self.state_ph)

        if self.action_type == 'Categorical':
            self.behavior_action_ph = tf.placeholder(tf.int32,
                                                     name='behavior_action',
                                                     shape=(None, ))
            dist_param = pi_latent
        elif self.action_type == 'DiagGaussian':
            # FIXME: add input-dependent log_std logic
            self.behavior_action_ph = tf.placeholder(tf.float32,
                                                     name='real_action',
                                                     shape=(None,
                                                            self.action_dim))
            log_std = tf.get_variable('pi_logstd',
                                      shape=(1, self.action_dim),
                                      initializer=tf.zeros_initializer())
            dist_param = tf.concat([pi_latent, pi_latent * 0.0 + log_std],
                                   axis=-1)
        else:
            raise NotImplementedError(
                'action type: {} does not match any implemented distribution.'.
                format(self.action_type))

        self.dist.init_by_param(dist_param)
        self.action = self.dist.sample()
        self.action_log_prob = self.dist.log_prob(self.action)
        self.actor_var = TFVariables([self.action_log_prob, self.out_v],
                                     self.sess)

        self.actor_loss = actor_loss_with_entropy(self.dist, self.adv_ph,
                                                  self.old_logp_ph,
                                                  self.behavior_action_ph,
                                                  self.clip_ratio,
                                                  self.ent_coef)
        self.critic_loss = critic_loss(self.target_v_ph, self.out_v,
                                       self.old_v_ph, self.vf_clip)
        self.loss = self.actor_loss + self.critic_loss_coef * self.critic_loss
        self.train_op = self.build_train_op(self.loss)

        self.sess.run(tf.global_variables_initializer())
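
The graph above wires a PPO-style objective, but `actor_loss_with_entropy` and `critic_loss` are defined elsewhere. As a point of reference, a minimal sketch of a clipped-surrogate actor loss with an entropy bonus, written against the TF1 API used above, could look like the following; the body is an assumption for illustration, not the project's actual implementation:

import tensorflow as tf  # TF1-style API, as in the snippet above

def actor_loss_with_entropy(dist, adv_ph, old_logp_ph, behavior_action_ph,
                            clip_ratio, ent_coef):
    # Hypothetical PPO clipped-surrogate loss. Assumes `dist` exposes
    # log_prob() and entropy(), as the distribution object above appears to.
    logp = dist.log_prob(behavior_action_ph)
    ratio = tf.exp(logp - tf.squeeze(old_logp_ph, axis=-1))
    adv = tf.squeeze(adv_ph, axis=-1)
    surrogate = tf.minimum(
        ratio * adv,
        tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv)
    entropy = tf.reduce_mean(dist.entropy())
    return -tf.reduce_mean(surrogate) - ent_coef * entropy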
Example #2
        def split_batches(tensor, drop_last=False):
            # Split the leading dimension into [batch_count, batch_step] blocks.
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step],
                           tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap the batch and time axes: [B, T, ...] -> [T, B, ...]
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res
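
To make the layout explicit, the same reshaping trick can be run standalone with concrete numbers: a flat leading dimension of 12 with `batch_step = 4` becomes `[4, 3, ...]` after the transpose. The snippet below is illustrative only and assumes the TF1 session API used elsewhere on this page:

import numpy as np
import tensorflow as tf  # TF1-style session API

batch_step = 4                                    # illustrative value
data = np.arange(12 * 3, dtype=np.float32).reshape(12, 3)

tensor = tf.constant(data)
batch_count = tf.shape(tensor)[0] // batch_step   # 12 // 4 = 3
reshaped = tf.reshape(
    tensor,
    tf.concat([[batch_count, batch_step], tf.shape(tensor)[1:]], axis=0))
swapped = tf.transpose(reshaped, [1, 0, 2])       # swap the first two axes

with tf.Session() as sess:
    print(sess.run(tf.shape(swapped)))            # -> [4 3 3]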
Example #3
    def build_train_graph(self):
        self.obs = tf.placeholder(self.obs_type, name="obs",
                                  shape=(None, ) + tuple(self.state_dim))
        self.action = tf.placeholder(tf.int32, name="action",
                                     shape=(None, self.td_step))
        target_value_shape = (None, ) + (1 + self.td_step, self.value_support_size)
        self.target_value = tf.placeholder(tf.float32, name="value",
                                           shape=target_value_shape)
        self.target_reward = tf.placeholder(tf.float32, name="reward",
                                            shape=(None, ) + (1 + self.td_step, self.reward_support_size))
        self.target_policy = tf.placeholder(tf.float32, name="policy",
                                            shape=(None, ) + (1 + self.td_step, self.action_dim))
        self.loss_weights = tf.placeholder(tf.float32, name="loss_weights", shape=(None, 1))

        hidden_state = self.representation_network(self.obs)
        policy_logits, value = self.policy_network(hidden_state)

        loss = cross_entropy(policy_logits, self.target_policy[:, 0], self.loss_weights)
        loss += cross_entropy(value, self.target_value[:, 0], self.loss_weights)

        # Unroll the dynamics network for td_step steps; scale each step's loss
        # so every unroll step contributes equally to the gradient.
        gradient_scale = 1.0 / self.td_step
        for i in range(self.td_step):
            action = tf.one_hot(self.action[:, i], self.action_dim)
            action = tf.reshape(action, (-1, self.action_dim,))
            conditioned_state = tf.concat((hidden_state, action), axis=-1)
            hidden_state, reward = self.dynamic_network(conditioned_state)
            policy_logits, value = self.policy_network(hidden_state)
            # Halve the gradient flowing back through the recurrent hidden state.
            hidden_state = scale_gradient(hidden_state, 0.5)

            step_loss = cross_entropy(reward, self.target_reward[:, i], self.loss_weights)
            step_loss += cross_entropy(policy_logits, self.target_policy[:, i + 1], self.loss_weights)
            step_loss += cross_entropy(value, self.target_value[:, i + 1], self.loss_weights)
            loss += scale_gradient(step_loss, gradient_scale)

        # L2 weight decay over all model weights.
        for weights in self.full_model.get_weights():
            loss += self.weight_decay * tf.nn.l2_loss(weights)
        self.loss = loss
        self.train_op = self.optimizer.minimize(loss)
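
The unroll above relies on a `scale_gradient` helper that is not shown in this snippet. A common way to implement it in TF1 (an assumption about the helper, not necessarily the exact code used here) is to mix the tensor with a `stop_gradient` copy, which leaves the forward value unchanged and scales only the backward pass:

import tensorflow as tf

def scale_gradient(tensor, scale):
    # Forward pass: returns `tensor` unchanged.
    # Backward pass: the incoming gradient is multiplied by `scale`,
    # which is what the 0.5 state scaling and 1/td_step loss scaling need.
    return tensor * scale + tf.stop_gradient(tensor) * (1.0 - scale)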
Example #4
def from_logic_outputs(behaviour_policy_logic_outputs,
                       target_policy_logic_outputs,
                       actions,
                       discounts,
                       rewards,
                       values,
                       bootstrap_value,
                       clip_importance_sampling_threshold=1.0,
                       clip_pg_importance_sampling_threshold=1.0):
    """
    Calculate vtrace with logic outputs.

    :param behaviour_policy_logic_outputs: behaviour_policy_logic_outputs
    :param target_policy_logic_outputs: target_policy_logic_outputs
    :param actions:
    :param discounts:
    :param rewards:
    :param values:
    :param bootstrap_value:
    :param clip_importance_sampling_threshold:
    :param clip_pg_importance_sampling_threshold:
    :return:
    """
    behaviour_policy_logic_outputs = tf.convert_to_tensor(
        behaviour_policy_logic_outputs, dtype=tf.float32)
    target_policy_logic_outputs = tf.convert_to_tensor(
        target_policy_logic_outputs, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)

    # support [T, B, Action_dimension]
    behaviour_policy_logic_outputs.shape.assert_has_rank(3)
    target_policy_logic_outputs.shape.assert_has_rank(3)
    actions.shape.assert_has_rank(2)

    target_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=target_policy_logic_outputs, labels=actions)

    behaviour_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=behaviour_policy_logic_outputs, labels=actions)

    # importance sampling weights: rho_t = exp(log pi_target(a_t|s_t) - log pi_behaviour(a_t|s_t))
    importance_sampling_weights = tf.exp(target_log_prob - behaviour_log_prob)

    clipped_importance_sampling_weight = tf.minimum(
        clip_importance_sampling_threshold, importance_sampling_weights)
    clipped_pg_importance_sampling_weight = tf.minimum(
        clip_pg_importance_sampling_threshold, importance_sampling_weights)

    # trace-cutting coefficient c_t = min(1, rho_t)
    coefficient = tf.minimum(1.0, importance_sampling_weights)

    next_values = tf.concat(
        [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)

    # temporal-difference terms: delta_t = rho_t * (r_t + gamma_t * V(s_{t+1}) - V(s_t))
    deltas = clipped_importance_sampling_weight * (
        rewards + discounts * next_values - values)
    sequences = (deltas, discounts, coefficient)

    # accumulate the V-trace corrections with tf.scan, scanning the sequence backwards (reverse=True)
    def scan_fn(cumulative_value, sequence_item):
        _delta, _discount, _coefficient = sequence_item
        return _delta + _discount * _coefficient * cumulative_value

    last_values = tf.zeros_like(bootstrap_value)
    temporal_difference = tf.scan(
        fn=scan_fn,
        elems=sequences,
        initializer=last_values,
        parallel_iterations=1,
        back_prop=False,
        reverse=True,
    )

    value_of_states = tf.add(temporal_difference, values)
    # Advantage for policy gradient.
    value_of_next_state = tf.concat(
        [value_of_states[1:],
         tf.expand_dims(bootstrap_value, 0)], axis=0)
    pg_advantages = clipped_pg_importance_sampling_weight * (
        rewards + discounts * value_of_next_state - values)

    value_of_states = tf.stop_gradient(value_of_states)
    pg_advantages = tf.stop_gradient(pg_advantages)
    return value_of_states, pg_advantages
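
For a quick shape check, `from_logic_outputs` can be called with toy tensors; with 3 time steps, 2 environments, and 4 actions, both outputs come back as `[T, B] = [3, 2]`. The values below are random and purely illustrative:

import numpy as np
import tensorflow as tf  # TF1-style session API

T, B, A = 3, 2, 4
rng = np.random.RandomState(0)

vs, pg_adv = from_logic_outputs(
    behaviour_policy_logic_outputs=rng.randn(T, B, A).astype(np.float32),
    target_policy_logic_outputs=rng.randn(T, B, A).astype(np.float32),
    actions=rng.randint(0, A, size=(T, B)).astype(np.int32),
    discounts=np.full((T, B), 0.99, dtype=np.float32),
    rewards=rng.randn(T, B).astype(np.float32),
    values=rng.randn(T, B).astype(np.float32),
    bootstrap_value=rng.randn(B).astype(np.float32))

with tf.Session() as sess:
    v, adv = sess.run([vs, pg_adv])
    print(v.shape, adv.shape)                     # -> (3, 2) (3, 2)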