Example #1
    def test_v_trace_function_more_complex(self):
        # Component under test (default backend) and a pure-python reference implementation.
        v_trace_function = VTraceFunction()
        v_trace_function_reference = VTraceFunction(backend="python")

        action_space = IntBox(9,
                              add_batch_rank=True,
                              add_time_rank=True,
                              time_major=True)
        action_space_flat = FloatBox(shape=(9, ),
                                     add_batch_rank=True,
                                     add_time_rank=True,
                                     time_major=True)
        input_spaces = dict(logits_actions_pi=self.time_x_batch_x_9_space,
                            log_probs_actions_mu=self.time_x_batch_x_9_space,
                            actions=action_space,
                            actions_flat=action_space_flat,
                            discounts=self.time_x_batch_x_1_space,
                            rewards=self.time_x_batch_x_1_space,
                            values=self.time_x_batch_x_1_space,
                            bootstrapped_values=self.time_x_batch_x_1_space)

        test = ComponentTest(component=v_trace_function,
                             input_spaces=input_spaces)

        size = (100, 16)  # (time-steps, batch-size); all spaces are time-major
        logits_actions_pi = self.time_x_batch_x_9_space.sample(size=size)
        logits_actions_mu = self.time_x_batch_x_9_space.sample(size=size)
        log_probs_actions_mu = np.log(softmax(logits_actions_mu))
        actions = action_space.sample(size=size)
        actions_flat = one_hot(actions, depth=action_space.num_categories)
        # Set some discounts to 0.0 (these will mark the end of episodes, where the value is 0.0).
        discounts = np.random.choice([0.0, 0.99],
                                     size=size + (1, ),
                                     p=[0.1, 0.9])
        rewards = self.time_x_batch_x_1_space.sample(size=size)
        values = self.time_x_batch_x_1_space.sample(size=size)
        bootstrapped_values = self.time_x_batch_x_1_space.sample(
            size=(1, size[1]))

        input_ = [
            logits_actions_pi, log_probs_actions_mu, actions, actions_flat,
            discounts, rewards, values, bootstrapped_values
        ]

        # Compute the expected outputs with the python reference implementation.
        vs_expected, pg_advantages_expected = v_trace_function_reference._graph_fn_calc_v_trace_values(
            *input_)

        test.test(("calc_v_trace_values", input_),
                  expected_outputs=[vs_expected, pg_advantages_expected],
                  decimals=4)
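
For reference, the targets this test checks can be reproduced with a plain-NumPy sketch of the V-trace recursion from the IMPALA paper. The function name, the 2-D time-major shapes and the clipping thresholds (rho_bar = c_bar = 1.0) below are assumptions for illustration, not RLgraph's actual VTraceFunction API; log_rhos would be the log-probability of each taken action under the target policy pi minus that under the behaviour policy mu, which the test derives from logits_actions_pi, log_probs_actions_mu and actions_flat.

import numpy as np

def v_trace_reference(log_rhos, discounts, rewards, values, bootstrapped_value,
                      rho_bar=1.0, c_bar=1.0):
    """Hypothetical NumPy sketch of V-trace targets and PG advantages for time-major (T, B) inputs."""
    rhos = np.minimum(rho_bar, np.exp(log_rhos))  # clipped importance weights rho_t
    cs = np.minimum(c_bar, np.exp(log_rhos))      # clipped trace-cutting weights c_t

    # V(s_{t+1}) for every t; the bootstrapped value follows the last time step.
    values_t_plus_1 = np.concatenate([values[1:], bootstrapped_value[None, :]], axis=0)
    deltas = rhos * (rewards + discounts * values_t_plus_1 - values)

    # Backward recursion: (vs_t - V(s_t)) = delta_t + gamma_t * c_t * (vs_{t+1} - V(s_{t+1})).
    vs_minus_v = np.zeros_like(values)
    acc = np.zeros(values.shape[1])
    for t in reversed(range(values.shape[0])):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v[t] = acc
    vs = vs_minus_v + values

    # Policy-gradient advantages: rho_t * (r_t + gamma_t * vs_{t+1} - V(s_t)).
    vs_t_plus_1 = np.concatenate([vs[1:], bootstrapped_value[None, :]], axis=0)
    pg_advantages = rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages
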
Example #2
    def __init__(self,
                 discount=0.99,
                 reward_clipping="clamp_one",
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor (gamma) to use.
            reward_clipping (Optional[str]): One of None, "clamp_one" or "soft_asymmetric". Default: "clamp_one".
            weight_pg (Optional[float]): The coefficient used for the policy gradient loss term (L[PG]).
                Default: 1.0.
            weight_baseline (Optional[float]): The coefficient used for the Value-function baseline term (L[V]).
                Default: 0.5.
            weight_entropy (Optional[float]): The coefficient used for the entropy regularization term (L[E]).
                In the paper, values between 0.01 and 0.00005 are used via log-uniform search.
                Default: 0.00025.
        """
        # graph_fn_num_outputs=dict(_graph_fn_loss_per_item=2) <- debug
        super(IMPALALossFunction,
              self).__init__(scope=kwargs.pop("scope", "impala-loss-func"),
                             **kwargs)

        self.discount = discount
        self.v_trace_function = VTraceFunction()

        self.reward_clipping = reward_clipping

        self.weight_pg = weight_pg if weight_pg is not None else 1.0
        self.weight_baseline = weight_baseline if weight_baseline is not None else 0.5
        self.weight_entropy = weight_entropy if weight_entropy is not None else 0.00025

        self.action_space = None

        self.add_components(self.v_trace_function)
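
The three coefficients configured here weight the policy-gradient, baseline and entropy terms of the IMPALA loss. The sketch below shows one minimal way they might be combined, assuming mean reductions, categorical actions and already-computed V-trace outputs (the 1/2 factor on the baseline term is effectively supplied by weight_baseline's default of 0.5); it is illustrative only and not the component's actual graph function.

import numpy as np

def impala_loss_sketch(log_probs_taken, pg_advantages, vs, values, logits,
                       weight_pg=1.0, weight_baseline=0.5, weight_entropy=0.00025):
    # L[PG]: policy-gradient term; the V-trace advantages are treated as constants.
    loss_pg = -np.mean(log_probs_taken * pg_advantages)

    # L[V]: squared error between the V-trace targets vs and the value estimates.
    loss_baseline = np.mean((vs - values) ** 2)

    # L[E]: negative policy entropy (minimizing it pushes the policy towards higher entropy).
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    entropy = -np.sum(probs * np.log(probs + 1e-10), axis=-1)
    loss_entropy = -np.mean(entropy)

    return weight_pg * loss_pg + weight_baseline * loss_baseline + weight_entropy * loss_entropy

The default weights in the sketch (1.0, 0.5, 0.00025) are exactly the fallbacks assigned in this constructor when the corresponding arguments are left as None.
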
Example #3
    def __init__(self,
                 discount=0.99,
                 reward_clipping="clamp_one",
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 slice_actions=False,
                 slice_rewards=False,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor (gamma) to use.
            reward_clipping (Optional[str]): One of None, "clamp_one" or "soft_asymmetric". Default: "clamp_one".
            weight_pg (Optional[float]): The coefficient used for the policy gradient loss term (L[PG]).
                Default: 1.0.

            weight_baseline (Optional[float]): The coefficient used for the Value-function baseline term (L[V]).
                Default: 0.5.

            weight_entropy (Optional[float]): The coefficient used for the entropy regularization term (L[E]).
                In the paper, values between 0.01 and 0.00005 are used via log-uniform search.
                Default: 0.00025.

            slice_actions (bool): Whether to slice off the very first action coming in from the
                caller. This must be True if actions/rewards are part of the state (via the keys "previous_action" and
                "previous_reward"). Default: False.

            slice_rewards (bool): Whether to slice off the very first reward coming in from the
                caller. This must be True if actions/rewards are part of the state (via the keys "previous_action" and
                "previous_reward"). Default: False.
        """
        super(IMPALALossFunction,
              self).__init__(scope=kwargs.pop("scope", "impala-loss-func"),
                             **kwargs)

        self.discount = discount
        self.v_trace_function = VTraceFunction()

        self.reward_clipping = reward_clipping

        self.weight_pg = weight_pg if weight_pg is not None else 1.0
        self.weight_baseline = weight_baseline if weight_baseline is not None else 0.5
        self.weight_entropy = weight_entropy if weight_entropy is not None else 0.00025

        self.slice_actions = slice_actions
        self.slice_rewards = slice_rewards

        self.action_space = None

        self.add_components(self.v_trace_function)
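
To illustrate the slice_actions / slice_rewards flags: when the state already carries "previous_action" and "previous_reward", the incoming action and reward sequences contain one extra leading element, and the loss function drops it along the time axis. The arrays below are a hypothetical time-major example, not the component's internal code.

import numpy as np

# Hypothetical time-major sequences of length T+1: index 0 holds the "previous"
# action/reward that was only needed as part of the state at the first time step.
actions = np.array([[2], [0], [3], [1]])            # shape (T+1, B) with T=3, B=1
rewards = np.array([[0.0], [1.0], [0.0], [1.0]])    # shape (T+1, B)

slice_actions = slice_rewards = True
if slice_actions:
    actions = actions[1:]    # drop the leading previous_action -> shape (T, B)
if slice_rewards:
    rewards = rewards[1:]    # drop the leading previous_reward -> shape (T, B)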