Example #1
    def test_large_impala_policy_without_agent(self):
        """
        Creates a large IMPALA architecture network inside a policy and runs a few input samples through it.
        """
        # Create the network.
        large_impala_architecture = LargeIMPALANetwork(worker_sample_size=1)
        # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
        policy = SharedValueFunctionPolicy(
            network_spec=large_impala_architecture,
            action_space=self.action_space,
            switched_off_apis={
                #"get_action_from_logits_and_parameters", "get_action_from_logits_and_probabilities",
                "get_log_likelihood"
            })
        test = ComponentTest(
            policy,
            input_spaces=dict(
                nn_input=self.input_space,
                internal_states=self.internal_states_space,
                #parameters=self.parameters_and_logits_space,
                #logits=self.parameters_and_logits_space
            ),
            action_space=self.action_space,
            execution_spec=dict(disable_monitoring=True))

        # Send a 1x1 sample through the network (1=sequence-length (time-rank), 1=batch-size).
        nn_input = self.input_space.sample(size=(1, 1))
        initial_internal_states = self.internal_states_space.zeros(size=1)
        expected = None
        out = test.test(("get_action", [nn_input, initial_internal_states]),
                        expected_outputs=expected)
        print("First action: {}".format(out["action"]))
        self.assertEquals(out["action"].shape, (1, 1))
        self.assertEquals(out["last_internal_states"][0].shape, (1, 256))
        self.assertEquals(out["last_internal_states"][1].shape, (1, 256))

        # Send another 1x1 sample through the network using the previous internal-state.
        next_nn_input = self.input_space.sample(size=(1, 1))
        expected = None
        out = test.test(
            ("get_action", [next_nn_input, out["last_internal_states"]]),
            expected_outputs=expected)
        print("Second action: {}".format(out["action"]))
        self.assertEquals(out["action"].shape, (1, 1))
        self.assertEquals(out["last_internal_states"][0].shape, (1, 256))
        self.assertEquals(out["last_internal_states"][1].shape, (1, 256))

        test.terminate()
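
The test above relies on self.input_space, self.internal_states_space and self.action_space being defined elsewhere in the test case (typically in setUp). Below is a minimal, purely illustrative sketch of what such definitions could look like, assuming RLgraph's standard space classes (Dict, FloatBox, IntBox, TextBox, Tuple); the image shape, action count and other sizes are assumptions chosen to match the shapes asserted above, not taken from the source.

    # Hypothetical setUp sketch (shapes and channel sizes are assumptions).
    from rlgraph.spaces import Dict, FloatBox, IntBox, TextBox, Tuple

    def setUp(self):
        # Time-major dict input: image, text instruction, previous action/reward.
        self.input_space = Dict(
            RGB_INTERLEAVED=FloatBox(shape=(96, 72, 3)),
            INSTR=TextBox(),
            previous_action=FloatBox(shape=(9,)),
            previous_reward=FloatBox(shape=(1,)),
            add_batch_rank=True, add_time_rank=True, time_major=True
        )
        # LSTM state tuple (c, h), each of size 256 to match the asserted shapes.
        self.internal_states_space = Tuple(
            FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True
        )
        # Discrete action space (9 actions assumed).
        self.action_space = IntBox(9, add_batch_rank=True, add_time_rank=True, time_major=True)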
Example #2
    def test_large_impala_actor_component_without_agent(self):
        """
        Creates a large IMPALA architecture network inside a policy inside an actor component and runs a few input
        samples through it.
        """
        batch_size = 4
        time_steps = 1

        # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
        policy = SharedValueFunctionPolicy(
            LargeIMPALANetwork(worker_sample_size=time_steps),
            action_space=self.action_space,
            deterministic=False)
        actor_component = ActorComponent(preprocessor_spec=None,
                                         policy_spec=policy,
                                         exploration_spec=None)

        test = ComponentTest(actor_component,
                             input_spaces=dict(
                                 states=self.input_space,
                                 internal_states=self.internal_states_space),
                             action_space=self.action_space,
                             execution_spec=dict(disable_monitoring=True))

        # Send a sample through the network (sequence-length (time-rank) x batch-size).
        nn_dict_input = self.input_space.sample(size=(time_steps, batch_size))
        initial_internal_states = self.internal_states_space.zeros(
            size=batch_size)
        expected = None
        out = test.test(("get_preprocessed_state_and_action",
                         [nn_dict_input, initial_internal_states]),
                        expected_outputs=expected)
        print("First action: {}".format(out["action"]))
        self.assertEquals(out["action"].shape, (time_steps, batch_size))
        self.assertEquals(out["last_internal_states"][0].shape,
                          (batch_size, 256))
        self.assertEquals(out["last_internal_states"][1].shape,
                          (batch_size, 256))
        # Check preprocessed state (no preprocessor in this ActorComponent, so it should match the raw input).
        recursive_assert_almost_equal(
            out["preprocessed_state"],
            dict(
                RGB_INTERLEAVED=nn_dict_input["RGB_INTERLEAVED"],
                INSTR=nn_dict_input["INSTR"],
                previous_action=nn_dict_input["previous_action"],
                previous_reward=nn_dict_input["previous_reward"],
            ))

        # Send another (time_steps x batch_size) sample through the network using the previous internal-states.
        next_nn_input = self.input_space.sample(size=(time_steps, batch_size))
        expected = None
        out = test.test(("get_preprocessed_state_and_action",
                         [next_nn_input, out["last_internal_states"]]),
                        expected_outputs=expected)
        print("Second action: {}".format(out["action"]))
        self.assertEquals(out["action"].shape, (time_steps, batch_size))
        self.assertEquals(out["last_internal_states"][0].shape,
                          (batch_size, 256))
        self.assertEquals(out["last_internal_states"][1].shape,
                          (batch_size, 256))
        # Check preprocessed state again (no preprocessor, so all channels should equal the raw input).
        recursive_assert_almost_equal(
            out["preprocessed_state"],
            dict(
                RGB_INTERLEAVED=next_nn_input["RGB_INTERLEAVED"],
                INSTR=next_nn_input["INSTR"],
                previous_action=next_nn_input["previous_action"],
                previous_reward=next_nn_input["previous_reward"],
            ))

        test.terminate()
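
Beyond the two single-step calls shown, the same pattern extends to a longer rollout by repeatedly feeding last_internal_states back into the next call. The sketch below (which would slot in before test.terminate()) uses only the objects already constructed in the example above; the step count of 5 is arbitrary.

        # Hypothetical multi-step rollout: thread the LSTM state through successive calls.
        internal_states = self.internal_states_space.zeros(size=batch_size)
        for step in range(5):
            nn_dict_input = self.input_space.sample(size=(time_steps, batch_size))
            out = test.test(("get_preprocessed_state_and_action",
                             [nn_dict_input, internal_states]),
                            expected_outputs=None)
            # Carry the returned (c, h) state into the next step.
            internal_states = out["last_internal_states"]
            self.assertEqual(out["action"].shape, (time_steps, batch_size))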