Пример #1
0
    def test_train(self):
        """Smoke-test an NFSPAgent through one evaluation step and many step/feed rounds.

        A small CPU agent with two actions and a 2-element observation is
        built. The test checks that ``eval_step`` and ``step`` always return
        an action in ``[0, 1]`` and that ``feed`` accepts a transition on
        every iteration.
        """
        init_size = 20
        total_steps = 1000

        def random_state(with_raw_actions=False):
            # Fresh random observation; both actions (0 and 1) are legal.
            state = {
                'obs': np.random.random_sample((2, )),
                'legal_actions': {0: None, 1: None},
            }
            if with_raw_actions:
                state['raw_legal_actions'] = ['call', 'raise']
            return state

        def check_action_in_range(action):
            self.assertGreaterEqual(action, 0)
            self.assertLessEqual(action, 1)

        # Both learning thresholds share the same initial size.
        agent_config = dict(
            num_actions=2,
            state_shape=[2],
            hidden_layers_sizes=[10, 10],
            reservoir_buffer_capacity=50,
            batch_size=4,
            min_buffer_size_to_learn=init_size,
            q_replay_memory_size=50,
            q_replay_memory_init_size=init_size,
            q_batch_size=4,
            q_mlp_layers=[10, 10],
            device=torch.device('cpu'),
        )
        agent = NFSPAgent(**agent_config)

        # eval_step returns a pair; only the action is checked here.
        eval_action, _ = agent.eval_step(random_state(with_raw_actions=True))
        check_action_in_range(eval_action)

        for _ in range(total_steps):
            agent.sample_episode_policy()
            check_action_in_range(agent.step(random_state()))

            # Presumably (state, action, reward, next_state, done) -- confirm
            # against NFSPAgent.feed. Elements are built left to right so the
            # random draws happen in the same order as a literal list.
            transition = [
                random_state(),
                np.random.randint(2),
                0,
                random_state(with_raw_actions=True),
                True,
            ]
            agent.feed(transition)
Пример #2
0
    def test_evaluate_with(self):
        """Check ``eval_step`` under two ``evaluate_with`` settings.

        With ``evaluate_with='average_policy'`` the returned action must lie
        in ``[0, 1]``. With ``evaluate_with='random'`` the call is expected to
        raise ``ValueError``.

        Each case creates its own session and agent. The session is closed and
        the default graph is reset in a ``finally`` block, so a failing
        assertion in one case does not leave an open session or a populated
        default graph behind for later code.
        """
        # Case 1: average policy -> a valid action is returned.
        sess = tf.compat.v1.InteractiveSession()
        try:
            tf.Variable(0, name='global_step', trainable=False)

            agent = NFSPAgent(sess=sess,
                              scope='nfsp',
                              action_num=2,
                              state_shape=[2],
                              hidden_layers_sizes=[10, 10],
                              q_mlp_layers=[10, 10],
                              evaluate_with='average_policy')
            sess.run(tf.compat.v1.global_variables_initializer())
            predicted_action, _ = agent.eval_step({
                'obs':
                np.random.random_sample((2, )),
                'legal_actions': [0, 1]
            })
            self.assertGreaterEqual(predicted_action, 0)
            self.assertLessEqual(predicted_action, 1)
        finally:
            # Always release the session before resetting the graph.
            sess.close()
            tf.compat.v1.reset_default_graph()

        # Case 2: unsupported mode -> eval_step must raise ValueError.
        sess = tf.compat.v1.InteractiveSession()
        try:
            tf.Variable(0, name='global_step', trainable=False)

            agent = NFSPAgent(sess=sess,
                              scope='nfsp',
                              action_num=2,
                              state_shape=[2],
                              hidden_layers_sizes=[10, 10],
                              q_mlp_layers=[10, 10],
                              evaluate_with='random')
            sess.run(tf.compat.v1.global_variables_initializer())
            with self.assertRaises(ValueError):
                agent.eval_step({
                    'obs':
                    np.random.random_sample((2, )),
                    'legal_actions': [0, 1]
                })
        finally:
            sess.close()
            tf.compat.v1.reset_default_graph()
Пример #3
0
    def test_train(self):
        """Smoke-test NFSPAgent stepping, feeding and training under a TF session.

        The test checks that ``eval_step`` and ``step`` return an action in
        ``[0, 1]``. It then feeds one transition per iteration, calling
        ``train_sl`` every iteration and ``train_rl`` only once
        ``step > norm_step + memory_init_size``.

        The session is closed and the default graph reset in a ``finally``
        block, so a failure partway through does not leave an open session or
        a populated default graph behind.
        """
        norm_step = 100
        memory_init_size = 20
        step_num = 1000

        sess = tf.InteractiveSession()
        try:
            tf.Variable(0, name='global_step', trainable=False)
            agent = NFSPAgent(sess=sess,
                              scope='nfsp',
                              action_num=2,
                              state_shape=[2],
                              hidden_layers_sizes=[10, 10],
                              reservoir_buffer_capacity=50,
                              batch_size=4,
                              min_buffer_size_to_learn=memory_init_size,
                              q_replay_memory_size=50,
                              q_replay_memory_init_size=memory_init_size,
                              q_batch_size=4,
                              q_norm_step=norm_step,
                              q_mlp_layers=[10, 10])
            sess.run(tf.global_variables_initializer())

            predicted_action = agent.eval_step({
                'obs':
                np.random.random_sample((2, )),
                'legal_actions': [0, 1]
            })
            self.assertGreaterEqual(predicted_action, 0)
            self.assertLessEqual(predicted_action, 1)

            for step in range(step_num):
                agent.sample_episode_policy()
                predicted_action = agent.step({
                    'obs': np.random.random_sample((2, )),
                    'legal_actions': [0, 1]
                })
                self.assertGreaterEqual(predicted_action, 0)
                self.assertLessEqual(predicted_action, 1)

                # Presumably (state, action, reward, next_state, done) --
                # confirm against NFSPAgent.feed.
                ts = [{
                    'obs': np.random.random_sample((2, )),
                    'legal_actions': [0, 1]
                },
                      np.random.randint(2), 0, {
                          'obs': np.random.random_sample((2, )),
                          'legal_actions': [0, 1]
                      }, True]
                agent.feed(ts)
                # RL training is deferred; presumably until the normalizer and
                # replay memory have enough samples -- TODO confirm.
                if step > norm_step + memory_init_size:
                    agent.train_rl()

                agent.train_sl()
        finally:
            # Always release the session before resetting the graph.
            sess.close()
            tf.reset_default_graph()