예제 #1
0
 def test_add_with_dimension_mismatch(self):
     """Check that inserting one wrongly-shaped field raises AssertionError.

     Each case starts from a fresh rollout and a fresh set of inputs,
     overwrites a single field with a random array of the listed shape and
     expects the insertion to be rejected.
     """
     bad_shapes = [
         ('action_t', (5, 4)),
         ('reward_t', (5, )),
         ('value_t', (5, )),
         ('log_prob_t', (5, )),
         ('terminal_t', (5, )),
     ]
     for key, shape in bad_shapes:
         # Build the fixtures outside the raises block so that only the
         # insertion itself can satisfy the expected AssertionError; an
         # assertion failing during setup must not make the test pass.
         rollout = Rollout()
         inputs = make_inputs()
         inputs[key] = np.random.random(shape)
         with pytest.raises(AssertionError):
             insert_inputs_to_rollout(inputs, rollout)
예제 #2
0
 def setup_method(self):
     """Build a PPOController around a dummy network, a rollout and dummy metrics.

     The collaborators are also stored on the test instance so individual
     tests can stub or inspect them.
     """
     network = DummyNetwork()
     rollout = Rollout()
     metrics = DummyMetrics()
     hyperparameters = dict(
         num_envs=4,
         time_horizon=128,
         epoch=4,
         batch_size=32,
         gamma=0.99,
         lam=0.9,
     )

     self.network = network
     self.rollout = rollout
     self.metrics = metrics
     self.controller = PPOController(
         network, rollout, metrics, **hyperparameters)
예제 #3
0
 def test_size(self):
     """Check that size() is 0 for a new rollout and grows by one per insert."""
     buffer = Rollout()
     self.assertEqual(buffer.size(), 0)

     # The same inputs are inserted twice; only the reported size matters.
     sample = make_inputs()
     for expected_size in (1, 2):
         insert_inputs_to_rollout(sample, buffer)
         self.assertEqual(buffer.size(), expected_size)
예제 #4
0
def main(args):
    """Assemble the PPO training components and run the interaction loop.

    ``args`` is read attribute by attribute (``env``, ``num_envs``,
    ``seed``, ...) and dumped with ``vars(args)``; presumably an argparse
    namespace -- confirm against the caller.
    """
    # environments: a training wrapper (receives the render flag) and an
    # evaluation wrapper, both seeded with the same value
    training_envs = make_envs(args.env, args.num_envs, args.reward_scale)
    env = BatchEnvWrapper(training_envs, args.render)
    env.seed(args.seed)
    evaluation_envs = make_envs(args.env, args.num_envs, args.reward_scale)
    eval_env = BatchEnvWrapper(evaluation_envs)
    eval_env.seed(args.seed)
    num_actions = env.action_space.shape[0]

    # network parameters
    network_config = {
        'fcs': args.layers,
        'num_actions': num_actions,
        'state_shape': env.observation_space.shape,
        'num_envs': args.num_envs,
        'batch_size': args.batch_size,
        'epsilon': args.epsilon,
        'learning_rate': args.lr,
        'grad_clip': args.grad_clip,
        'value_factor': args.value_factor,
        'entropy_factor': args.entropy_factor,
    }
    params = PPONetworkParams(**network_config)

    # deep neural network
    network = PPONetwork(params)

    # rollout buffer
    rollout = Rollout()

    # metrics; the saver is created only after the network has been built
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # controller
    controller_settings = (
        args.num_envs, args.time_horizon, args.epoch, args.batch_size,
        args.gamma, args.lam, args.final_steps,
        args.log_interval, args.save_interval, args.eval_interval,
    )
    controller = PPOController(network, rollout, metrics, *controller_settings)

    # view
    view = View(controller)

    # evaluation
    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        # restore only when a checkpoint path was supplied
        checkpoint = args.load
        if checkpoint is not None:
            saver.restore(sess, checkpoint)

        interact(env, view, eval_env, eval_view, batch=True)
예제 #5
0
    def test_fetch(self):
        """Check fetch() rejects a one-step rollout and returns a trajectory after two.

        After the second insertion, the first entry of the fetched
        observations, actions and log-probabilities must equal the first
        inputs, and the returns/advantages must have shape (1, 4).
        """
        gamma_value = np.random.random()
        lam_value = np.random.random()
        buffer = Rollout()
        first_inputs = make_inputs()
        insert_inputs_to_rollout(first_inputs, buffer)

        # With a single stored step fetch() is expected to assert.
        with pytest.raises(AssertionError):
            buffer.fetch(gamma_value, lam_value)

        second_inputs = make_inputs()
        insert_inputs_to_rollout(second_inputs, buffer)
        fetched = buffer.fetch(gamma_value, lam_value)

        # (key in the inserted inputs, key in the fetched trajectory)
        key_pairs = (
            ('obs_t', 'obs_t'),
            ('action_t', 'actions_t'),
            ('log_prob_t', 'log_probs_t'),
        )
        for input_key, trajectory_key in key_pairs:
            assert np.all(first_inputs[input_key] == fetched[trajectory_key][0])
        for computed_key in ('returns_t', 'advantages_t'):
            assert fetched[computed_key].shape == (1, 4)
예제 #6
0
    def test_add_success(self):
        """Check that every inserted input set stays retrievable at its index.

        After each insertion all inputs stored so far are re-checked, so the
        second insertion also verifies that index 0 was left intact.
        """
        buffer = Rollout()
        stored = []
        for _ in range(2):
            inputs = make_inputs()
            insert_inputs_to_rollout(inputs, buffer)
            stored.append(inputs)
            for index, earlier_inputs in enumerate(stored):
                assert_inputs_with_rollout(earlier_inputs, buffer, index)
예제 #7
0
 def test_add_with_shape_error(self):
     """Check that a (4, 5)-shaped scalar field is rejected on insertion.

     Each case starts from a fresh rollout and a fresh set of inputs,
     overwrites a single field with a random (4, 5) array and expects the
     insertion to raise AssertionError.
     """
     bad_shape = (4, 5)
     for key in ('reward_t', 'value_t', 'log_prob_t', 'terminal_t'):
         # Build the fixtures outside the raises block so that only the
         # insertion itself can satisfy the expected AssertionError; an
         # assertion failing during setup must not make the test pass.
         rollout = Rollout()
         inputs = make_inputs()
         inputs[key] = np.random.random(bad_shape)
         with pytest.raises(AssertionError):
             insert_inputs_to_rollout(inputs, rollout)
예제 #8
0
def main(args):
    """Assemble the PPO training components and run the batched interaction loop.

    ``args`` is read attribute by attribute (``env``, ``num_envs``,
    ``layers``, ...) and dumped with ``vars(args)``; presumably an argparse
    namespace -- confirm against the caller.
    """
    # training environments receive the render flag, evaluation ones do not
    training_envs = make_envs(args.env, args.num_envs, args.reward_scale)
    env = BatchEnvWrapper(training_envs, args.render)
    evaluation_envs = make_envs(args.env, args.num_envs, args.reward_scale)
    eval_env = BatchEnvWrapper(evaluation_envs)

    num_actions = env.action_space.shape[0]

    # positional arguments for PPONetwork, in the order its constructor is
    # called with here
    network_settings = (
        args.layers, env.observation_space.shape,
        args.num_envs, num_actions, args.batch_size,
        args.epsilon, args.lr, args.grad_clip,
        args.value_factor, args.entropy_factor,
    )
    network = PPONetwork(*network_settings)

    rollout = Rollout()

    # the saver is created only after the network has been built
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    controller_settings = (
        args.num_envs, args.time_horizon, args.epoch, args.batch_size,
        args.gamma, args.lam, args.final_steps,
        args.log_interval, args.save_interval, args.eval_interval,
    )
    controller = PPOController(network, rollout, metrics, *controller_settings)
    view = View(controller)

    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        # restore only when a checkpoint path was supplied
        checkpoint = args.load
        if checkpoint is not None:
            saver.restore(sess, checkpoint)

        batch_interact(env, view, eval_env, eval_view)
예제 #9
0
class TestPPOController:
    """Tests for PPOController using a stubbed network and a Rollout buffer."""

    # Hyperparameters shared by the controller fixture and by the expected
    # values asserted in the tests, so the two cannot drift apart.
    NUM_ENVS = 4
    TIME_HORIZON = 128
    EPOCH = 4
    BATCH_SIZE = 32

    def setup_method(self):
        """Create fresh collaborators and a controller before every test."""
        self.network = DummyNetwork()
        self.rollout = Rollout()
        self.metrics = DummyMetrics()
        self.controller = PPOController(
            self.network, self.rollout, self.metrics, num_envs=self.NUM_ENVS,
            time_horizon=self.TIME_HORIZON, epoch=self.EPOCH,
            batch_size=self.BATCH_SIZE, gamma=0.99, lam=0.9)

    def _stub_inference(self):
        """Replace the network's inference hooks with mocks and return the canned output."""
        output = make_output(batch_size=self.NUM_ENVS, batch=True)
        self.network._infer = MagicMock(return_value=output)
        self.network._infer_arguments = MagicMock(return_value=['obs_t'])
        return output

    def _make_input(self):
        """Return one batched controller input sized for NUM_ENVS environments."""
        return make_input(batch_size=self.NUM_ENVS, batch=True)

    def test_step(self):
        """A single step returns the network's action and records everything in the rollout."""
        output = self._stub_inference()

        inpt = self._make_input()
        action = self.controller.step(*inpt)

        assert np.all(action == output.action)
        assert self.rollout.size() == 1
        assert np.all(inpt[0] == self.rollout.obs_t[0])
        assert np.all(inpt[1] == self.rollout.rewards_t[0])
        assert np.all(inpt[2] == self.rollout.terminals_t[0])
        assert np.all(output.action == self.rollout.actions_t[0])
        assert np.all(output.value == self.rollout.values_t[0])
        assert np.all(output.log_prob == self.rollout.log_probs_t[0])

    def test_should_update(self):
        """should_update() stays false for TIME_HORIZON steps and flips on the next one."""
        self._stub_inference()

        inpt = self._make_input()
        for _ in range(self.TIME_HORIZON):
            self.controller.step(*inpt)
            assert not self.controller.should_update()
        self.controller.step(*inpt)
        assert self.controller.should_update()

    def test_batches(self):
        """_batches() yields the expected number of BATCH_SIZE-sized batches with every key."""
        self._stub_inference()

        # The last input/action of the loop are kept for the shape checks.
        for _ in range(self.TIME_HORIZON + 1):
            inpt = self._make_input()
            action = self.controller.step(*inpt)

        flat_keys = ('log_probs_t', 'returns_t', 'advantages_t', 'values_t')
        all_keys = ('obs_t', 'actions_t') + flat_keys

        count = 0
        for batch in self.controller._batches():
            count += 1
            for key in all_keys:
                assert key in batch
                assert batch[key].shape[0] == self.BATCH_SIZE
            assert batch['obs_t'].shape[1:] == inpt[0].shape[1:]
            assert batch['actions_t'].shape[1] == action.shape[1]
            for key in flat_keys:
                assert len(batch[key].shape) == 1
        assert count == self.TIME_HORIZON * self.NUM_ENVS // self.BATCH_SIZE

    def test_batch_with_short_trajectory_error(self):
        """_batches() is expected to assert when only one step has been recorded."""
        self._stub_inference()

        inpt = self._make_input()
        self.controller.step(*inpt)
        # NOTE(review): this only works if _batches() asserts at call time;
        # if it is a generator function the assertion would fire on the
        # first iteration instead -- confirm against the implementation.
        with pytest.raises(AssertionError):
            self.controller._batches()

    def test_update_with_should_update_false(self):
        """update() is expected to assert when called after only 20 steps."""
        self._stub_inference()
        inpt = self._make_input()

        for _ in range(20):
            self.controller.step(*inpt)
        with pytest.raises(AssertionError):
            self.controller.update()

    def test_update_success(self):
        """update() returns the network loss, empties the rollout and updates once per batch per epoch."""
        self._stub_inference()
        inpt = self._make_input()
        loss = np.random.random()
        self.network._update_arguments = MagicMock(
            return_value=['obs_t', 'actions_t', 'returns_t', 'advantages_t',
                          'log_probs_t'])
        self.network._update = MagicMock(return_value=loss)

        for _ in range(self.TIME_HORIZON + 1):
            self.controller.step(*inpt)

        assert np.allclose(self.controller.update(), loss)
        assert self.rollout.size() == 0
        expected_updates = (
            self.TIME_HORIZON * self.NUM_ENVS * self.EPOCH // self.BATCH_SIZE)
        assert self.network._update.call_count == expected_updates