Example #1
    def test_generate_rollout_get_best_action(self, env):
        """Test generate_rollout() uses get_best_action correctly."""
        env = Env(mode="train")
        rollout, _ = env.generate_rollout(get_best_action=lambda x: 0)

        for _, action, _, _, _ in rollout:
            assert action == 0
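These tests reference pytest fixtures named env and get_best_action, an Env alias, and numpy imported as np, none of which are shown here. Below is a minimal conftest.py sketch that would satisfy them, assuming Env aliases CartPoleRegulatorEnv; the import path and fixture bodies are assumptions for illustration, not the repository's actual fixtures.

# conftest.py -- minimal sketch; import path and fixture bodies are assumed.
import random

import pytest

from environments import CartPoleRegulatorEnv as Env  # assumed module path


@pytest.fixture
def env():
    """Provide a fresh training-mode environment for each test."""
    return Env(mode="train")


@pytest.fixture
def get_best_action():
    """Provide a stand-in policy that picks a random discrete action."""
    return lambda obs: random.choice([0, 1])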
Example #2
    def test_generate_rollout_cost_threshold(self, env, get_best_action):
        """Test generate_rollout() does not have a cost over 1."""
        env = Env(mode="train")
        rollout, episode_cost = env.generate_rollout(get_best_action=None)

        for (_, _, cost, _, _) in rollout:
            assert 0 <= cost <= 1
Example #3
    def test_train_mode_reset(self):
        """Test reset() in train mode."""
        train_env = Env(mode="train")
        x, x_, theta, theta_ = train_env.reset()

        assert abs(x) <= 2.3
        assert x_ == 0
        assert abs(theta) <= 0.3
        assert theta_ == 0
Example #4
    def test_generate_rollout_episode_cost(self, env, get_best_action):
        """Test generate_rollout()'s second return value episode_cost."""
        env = Env(mode="train")
        rollout, episode_cost = env.generate_rollout(get_best_action=None)

        total_cost = 0
        for _, _, cost, _, _ in rollout:
            total_cost += cost
        assert episode_cost == total_cost
Example #5
    def test_generate_rollout_next_obs(self, env, get_best_action):
        """Test generate_rollout() generates continued observation."""
        env = Env(mode="train")
        rollout, episode_cost = env.generate_rollout(get_best_action=None)

        prev_next_obs = rollout[0][3]
        for obs, _, _, next_obs, _ in rollout[1:]:
            assert np.array_equal(prev_next_obs, obs)
            prev_next_obs = next_obs
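The chaining property checked above follows from how a rollout is typically collected: each transition stores the observation before the step and the observation after it, and the latter becomes the former on the next iteration. The sketch below shows such a loop, assuming a gym-style action_space and a step() that returns a cost in place of a reward; it is an illustration, not the repository's generate_rollout.

def generate_rollout_sketch(env, get_best_action=None, max_steps=100):
    """Illustrative loop producing (obs, action, cost, next_obs, done) tuples."""
    rollout = []
    episode_cost = 0.0
    obs = env.reset()
    for _ in range(max_steps):
        # Use the supplied policy if given, otherwise act randomly.
        action = get_best_action(obs) if get_best_action else env.action_space.sample()
        next_obs, cost, done, _ = env.step(action)
        rollout.append((obs, action, cost, next_obs, done))
        episode_cost += cost
        obs = next_obs  # this step's next_obs becomes the next step's obs
        if done:
            break
    return rollout, episode_cost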
Example #6
    def test_eval_mode_reset(self):
        """Test reset() in eval mode."""
        eval_env = Env(mode="eval")
        x, x_, theta, theta_ = eval_env.reset()

        assert abs(x) <= 1.0
        assert x_ == 0
        assert abs(theta) <= 0.3
        assert theta_ == 0
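The two reset tests pin down the sampling ranges: the cart position x is drawn within ±2.3 in train mode and ±1.0 in eval mode, the angle theta within ±0.3, and both velocities start at zero. The sketch below is an initial-state sampler consistent with those bounds; it is illustrative only, and the actual reset() may sample differently.

import numpy as np


def sample_initial_state(mode):
    """Illustrative only: draw (x, x_dot, theta, theta_dot) within the tested bounds."""
    x_bound = 2.3 if mode == "train" else 1.0  # bounds taken from the tests above
    x = np.random.uniform(-x_bound, x_bound)
    theta = np.random.uniform(-0.3, 0.3)
    return np.array([x, 0.0, theta, 0.0])  # velocities are zero at reset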
Example #7
    def test_generate_rollout_with_random_action_done_value(
            self, env, get_best_action):
        """Test done values of generate_rollout()e."""
        env = Env(mode="train")
        rollout, episode_cost = env.generate_rollout(get_best_action)

        for i, (_, _, _, _, done) in enumerate(rollout):
            if i + 1 < len(rollout):
                assert not done
            else:
                assert done or len(rollout) == env.max_steps
Example #8
import numpy as np
import torch
import torch.optim as optim

# Project-local modules (AlgorithmConfig, CartPoleRegulatorEnv, EpisodeStats,
# NFQAgent, NFQNetwork, get_logger, make_reproducible) are assumed to be
# imported from elsewhere in the repository.


def neuro_fitt_q(epoch,
                 train_env_max_steps,
                 eval_env_max_steps,
                 discount,
                 init_experience=0,
                 seed=None):
    """Run NFQ."""
    CONFIG = AlgorithmConfig(
        EPOCH=epoch,
        TRAIN_ENV_MAX_STEPS=train_env_max_steps,
        EVAL_ENV_MAX_STEPS=eval_env_max_steps,
        DISCOUNT=discount,
        INIT_EXPERIENCE=init_experience,
        INCREMENT_EXPERIENCE=True,
        HINT_TO_GOAL=True,
        RANDOM_SEED=seed,
        TRAIN_RENDER=False,
        EVAL_RENDER=False,
        SAVE_PATH="",
        LOAD_PATH="",
        USE_TENSORBOARD=False,
        USE_WANDB=False,
    )

    # Log to File, Console, TensorBoard, W&B
    logger = get_logger()

    # Setup environment
    train_env = CartPoleRegulatorEnv(mode="train",
                                     max_steps=train_env_max_steps)
    eval_env = CartPoleRegulatorEnv(mode="eval", max_steps=eval_env_max_steps)

    # Fix random seeds
    if CONFIG.RANDOM_SEED is not None:
        make_reproducible(CONFIG.RANDOM_SEED, use_numpy=True, use_torch=True)
        train_env.seed(CONFIG.RANDOM_SEED)
        eval_env.seed(CONFIG.RANDOM_SEED)
    else:
        logger.warning(
            "Running without a random seed: this run is NOT reproducible.")

    # Setup agent
    nfq_net = NFQNetwork()
    optimizer = optim.Rprop(nfq_net.parameters())
    nfq_agent = NFQAgent(nfq_net, optimizer)

    # Load trained agent
    # if CONFIG.LOAD_PATH:
    #     load_models(CONFIG.LOAD_PATH, nfq_net=nfq_net, optimizer=optimizer)

    # NFQ Main loop
    # A set of transition samples denoted as D
    all_rollouts = []
    total_cost = 0

    if CONFIG.INIT_EXPERIENCE:
        for _ in range(CONFIG.INIT_EXPERIENCE):
            rollout, episode_cost = train_env.generate_rollout(
                None, render=CONFIG.TRAIN_RENDER)
            all_rollouts.extend(rollout)
            total_cost += episode_cost

    # One extra slot: the loop below runs CONFIG.EPOCH + 1 iterations.
    stats = EpisodeStats(episode_lengths=np.zeros(CONFIG.EPOCH + 1),
                         episode_rewards=np.zeros(CONFIG.EPOCH + 1))

    for epoch in range(CONFIG.EPOCH + 1):
        # Variant 1: Incrementally add transitions (Section 3.4)
        # TODO(seungjaeryanlee): Done before or after training?
        if CONFIG.INCREMENT_EXPERIENCE:
            new_rollout, episode_cost = train_env.generate_rollout(
                nfq_agent.get_best_action, render=CONFIG.TRAIN_RENDER)
            all_rollouts.extend(new_rollout)
            total_cost += episode_cost

        state_action_b, target_q_values = nfq_agent.generate_pattern_set(
            all_rollouts)

        # Variant 2: Clamp function to zero in goal region
        # TODO(seungjaeryanlee): Since this is a regulator setting, should it
        #                        not be clamped to zero?
        if CONFIG.HINT_TO_GOAL:
            goal_state_action_b, goal_target_q_values = train_env.get_goal_pattern_set(
            )
            goal_state_action_b = torch.FloatTensor(goal_state_action_b)
            goal_target_q_values = torch.FloatTensor(goal_target_q_values)
            state_action_b = torch.cat([state_action_b, goal_state_action_b],
                                       dim=0)
            target_q_values = torch.cat(
                [target_q_values, goal_target_q_values], dim=0)

        loss = nfq_agent.train((state_action_b, target_q_values))

        # TODO(seungjaeryanlee): Evaluation should be done with 3000 episodes
        eval_episode_length, eval_success, eval_episode_cost = nfq_agent.evaluate(
            eval_env, CONFIG.EVAL_RENDER)

        if eval_success:
            break

        #stats.episode_rewards[epoch] = eval_episode_cost
        stats.episode_rewards[epoch] = eval_episode_length + 1
        stats.episode_lengths[epoch] = eval_episode_length

    train_env.close()
    eval_env.close()

    return stats
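A minimal way to invoke the function above is sketched below; the hyperparameter values are illustrative placeholders, not the settings used in the original experiments.

if __name__ == "__main__":
    # Placeholder hyperparameters; tune for the actual experiment.
    stats = neuro_fitt_q(
        epoch=500,
        train_env_max_steps=100,
        eval_env_max_steps=3000,
        discount=0.95,
        init_experience=0,
        seed=0,
    )
    print("Evaluation episode lengths:", stats.episode_lengths)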