def test_generate_rollout_get_best_action(self, env):
    """Test that generate_rollout() uses get_best_action correctly."""
    env = Env(mode="train")
    rollout, _ = env.generate_rollout(get_best_action=lambda x: 0)
    for _, action, _, _, _ in rollout:
        assert action == 0
def test_generate_rollout_cost_threshold(self, env, get_best_action):
    """Test that every cost returned by generate_rollout() lies in [0, 1]."""
    env = Env(mode="train")
    rollout, _ = env.generate_rollout(get_best_action=None)
    for _, _, cost, _, _ in rollout:
        assert 0 <= cost <= 1
def test_train_mode_reset(self):
    """Test reset() in train mode."""
    train_env = Env(mode="train")
    x, x_, theta, theta_ = train_env.reset()
    assert abs(x) <= 2.3
    assert x_ == 0
    assert abs(theta) <= 0.3
    assert theta_ == 0
def test_generate_rollout_episode_cost(self, env, get_best_action):
    """Test that generate_rollout()'s episode_cost equals the sum of per-step costs."""
    env = Env(mode="train")
    rollout, episode_cost = env.generate_rollout(get_best_action=None)
    total_cost = sum(cost for _, _, cost, _, _ in rollout)
    assert episode_cost == total_cost
def test_generate_rollout_next_obs(self, env, get_best_action):
    """Test that each transition's observation matches the previous transition's next_obs."""
    env = Env(mode="train")
    rollout, _ = env.generate_rollout(get_best_action=None)
    prev_next_obs = rollout[0][3]
    for obs, _, _, next_obs, _ in rollout[1:]:
        assert np.array_equal(prev_next_obs, obs)
        prev_next_obs = next_obs
def test_eval_mode_reset(self):
    """Test reset() in eval mode."""
    eval_env = Env(mode="eval")
    x, x_, theta, theta_ = eval_env.reset()
    assert abs(x) <= 1.0
    assert x_ == 0
    assert abs(theta) <= 0.3
    assert theta_ == 0
def test_generate_rollout_with_random_action_done_value(self, env, get_best_action):
    """Test the done values returned by generate_rollout()."""
    env = Env(mode="train")
    rollout, _ = env.generate_rollout(get_best_action)
    for i, (_, _, _, _, done) in enumerate(rollout):
        if i + 1 < len(rollout):
            # Only the final transition may be terminal.
            assert not done
        else:
            # The last transition is either terminal or the episode hit max_steps.
            assert done or len(rollout) == env.max_steps
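# The tests above rely on pytest fixtures named `env` and `get_best_action`
# that are not shown in this section. The definitions below are a minimal
# sketch of what they might look like, assuming `Env` is the CartPole
# regulator environment under test and that a random two-action policy is an
# acceptable stand-in for `get_best_action`; they are illustrative, not the
# project's actual conftest.py.
import random

import pytest


@pytest.fixture
def env():
    # Fresh training-mode environment for each test.
    return Env(mode="train")


@pytest.fixture
def get_best_action():
    # Stand-in policy: ignores the observation and returns a random action
    # index in {0, 1}; only the call signature matters for these tests.
    return lambda obs: random.randint(0, 1)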
def neuro_fitt_q(epoch, train_env_max_steps, eval_env_max_steps, discount,
                 init_experience=0, seed=None):
    """Run NFQ."""
    CONFIG = AlgorithmConfig(
        EPOCH=epoch,
        TRAIN_ENV_MAX_STEPS=train_env_max_steps,
        EVAL_ENV_MAX_STEPS=eval_env_max_steps,
        DISCOUNT=discount,
        INIT_EXPERIENCE=init_experience,
        INCREMENT_EXPERIENCE=True,
        HINT_TO_GOAL=True,
        RANDOM_SEED=seed,
        TRAIN_RENDER=False,
        EVAL_RENDER=False,
        SAVE_PATH="",
        LOAD_PATH="",
        USE_TENSORBOARD=False,
        USE_WANDB=False,
    )

    # Log to file, console, TensorBoard, W&B
    logger = get_logger()

    # Setup environment
    train_env = CartPoleRegulatorEnv(mode="train", max_steps=train_env_max_steps)
    eval_env = CartPoleRegulatorEnv(mode="eval", max_steps=eval_env_max_steps)

    # Fix random seeds
    if CONFIG.RANDOM_SEED is not None:
        make_reproducible(CONFIG.RANDOM_SEED, use_numpy=True, use_torch=True)
        train_env.seed(CONFIG.RANDOM_SEED)
        eval_env.seed(CONFIG.RANDOM_SEED)
    # else:
    #     logger.warning("Running without a random seed: this run is NOT reproducible.")

    # Setup agent
    nfq_net = NFQNetwork()
    optimizer = optim.Rprop(nfq_net.parameters())
    nfq_agent = NFQAgent(nfq_net, optimizer)

    # Load trained agent
    # if CONFIG.LOAD_PATH:
    #     load_models(CONFIG.LOAD_PATH, nfq_net=nfq_net, optimizer=optimizer)

    # NFQ main loop
    # A set of transition samples denoted as D
    all_rollouts = []
    total_cost = 0
    if CONFIG.INIT_EXPERIENCE:
        for _ in range(CONFIG.INIT_EXPERIENCE):
            rollout, episode_cost = train_env.generate_rollout(
                None, render=CONFIG.TRAIN_RENDER
            )
            all_rollouts.extend(rollout)
            total_cost += episode_cost

    stats = EpisodeStats(
        episode_lengths=np.zeros(CONFIG.EPOCH),
        episode_rewards=np.zeros(CONFIG.EPOCH),
    )

    # Loop index `epoch_idx` avoids shadowing the `epoch` argument and stays
    # within the preallocated stats arrays of length CONFIG.EPOCH.
    for epoch_idx in range(CONFIG.EPOCH):
        # Variant 1: Incrementally add transitions (Section 3.4)
        # TODO(seungjaeryanlee): Done before or after training?
        if CONFIG.INCREMENT_EXPERIENCE:
            new_rollout, episode_cost = train_env.generate_rollout(
                nfq_agent.get_best_action, render=CONFIG.TRAIN_RENDER
            )
            all_rollouts.extend(new_rollout)
            total_cost += episode_cost

        state_action_b, target_q_values = nfq_agent.generate_pattern_set(all_rollouts)

        # Variant 2: Clamp function to zero in goal region
        # TODO(seungjaeryanlee): Since this is a regulator setting, should it
        # not be clamped to zero?
        if CONFIG.HINT_TO_GOAL:
            goal_state_action_b, goal_target_q_values = train_env.get_goal_pattern_set()
            goal_state_action_b = torch.FloatTensor(goal_state_action_b)
            goal_target_q_values = torch.FloatTensor(goal_target_q_values)
            state_action_b = torch.cat([state_action_b, goal_state_action_b], dim=0)
            target_q_values = torch.cat([target_q_values, goal_target_q_values], dim=0)

        loss = nfq_agent.train((state_action_b, target_q_values))

        # TODO(seungjaeryanlee): Evaluation should be done with 3000 episodes
        eval_episode_length, eval_success, eval_episode_cost = nfq_agent.evaluate(
            eval_env, CONFIG.EVAL_RENDER
        )
        if eval_success:
            break

        # stats.episode_rewards[epoch_idx] = eval_episode_cost
        stats.episode_rewards[epoch_idx] = eval_episode_length + 1
        stats.episode_lengths[epoch_idx] = eval_episode_length

    train_env.close()
    eval_env.close()

    return stats
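# A minimal usage sketch for neuro_fitt_q() above. The hyperparameter values
# are illustrative placeholders, not the settings used by this repository or
# by the NFQ paper.
if __name__ == "__main__":
    stats = neuro_fitt_q(
        epoch=300,                 # number of NFQ iterations
        train_env_max_steps=100,   # episode length cap in train mode
        eval_env_max_steps=3000,   # episode length cap in eval mode
        discount=0.95,             # discount factor forwarded via AlgorithmConfig
        init_experience=0,         # no random warm-up rollouts
        seed=42,                   # fix seeds for a reproducible run
    )
    print("Best evaluation episode length:", stats.episode_lengths.max())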