def test_ppo_on_2x2_grid_world(self):
    """
    Creates a PPO Agent and runs it via a Runner on the 2x2 Grid World Env.
    """
    env = GridWorld(world="2x2")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
        state_space=GridWorld.grid_world_2x2_flattened_state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=15),
    )
    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    # Assume we have learned something.
    self.assertGreater(results["mean_episode_reward"], -0.2)
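# For reference (not part of the test above): the 2x2 GridWorld emits a single discrete state
# (one of 4 fields). A plausible equivalent of GridWorld.grid_world_2x2_preprocessing_spec and
# grid_world_2x2_flattened_state_space is a one-hot "flattening" of that IntBox(4) into a
# FloatBox(shape=(4,)). The exact spec contents below are assumptions for illustration only.
from rlgraph.spaces import FloatBox

assumed_preprocessing_spec = [dict(type="reshape", flatten=True, flatten_categories=4)]
assumed_flattened_state_space = FloatBox(shape=(4,))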
def test_cartpole_with_worker(self):
    env = OpenAIGymEnv("CartPole-v0")
    agent_config = config_from_path("configs/backend_performance_dqn_cartpole.json")
    # Test CPU settings for batching here (no updates).
    agent_config["update_spec"] = None

    agent = DQNAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv("CartPole-v0"),
        agent=agent,
        frameskip=1,
        num_environments=1,
        worker_executes_preprocessing=False
    )
    result = worker.execute_timesteps(1000)
    print(result)
def test_sac_on_pendulum(self):
    """
    Creates an SAC-Agent and runs it on Pendulum.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    # Note: SAC is more computationally expensive.
    episodes = 50
    results = worker.execute_episodes(episodes)
    print(results)

    self.assertTrue(results["timesteps_executed"] == episodes * 200)
    self.assertTrue(results["episodes_executed"] == episodes)
    self.assertGreater(results["mean_episode_reward"], -800)
def test_sac_on_cartpole(self):
    """
    Creates an SAC-Agent and runs it on CartPole.
    """
    env = OpenAIGymEnv("CartPole-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs: print(
            "episode: return={} ts={}".format(episode_return, timesteps)
        )
    )
    time_steps = 5000
    results = worker.execute_timesteps(time_steps)
    print(results)

    self.assertTrue(results["timesteps_executed"] == time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps / 20)
    self.assertGreater(results["mean_episode_reward"], 40.0)
    self.assertGreater(results["max_episode_reward"], 100.0)
    self.assertGreater(results["mean_episode_reward_last_10_episodes"], 100.0)
def test_dqn_on_cart_pole(self):
    """
    Creates a DQNAgent and runs it via a Runner on the CartPole Env.
    """
    dummy_env = OpenAIGymEnv("CartPole-v0")
    agent = DQNAgent.from_spec(
        config_from_path("configs/dqn_agent_for_cartpole.json"),
        double_q=False,
        dueling_q=False,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=15),
        update_spec=dict(update_interval=4, batch_size=24, sync_interval=64),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )
    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=15),
        agent=agent,
        render=self.is_windows,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 25)
    self.assertGreaterEqual(results["max_episode_reward"], 100.0)
    self.assertLessEqual(results["episodes_executed"], 200)
def test_sac_2x2_grid_world_with_container_actions(self):
    """
    Creates a SAC agent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/sac_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")

    agent = SACAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space,
    )
    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
def test_ppo_on_cart_pole(self):
    """
    Creates a PPO Agent and runs it via a Runner on the CartPole env.
    """
    env = OpenAIGymEnv("CartPole-v0", seed=36)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, env_num: print(
            "episode return {}; steps={}".format(episode_return, timesteps)
        )
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)
    # Assume we have learned something.
    self.assertGreaterEqual(results["mean_episode_reward"], 40.0)
def test_double_dueling_dqn_on_cart_pole(self):
    """
    Creates a double and dueling DQNAgent and runs it via a Runner on the CartPole Env.
    """
    dummy_env = OpenAIGymEnv("CartPole-v0")
    agent = DQNAgent.from_spec(
        config_from_path("configs/dqn_agent_for_cartpole.json"),
        double_q=True,
        dueling_q=True,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        observe_spec=dict(buffer_size=200),
        execution_spec=dict(seed=156),
        update_spec=dict(update_interval=4, batch_size=64, sync_interval=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )
    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=10),
        agent=agent,
        render=self.is_windows,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    #print("STATES:\n{}".format(agent.last_q_table["states"]))
    #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 15)
    self.assertGreaterEqual(results["max_episode_reward"], 160.0)
    self.assertLessEqual(results["episodes_executed"], 100)
def test_pong_with_worker(self):
    env_spec = dict(
        type="openai",
        gym_env="PongNoFrameskip-v4",
        # The frameskip in the agent config will trigger worker skips; this
        # one is used for the internal env.
        frameskip=4,
        max_num_noops=30,
        episodic_life=False
    )
    env = OpenAIGymEnv.from_spec(env_spec)
    agent_config = config_from_path("configs/backend_performance_dqn_pong.json")
    # Test CPU settings for batching here (no updates).
    agent_config["update_spec"] = None

    agent = DQNAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        frameskip=1,
        preprocessing_spec=agent_config["preprocessing_spec"],
        worker_executes_preprocessing=True
    )
    result = worker.execute_timesteps(1000)
    print(result)
def test_sac_on_pendulum(self):
    """
    Creates an SAC-Agent and runs it on Pendulum.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs: print(
            "episode: return={} ts={}".format(episode_return, timesteps)
        )
    )
    # Note: SAC is more computationally expensive.
    episodes = 50
    results = worker.execute_episodes(episodes)
    print(results)

    self.assertTrue(results["timesteps_executed"] == episodes * 200)
    self.assertTrue(results["episodes_executed"] == episodes)
    self.assertGreater(results["mean_episode_reward_last_10_episodes"], -700)
    self.assertGreater(results["max_episode_reward"], -100)
def test_dqn_on_pong(self):
    """
    Creates a DQNAgent and runs it via a Runner on an openAI Pong Env.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True, visualize=False)
    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=self.pong_preprocessed_state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    time_steps = 4000000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=True,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
def test_double_dqn_on_2x2_grid_world_single_action_to_container(self):
    """
    Tests how DQN solves a mapping of a single integer to multiple actions
    (as opposed to using container actions).
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_single_to_container.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    action_space = IntBox(0, 18)
    agent = DQNAgent.from_spec(
        agent_config,
        huber_loss=True,
        double_q=True,
        dueling_q=True,
        state_space=FloatBox(shape=(4,)),
        action_space=action_space,
        store_last_q_table=True
    )
    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
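# A minimal sketch (not part of the test above) of how a flat IntBox(0, 18) action could be
# interpreted as the "ftj" container: 18 = 3 (forward) * 3 (turn) * 2 (jump). The component
# sizes and their ordering are assumptions for illustration; the actual mapping is defined
# inside the GridWorld env and the agent config.
def split_flat_action(flat_action):
    """Unflattens a single integer in [0, 18) into (forward, turn, jump) indices."""
    forward, rest = divmod(flat_action, 3 * 2)  # 3 turn values * 2 jump values per forward value
    turn, jump = divmod(rest, 2)
    return forward, turn, jump

assert split_flat_action(17) == (2, 2, 1)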
def test_actor_critic_on_cart_pole(self):
    """
    Creates an Actor-Critic Agent and runs it via a Runner on the CartPole Env.
    """
    env_spec = dict(type="open-ai-gym", gym_env="CartPole-v0", visualize=False)  # self.is_windows
    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = ActorCriticAgent.from_spec(
        config_from_path("configs/actor_critic_agent_for_cartpole.json"),
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    time_steps = 20000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 20)
    self.assertGreaterEqual(results["max_episode_reward"], 100.0)
def test_ppo_on_cart_pole(self):
    """
    Creates a PPO Agent and runs it via a Runner on the CartPole Env.
    """
    env = OpenAIGymEnv("CartPole-v0", seed=36)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    #self.assertGreaterEqual(results["mean_episode_reward"], 23)
    #self.assertGreaterEqual(results["max_episode_reward"], 100.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)
def test_double_dqn_on_2x2_grid_world(self):
    """
    Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld.
    """
    env_spec = dict(world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        dueling_q=False,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=10),
        update_spec=dict(update_interval=4, batch_size=24, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )
    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 350)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })
    agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor-critic is very sensitive to hyper-parameters and might collapse after reaching
    # the maximum reward. In practice, it is recommended to stop training once a reward threshold is reached.
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
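# The main() functions in these example scripts rely on module-level FLAGS (config path, gym env id,
# visualize/render switches) defined elsewhere in each file. Below is a minimal sketch of plausible
# definitions using absl.flags; the exact flag names, defaults, and help strings used by the original
# scripts are assumptions for illustration only.
from absl import flags

flags.DEFINE_string("config", None, "Path to the agent's JSON config file.")
flags.DEFINE_string("env", "CartPole-v0", "OpenAI gym env id to run on.")
flags.DEFINE_boolean("visualize", False, "Whether the env should visualize itself.")
flags.DEFINE_boolean("render", False, "Whether the worker should render episodes.")

FLAGS = flags.FLAGS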
def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_episodes(500, use_exploration=True)
    print(results)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_episodes(100, use_exploration=True)
    #worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()
    agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        finished_episodes = len(episode_returns)
        if finished_episodes % 4 == 0:
            print(
                "Episode {} finished in {:d}sec: total avg. reward={:.2f}; last 10 episodes={:.2f}; last "
                "100 episodes={:.2f}".format(
                    finished_episodes, int(duration), np.mean(episode_returns),
                    np.mean(episode_returns[-min(finished_episodes, 10):]),
                    np.mean(episode_returns[-min(finished_episodes, 100):])
                )
            )

    worker = SingleThreadedWorker(
        env_spec=env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(500000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
def test_sac_on_cartpole(self):
    """
    Creates an SAC-Agent and runs it on CartPole.
    """
    env = OpenAIGymEnv("CartPole-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    time_steps = 10000
    results = worker.execute_timesteps(time_steps)
    print(results)
def test_double_dueling_dqn_on_cart_pole(self):
    """
    Creates a double and dueling DQNAgent and runs it via a Runner on the CartPole Env.
    """
    gym_env = "CartPole-v0"
    dummy_env = OpenAIGymEnv(gym_env)
    config_ = config_from_path("configs/dqn_agent_for_cartpole.json")
    # Add dueling config to agent.
    config_["policy_spec"] = {
        "units_state_value_stream": 3,
        "action_adapter_spec": {
            "pre_network_spec": [{
                "type": "dense",
                "units": 3
            }]
        }
    }
    agent = DQNAgent.from_spec(
        config_,
        double_q=True,
        dueling_q=True,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=13),
        update_spec=dict(update_interval=4, batch_size=64, sync_interval=16),
        optimizer_spec=dict(type="adam", learning_rate=0.01),
        store_last_q_table=True
    )
    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv(gym_env, seed=10),
        agent=agent,
        render=self.is_windows,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 25)
    self.assertLessEqual(results["episodes_executed"], 150)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
def test_sac_learning_on_gaussian_density_as_reward_env(self):
    """
    Creates an SAC-Agent and runs it via a Runner on the GaussianDensityAsRewardEnv.
    """
    env = GaussianDensityAsRewardEnv(episode_length=5)
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_gaussian_density_env.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent)
    worker.execute_episodes(num_episodes=500)
    rewards = worker.finished_episode_rewards[0]  # 0=1st env in vector-env
    self.assertTrue(np.mean(rewards[:100]) < np.mean(rewards[-100:]))

    worker.execute_episodes(num_episodes=100, use_exploration=False, update_spec=None)
    rewards = worker.finished_episode_rewards[0]
    self.assertTrue(len(rewards) == 100)
    evaluation_score = np.mean(rewards)
    self.assertTrue(.5 * env.get_max_reward() < evaluation_score <= env.get_max_reward())
def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, env_num: print(
            "episode return {}; steps={}".format(episode_return, timesteps)
        )
    )
    results = worker.execute_episodes(5000, use_exploration=True)
    print(results)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
def test_ppo_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # -----
    # |^|H|
    # -----
    # | |G|   ^=start, looking up
    # -----
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space
    )
    time_steps = 5000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps)
    # Assume we have learned something.
    self.assertGreaterEqual(results["mean_episode_reward"], -2.0)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = MLAgentsEnv()
    agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        #synchronous_reset=True,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(100000, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
def test_double_dqn_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        double_q=True,
        dueling_q=False,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=15),
        store_last_q_table=True
    )
    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print("LAST q-table:\n{}".format(agent.last_q_table))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -7)
    self.assertGreaterEqual(results["max_episode_reward"], -1.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 3)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (0., 0., -1., 0.): {"forward": (-5.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., -1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
    }
    for state, q_values_forward, q_values_jump in zip(
            agent.last_q_table["states"],
            agent.last_q_table["q_values"]["forward"],
            agent.last_q_table["q_values"]["jump"]):
        state, q_values_forward, q_values_jump = tuple(state), tuple(q_values_forward), tuple(q_values_jump)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(
            q_values_forward, expected_q_values_per_state[state]["forward"], decimals=0)
        recursive_assert_almost_equal(
            q_values_jump, expected_q_values_per_state[state]["jump"], decimals=0)
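# The q-value checks above imply the shape of the "ftj" container action space: "forward" has 3
# discrete choices and "jump" has 2 (the "turn" component is not checked there). A minimal sketch
# of such a space with rlgraph's container spaces follows; the turn size of 3 and the Dict/IntBox
# composition shown here are assumptions for illustration, not verified by the test.
from rlgraph.spaces import Dict, IntBox

assumed_ftj_action_space = Dict({
    "forward": IntBox(3),  # e.g. backward / stay / forward (labels assumed)
    "turn": IntBox(3),     # e.g. turn left / no turn / turn right (size assumed)
    "jump": IntBox(2)      # jump or not
})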
episode_returns = []

def episode_finished_callback(episode_return, duration, timesteps, *args, **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
            len(episode_returns), episode_return, np.mean(episode_returns[-100:])
        ))

worker = SingleThreadedWorker(
    env_spec=lambda: env,
    agent=agent,
    render=False,
    worker_executes_preprocessing=False,
    episode_finish_callback=episode_finished_callback
)
# use_exploration=True for training, False for evaluation.
worker.execute_episodes(1000, use_exploration=True)


def plotting(Baseline, PPO, quit, finished, quitBaseline, finishedBaseline, actionInfo):
    ax1 = plt.subplot(311)
    #ax1.set_title('Scenario 4: average reward of last 100 children without quitting penalty')
    ax1.margins(0.05)
    #ax1.set_xlabel('Number of children')
    ax1.set_title('Average reward of last 100 children')
    ax1.plot(PPO, 'r', label='PPO')