# Imports assumed by the snippets in this section (RLgraph's API; the exact
# module paths below are assumptions, not taken verbatim from the original files):
import numpy as np

from rlgraph.agents import Agent, PPOAgent, SACAgent
from rlgraph.environments import GaussianDensityAsRewardEnv, OpenAIGymEnv
from rlgraph.execution import SingleThreadedWorker
from rlgraph.tests.test_util import config_from_path


def test_sac_on_pendulum(self):
    """
    Creates an SAC-Agent and runs it on Pendulum.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    # Note: SAC is more computationally expensive.
    episodes = 50
    results = worker.execute_episodes(episodes)
    print(results)

    # Pendulum-v0 episodes always run for exactly 200 timesteps.
    self.assertTrue(results["timesteps_executed"] == episodes * 200)
    self.assertTrue(results["episodes_executed"] == episodes)
    self.assertGreater(results["mean_episode_reward"], -800)
def test_sac_on_pendulum(self):
    """
    Creates an SAC-Agent and runs it on Pendulum.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs: print(
            "episode: return={} ts={}".format(episode_return, timesteps)
        )
    )
    # Note: SAC is more computationally expensive.
    episodes = 50
    results = worker.execute_episodes(episodes)
    print(results)

    self.assertTrue(results["timesteps_executed"] == episodes * 200)
    self.assertTrue(results["episodes_executed"] == episodes)
    self.assertGreater(results["mean_episode_reward_last_10_episodes"], -700)
    self.assertGreater(results["max_episode_reward"], -100)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)

    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_episodes(100, use_exploration=True)
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
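# The script above references FLAGS defined at module level, which are not
# shown in this snippet. A minimal sketch of those definitions, assuming
# absl-py: the flag names are inferred from the usages above, while the
# defaults and help strings are illustrative assumptions.
import json
import os
import sys

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("config", "./agent_config.json", "Path to the agent's JSON config file.")
flags.DEFINE_string("env", "Pendulum-v0", "Name of the OpenAI Gym env to run on.")
flags.DEFINE_boolean("render", False, "Whether to render the env while executing.")

if __name__ == "__main__":
    main(sys.argv)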
def test_sac_learning_on_gaussian_density_as_reward_env(self):
    """
    Creates an SAC-Agent and runs it via a Runner on the GaussianDensityAsRewardEnv.
    """
    env = GaussianDensityAsRewardEnv(episode_length=5)
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_gaussian_density_env.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent)
    worker.execute_episodes(num_episodes=500)

    rewards = worker.finished_episode_rewards[0]  # 0=1st env in vector-env
    self.assertTrue(np.mean(rewards[:100]) < np.mean(rewards[-100:]))

    worker.execute_episodes(num_episodes=100, use_exploration=False, update_spec=None)
    rewards = worker.finished_episode_rewards[0]
    self.assertTrue(len(rewards) == 100)
    evaluation_score = np.mean(rewards)
    self.assertTrue(.5 * env.get_max_reward() < evaluation_score <= env.get_max_reward())
def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_episodes(500, use_exploration=True)
    print(results)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })

    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
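# read_config_file is not defined in this snippet. A minimal sketch of the
# helper under the assumption that it simply loads a JSON config from disk
# (the same json.load pattern the other scripts in this section inline);
# the helper's actual home module is not shown here.
import json


def read_config_file(path):
    """Loads and returns an agent config dict from a JSON file."""
    with open(path, "rt") as fp:
        return json.load(fp)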
def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, env_num: print(
            "episode return {}; steps={}".format(episode_return, timesteps)
        )
    )
    results = worker.execute_episodes(5000, use_exploration=True)
    print(results)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})

    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
import matplotlib.pyplot as plt  # needed by plotting() below

# Callback header reconstructed from the matching snippet further down;
# the original excerpt began mid-function.
def episode_finished_callback(episode_return, duration, timesteps, *args, **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
            len(episode_returns), episode_return, np.mean(episode_returns[-100:])
        ))

worker = SingleThreadedWorker(
    env_spec=lambda: env,
    agent=agent,
    render=False,
    worker_executes_preprocessing=False,
    episode_finish_callback=episode_finished_callback
)
# use_exploration=True for training, False for evaluation.
worker.execute_episodes(1000, use_exploration=True)


def plotting(Baseline, PPO, quit, finished, quitBaseline, finishedBaseline, actionInfo):
    ax1 = plt.subplot(311)
    # ax1.set_title('Scenario 4: average reward of last 100 children without quitting penalty')
    ax1.margins(0.05)
    # ax1.set_xlabel('Number of children')
    ax1.set_title('Average reward of last 100 children')
    ax1.plot(PPO, 'r', label='PPO')
    ax1.plot(Baseline, 'b', label='Baseline')
    ax1.legend()

    ax4 = plt.subplot(312)
    ax4.set_title('Actions taken')
    action_space=env.action_space)

# Define the number of children (episodes) to simulate.
episode_count = 1000
episode_returns = []


def episode_finished_callback(episode_return, duration, timesteps, *args, **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
            len(episode_returns), episode_return, np.mean(episode_returns[-100:])
        ))

# Create the worker.
worker = SingleThreadedWorker(
    env_spec=lambda: env,
    agent=agent,
    render=False,
    worker_executes_preprocessing=False,
    episode_finish_callback=episode_finished_callback
)
# use_exploration=True for training, False for evaluation.
worker.execute_episodes(episode_count, use_exploration=True)

# Make the plots.
env.gym_env.render()
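# A possible evaluation pass after training, mirroring the pattern in the
# GaussianDensityAsRewardEnv test above (use_exploration=False and
# update_spec=None disable exploration and learning updates). This is a
# sketch, not part of the original script:
worker.execute_episodes(100, use_exploration=False, update_spec=None)
print("Mean evaluation reward: {:.2f}".format(np.mean(episode_returns[-100:])))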