def test_insert_demos(self):
    """
    Tests inserting into the demo memory.
    """
    env = OpenAIGymEnv.from_spec(self.env_spec)
    agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
    agent = DQFDAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    terminals = BoolBox(add_batch_rank=True)
    rewards = FloatBox(add_batch_rank=True)

    # Observe a single demo data point.
    agent.observe_demos(
        preprocessed_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
        actions=env.action_space.with_batch_rank().sample(1),
        rewards=rewards.sample(1),
        next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
        terminals=terminals.sample(1),
    )

    # Observe a batch of demos.
    agent.observe_demos(
        preprocessed_states=agent.preprocessed_state_space.sample(10),
        actions=env.action_space.sample(10),
        rewards=FloatBox().sample(10),
        terminals=terminals.sample(10),
        next_states=agent.preprocessed_state_space.sample(10)
    )
def test_impala_on_cart_pole(self):
    """
    Creates a single IMPALAAgent and runs it via a simple loop on CartPole-v0.
    """
    env_spec = dict(type="open-ai-gym", gym_env="CartPole-v0", seed=10, visualize=self.is_windows)
    config_ = config_from_path("configs/impala_agent_for_cartpole.json")
    config_["environment_spec"] = env_spec
    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = IMPALAAgent.from_spec(
        config_,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=10)
    )
    learn_updates = 300
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("i={}/{} Loss={:.4} Avg-reward={:.2}".format(
            i, learn_updates, float(ret[1]), mean_return))

    # Assume we have learned something: check the average return over the
    # last 100 updates (note: [-100:], not [:-100]).
    average_return_last_n_episodes = np.nanmean(mean_returns[-100:])
    print("Average return over last 100 episodes: {}".format(
        average_return_last_n_episodes))
    self.assertGreater(average_return_last_n_episodes, 30.0)

    time.sleep(3)
    agent.terminate()
    time.sleep(3)
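# The IMPALA test above and the standalone training script further below both call a
# `_calc_mean_return` helper that is not shown here. Below is a minimal sketch, assuming
# `agent.update()` returns a tuple whose element at index 3 is the sampled batch (a dict
# with flat "rewards" and "terminals" arrays) -- that layout is an assumption, not
# confirmed by these snippets.
import numpy as np

def _calc_mean_return(records):
    size = records[3]["rewards"].size
    rewards = records[3]["rewards"].reshape((size,))
    terminals = records[3]["terminals"].reshape((size,))
    # Sum rewards between terminals to recover per-episode returns.
    returns = []
    return_ = 0.0
    for reward, terminal in zip(rewards, terminals):
        return_ += reward
        if terminal:
            returns.append(return_)
            return_ = 0.0
    # Return NaN if no episode finished in this batch; callers aggregate via np.nanmean.
    return np.mean(returns) if returns else np.nan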
def test_update_online(self):
    """
    Tests if joint updates from demo and online memory work.
    """
    env = OpenAIGymEnv.from_spec(self.env_spec)
    agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
    agent = DQFDAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    terminals = BoolBox(add_batch_rank=True)

    # Observe a batch of demos.
    agent.observe_demos(
        preprocessed_states=agent.preprocessed_state_space.sample(32),
        actions=env.action_space.sample(32),
        rewards=FloatBox().sample(32),
        terminals=terminals.sample(32),
        next_states=agent.preprocessed_state_space.sample(32)
    )

    # Observe a batch of online data.
    agent._observe_graph(
        preprocessed_states=agent.preprocessed_state_space.sample(32),
        actions=env.action_space.sample(32),
        rewards=FloatBox().sample(32),
        internals=[],
        terminals=terminals.sample(32),
        next_states=agent.preprocessed_state_space.sample(32)
    )

    # Call update: this draws from both the demo and the online memory.
    agent.update()
def test_actor_critic_on_cart_pole(self):
    """
    Creates an ActorCriticAgent and runs it via a SingleThreadedWorker on the CartPole env.
    """
    env_spec = dict(type="open-ai-gym", gym_env="CartPole-v0", visualize=False)
    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = ActorCriticAgent.from_spec(
        config_from_path("configs/actor_critic_agent_for_cartpole.json"),
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    time_steps = 20000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 20)
    self.assertGreaterEqual(results["max_episode_reward"], 100.0)
def test_pong_with_worker(self):
    env_spec = dict(
        type="openai",
        gym_env="PongNoFrameskip-v4",
        # The frameskip in the agent config triggers worker-side skips; this
        # one is used by the internal env.
        frameskip=4,
        max_num_noops=30,
        episodic_life=False
    )
    env = OpenAIGymEnv.from_spec(env_spec)
    agent_config = config_from_path("configs/backend_performance_dqn_pong.json")

    # Test CPU settings for batching here.
    agent_config["update_spec"] = None

    agent = DQNAgent.from_spec(
        # Uses the 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with a "reduced" action space (only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        frameskip=1,
        preprocessing_spec=agent_config["preprocessing_spec"],
        worker_executes_preprocessing=True
    )
    result = worker.execute_timesteps(1000)
    print(result)
def test_update_from_demos(self):
    """
    Tests the separate API method to update from demos.
    """
    env = OpenAIGymEnv.from_spec(self.env_spec)
    agent_config = config_from_path("configs/dqfd_agent_for_cartpole.json")
    agent = DQFDAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    terminals = BoolBox(add_batch_rank=True)
    rewards = FloatBox(add_batch_rank=True)

    state_1 = agent.preprocessed_state_space.with_batch_rank().sample(1)
    action_1 = [1]
    state_2 = agent.preprocessed_state_space.with_batch_rank().sample(1)
    action_2 = [0]

    # Insert the two fixed state/action pairs (with random rewards and
    # next states) ten times each.
    for _ in range(10):
        agent.observe_demos(
            preprocessed_states=state_1,
            actions=action_1,
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            terminals=terminals.sample(1),
        )
        agent.observe_demos(
            preprocessed_states=state_2,
            actions=action_2,
            rewards=rewards.sample(1),
            next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
            terminals=terminals.sample(1),
        )

    # Update from the demo memory.
    agent.update_from_demos(num_updates=100, batch_size=8)

    # Test if the fixed states map to their fixed actions.
    action = agent.get_action(states=state_1, apply_preprocessing=False, use_exploration=False)
    self.assertEqual(action, action_1)
    action = agent.get_action(states=state_2, apply_preprocessing=False, use_exploration=False)
    self.assertEqual(action, action_2)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env,
        "visualize": FLAGS.visualize
    })
    agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation.
    worker.execute_timesteps(20000, use_exploration=True)

    # Note: A basic actor-critic is very sensitive to hyper-parameters and might
    # collapse after reaching the maximum reward. In practice, stop training once
    # a reward threshold is reached.
    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
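# The main() functions in these scripts assume absl-style flags defined at module level
# but not shown. A minimal sketch of the definitions they rely on; the defaults (and the
# boolean type of --visualize, which one script instead treats as an int count of
# visualized envs) are assumptions, not taken from the original sources.
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("config", None, "Path to the agent's JSON configuration file.")
flags.DEFINE_string("env", None, "OpenAI Gym environment ID to run.")
flags.DEFINE_boolean("visualize", False, "Whether to visualize the environment.")
flags.DEFINE_boolean("render", False, "Whether to render episodes during training.")
flags.DEFINE_string("summary_regexp", None, "Regexp filtering which summaries to record.")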
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    print(env.state_space)
    agent = Agent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    episode_returns = []

    def episode_finished_callback(episode_return, duration, timesteps, **kwargs):
        episode_returns.append(episode_return)
        if len(episode_returns) % 5 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(episode_returns), episode_return, np.mean(episode_returns[-5:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")

    # use_exploration=True for training, False for evaluation:
    worker.execute_episodes(100, use_exploration=True)
    # worker.execute_episodes(100, use_exploration=False)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(episode_returns), np.mean(episode_returns[-10:])
    ))
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    # Override the OpenAI gym env per command line.
    if FLAGS.env is None:
        env_spec = agent_config["environment_spec"]
    else:
        env_spec = dict(type="openai-gym", gym_env=FLAGS.env)

    # Override the number of visualized envs per command line.
    if FLAGS.visualize != -1:
        env_spec["visualize"] = FLAGS.visualize

    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = Agent.from_spec(
        agent_config,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space
    )
    dummy_env.terminate()

    learn_updates = 6000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(3)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config = read_config_file(FLAGS.config)

    env = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": FLAGS.env
    })
    agent = Agent.from_spec(
        agent_config,
        summary_spec=dict(
            summary_regexp=FLAGS.summary_regexp
        ),
        state_space=env.state_space,
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=FLAGS.render,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": 'gym_SmartPrimer:TestEnv-v0'
    })

    improvements = []
    for i in range(nChildren):
        ob_obs = envObs.reset()
        # Normalize each observation feature: (x - offset) / scale.
        ob_obs = (ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]

        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs, time_percentage=time_percentage_obs)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [-4.5, 0, -5, 0, 0, 1.5, 25, 0]) / [9, 1, 10, 1, 1, 3, 50, 1]
            ob_obs = next_ob_obs

            if done:
                improvements.append(envObs.gym_env.rewards)
                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    env = OpenAIGymEnv.from_spec({"type": "openai", "gym_env": FLAGS.env})
    agent = Agent.from_spec(
        # Uses the 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with a "reduced" action space (only 3 actions: up, down, no-op).
        action_space=env.action_space
    )

    rewards = []

    def episode_finished_callback(reward, duration, timesteps, **kwargs):
        rewards.append(reward)
        if len(rewards) % 10 == 0:
            print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
                len(rewards), reward, np.mean(rewards[-10:])
            ))

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=False,
        worker_executes_preprocessing=False,
        episode_finish_callback=episode_finished_callback
    )
    print("Starting workload, this will take some time for the agents to build.")
    results = worker.execute_episodes(200, use_exploration=True)

    print("Mean reward: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.mean(rewards), np.mean(rewards[-10:])
    ))
def evaluate(agent_obs, nChildren):
    envObs = OpenAIGymEnv.from_spec({
        "type": "openai",
        "gym_env": 'gym_SmartPrimer:SmartPrimer-realistic-v2'
    })

    improvements = []
    for i in range(nChildren):
        ob_obs = envObs.reset()
        # Normalize each observation feature: (x - offset) / scale.
        ob_obs = (ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]

        action_list_obs = []
        while True:
            time_percentage_obs = min(agent_obs.timesteps / 1e6, 1.0)
            action = agent_obs.get_action(ob_obs, time_percentage=time_percentage_obs)
            action_list_obs.append(action)

            next_ob_obs, reward, done, Baseinfo = envObs.step(action)
            next_ob_obs = (next_ob_obs - [4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5]) / [8, 4, 1, 1, 1, 3, 30, 10]
            ob_obs = next_ob_obs

            if done:
                improvements.append(envObs.gym_env.info['improvementPerChild'])
                agent_obs.reset()
                break

    return np.mean(improvements), np.std(improvements)
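# The two evaluate() functions above repeat the per-feature normalization
# (ob - offset) / scale inline, with hard-coded constants that must match the
# training-time preprocessing. A small helper could factor this out; the name
# `normalize_obs` is hypothetical, and the constants shown are the ones used
# for the realistic-v2 env above.
import numpy as np

OBS_OFFSET = np.array([4, 4, 0.5, 0.5, 0.5, 1.5, 15, 5], dtype=np.float32)
OBS_SCALE = np.array([8, 4, 1, 1, 1, 3, 30, 10], dtype=np.float32)

def normalize_obs(ob):
    """Shift and scale a raw observation into a roughly unit range."""
    return (np.asarray(ob, dtype=np.float32) - OBS_OFFSET) / OBS_SCALE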
def test_with_final_eval(self):
    """
    Tests if Ape-X can learn a simple environment using a single worker,
    thus replicating DQN.
    """
    env_spec = dict(type="openai", gym_env="CartPole-v0")
    agent_config = config_from_path("configs/apex_agent_cartpole.json")

    # Use n-step adjustments.
    agent_config["execution_spec"]["ray_spec"]["worker_spec"]["n_step_adjustment"] = 3
    agent_config["execution_spec"]["ray_spec"]["apex_replay_spec"]["n_step_adjustment"] = 3
    agent_config["n_step"] = 3

    # Define the executor and test the assembly.
    executor = ApexExecutor(
        environment_spec=env_spec,
        agent_config=agent_config,
    )
    print("Successfully created executor.")

    # Execute the actual workload.
    result = executor.execute_workload(
        workload=dict(num_timesteps=20000, report_interval=1000, report_interval_min_seconds=1)
    )
    print("Finished executing workload:")
    print(result)

    # Get the local agent and rebuild its preprocessor stack with the python
    # backend for evaluation.
    agent = executor.local_agent
    preprocessing_spec = agent_config["preprocessing_spec"]

    # Create the eval env.
    env = OpenAIGymEnv.from_spec(env_spec)

    if preprocessing_spec is not None:
        preprocessing_spec = deepcopy(preprocessing_spec)
        in_space = deepcopy(env.state_space.with_batch_rank())
        # Gather the scopes and switch every preprocessor to the python backend.
        scopes = [preprocessor["scope"] for preprocessor in preprocessing_spec]
        for spec in preprocessing_spec:
            spec["backend"] = "python"

        processor_stack = PreprocessorStack(*preprocessing_spec, backend="python")
        build_space = in_space
        for sub_comp_scope in scopes:
            processor_stack.sub_components[sub_comp_scope].create_variables(
                input_spaces=dict(preprocessing_inputs=build_space),
                action_space=None
            )
            build_space = processor_stack.sub_components[sub_comp_scope].get_preprocessed_space(build_space)
        processor_stack.reset()
    else:
        processor_stack = None

    # Run a few evaluation episodes without exploration.
    ep_rewards = []
    print("Finished learning, starting eval.")
    for _ in range(10):
        state = env.reset()
        terminal = False
        ep_reward = 0
        while not terminal:
            state = agent.state_space.force_batch(state)
            if processor_stack is not None:
                state = processor_stack.preprocess(state)
            actions = agent.get_action(states=state, use_exploration=False, apply_preprocessing=False)
            next_state, step_reward, terminal, info = env.step(actions=actions[0])
            ep_reward += step_reward
            state = next_state
            if terminal:
                ep_rewards.append(ep_reward)
                break

    print("Eval episode rewards:")
    print(ep_rewards)
        ob, reward, done, Baseinfo = env.step(action)
        if done:
            agent.reset()
            break

np.random.seed(2)

agent_config_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + '/agents/ppoSmartPrimer_config.json'
with open(agent_config_path, 'rt') as fp:
    agent_config = json.load(fp)

env = OpenAIGymEnv.from_spec({
    "type": "openai",
    "gym_env": 'gym_SmartPrimer:SmartPrimer-realistic-v2'
})
agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)

episode_returns = []

def episode_finished_callback(episode_return, duration, timesteps, *args, **kwargs):
    episode_returns.append(episode_return)
    if len(episode_returns) % 100 == 0:
        print("Episode {} finished: reward={:.2f}, average reward={:.2f}.".format(
            len(episode_returns), episode_return, np.mean(episode_returns[-100:])
        ))
import json
import os
import pickle
import argparse
import copy
import matplotlib.pyplot as plt

import numpy as np
from rlgraph.agents import Agent
from rlgraph.environments import OpenAIGymEnv

parser = argparse.ArgumentParser(description='example')
parser.add_argument('--seed', type=int, default=3, help='numpy seed')
parser.add_argument('--time', type=int, default=5, help='time')
args = parser.parse_args()

np.random.seed(args.seed)

env = OpenAIGymEnv.from_spec({
    "type": "openai",
    "gym_env": 'gym_SmartPrimer:TestEnv-v0'
})

# Configure the agent settings in this file.
agent_config_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'agents/ppoSmartPrimer_config.json')
with open(agent_config_path, 'rt') as fp:
    agent_config = json.load(fp)

# Retrieve the agent from RLgraph.
agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)

# Define the number of children to simulate