def test_impala_on_cart_pole(self):
    """
    Creates a single IMPALAAgent and runs it via a simple loop on CartPole-v0.
    """
    env_spec = dict(type="open-ai-gym", gym_env="CartPole-v0", seed=10, visualize=self.is_windows)
    config_ = config_from_path("configs/impala_agent_for_cartpole.json")
    config_["environment_spec"] = env_spec
    dummy_env = OpenAIGymEnv.from_spec(env_spec)
    agent = IMPALAAgent.from_spec(
        config_,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=10)
    )
    learn_updates = 300
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("i={}/{} Loss={:.4} Avg-reward={:.2}".format(
            i, learn_updates, float(ret[1]), mean_return
        ))

    # Assume we have learned something: check the average return over the last 100 updates.
    average_return_last_n_episodes = np.nanmean(mean_returns[-100:])
    print("Average return over last n episodes: {}".format(average_return_last_n_episodes))
    self.assertGreater(average_return_last_n_episodes, 30.0)

    time.sleep(3)
    agent.terminate()
    time.sleep(3)
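# NOTE (editor's sketch, not part of the original file): the tests here call
# `self._calc_mean_return(ret)`, a helper that is not shown in this excerpt. Below is a minimal
# sketch, assuming `ret[3]` is a dict holding flat "rewards" and "terminals" arrays from the
# sampled trajectories; the actual helper in the test class may differ.
def _calc_mean_return(self, records):
    """Splits the sampled rewards into episodes (via terminals) and returns their mean return."""
    rewards = records[3]["rewards"].reshape((-1,))
    terminals = records[3]["terminals"].reshape((-1,))
    returns = []
    return_ = 0.0
    for reward, terminal in zip(rewards, terminals):
        return_ += reward
        if terminal:
            returns.append(return_)
            return_ = 0.0
    # NaN if no episode terminated within this batch (callers use np.nanmean to cope with that).
    return np.mean(returns) if returns else np.nan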
def test_impala_on_breakout(self):
    """
    Creates a single IMPALAAgent and runs it via a simple loop on an openAI Breakout Env.
    """
    env = OpenAIGymEnv("Breakout-v0", frameskip=4, max_num_noops=30, episodic_life=True, visualize=False)
    config_ = config_from_path("configs/impala_agent_for_breakout.json")
    agent = IMPALAAgent.from_spec(
        config_,
        state_space=env.state_space,
        action_space=env.action_space,
    )
    learn_updates = 4000000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("i={} Loss={:.4} Avg-reward={:.2}".format(i, float(ret[1]), mean_return))

    time.sleep(3)
    agent.terminate()
    time.sleep(3)
def test_impala_actor_compilation(self):
    """
    Tests IMPALA agent compilation (actor).
    """
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env = DeepmindLabEnv(
        level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    actor_agent = IMPALAAgent.from_spec(
        agent_config,
        type="actor",
        state_space=env.state_space,
        action_space=env.action_space,
        internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
        # Disable monitoring so session-creation does not hang in docker.
        execution_spec=dict(disable_monitoring=True)
    )
    # Start the Specifiable Server with the Env manually.
    actor_agent.environment_stepper.environment_server.start()
    print("Compiled IMPALA type=actor agent.")
    actor_agent.environment_stepper.environment_server.stop()
def test_impala_learner_compilation(self):
    """
    Tests IMPALA agent compilation (learner).
    """
    # Test is currently disabled via this early return.
    return
    if get_backend() == "pytorch":
        return
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env_spec = dict(level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
    dummy_env = DeepmindLabEnv.from_spec(env_spec)
    learner_agent = IMPALAAgent.from_spec(
        agent_config,
        type="learner",
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        internal_states_space=IMPALAAgent.default_internal_states_space,
        environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
        # Setup distributed tf.
        execution_spec=dict(
            mode="distributed",
            #gpu_spec=dict(
            #    gpus_enabled=True,
            #    max_usable_gpus=1,
            #    num_gpus=1
            #),
            distributed_spec=dict(job="learner", task_index=0, cluster_spec=self.impala_cluster_spec),
            session_config=dict(
                type="monitored-training-session",
                allow_soft_placement=True,
                log_device_placement=True,
                auto_start=False
            ),
            disable_monitoring=True,
            enable_timeline=True,
        )
    )
    print("Compiled IMPALA type=learner agent without starting the session (would block waiting for actor).")

    ## Take one batch from the filled up queue and run an update_from_memory with the learner.
    #update_steps = 10
    #time_start = time.perf_counter()
    #for _ in range(update_steps):
    #    agent.call_api_method("update_from_memory")
    #time_total = time.perf_counter() - time_start
    #print("Done learning {}xbatch-of-{} in {}sec ({} updates/sec).".format(
    #    update_steps, agent.update_spec["batch_size"], time_total, update_steps / time_total)
    #)

    learner_agent.terminate()
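# NOTE (editor's sketch, not part of the original file): `self.impala_cluster_spec` used above is
# defined elsewhere in the test class. A plausible single-machine value, assuming one learner and
# one actor process on localhost (host/port values here are placeholders), might look like:
impala_cluster_spec = dict(
    learner=["localhost:22222"],
    actor=["localhost:22223"]
)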
def test_impala_single_agent_compilation(self):
    """
    Tests IMPALA agent compilation (single-node mode).
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        update_spec=dict(batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05)
    )
    agent.terminate()
    print("Compiled IMPALA type=single agent.")
def test_impala_single_agent_compilation(self):
    """
    Tests IMPALA agent compilation (single-node mode).
    """
    # Test is currently disabled via this early return.
    return
    if get_backend() == "pytorch":
        return
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        update_spec=dict(batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        # Disable monitoring so session-creation does not hang in docker.
        execution_spec=dict(disable_monitoring=True)
    )
    agent.terminate()
    print("Compiled {}".format(agent))
def test_impala_on_2x2_grid_world(self):
    """
    Creates a single IMPALAAgent and runs it via a simple loop on a 2x2 GridWorld.
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=12),
        update_spec=dict(batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05)
    )
    learn_updates = 50
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        print("i={} Loss={:.4} Avg-reward={:.2}".format(i, float(ret[1]), mean_return))

    # Assume we have learned something.
    self.assertGreater(mean_return, -0.1)

    # Check the last action probs for the 2 valid next_states (start (after a reset) and one below start).
    action_probs = ret[3]["action_probs"].reshape((80, 4))
    next_states = ret[3]["states"][:, 1:].reshape((80,))
    for s_, probs in zip(next_states, action_probs):
        # Start state:
        # - Assume we picked "right" in state=1 (in order to step into goal state).
        # - OR we picked "up" or "left" in state=0 (unlikely, but possible).
        if s_ == 0:
            recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
            self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
            recursive_assert_almost_equal(probs[3], 0.0, decimals=2)
        # One below start:
        # - Assume we picked "down" in start state with very large probability.
        # - OR we picked "left" or "down" in state=1 (unlikely, but possible).
        elif s_ == 1:
            recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
            self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
            recursive_assert_almost_equal(probs[3], 0.0, decimals=2)

    agent.terminate()
def test_impala_on_2x2_grid_world(self):
    """
    Creates a single IMPALAAgent and runs it via the IMPALAWorker on a simple 2x2 GridWorld.
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=12),
        update_spec=dict(update_interval=4, batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
    )
    learn_updates = 1000

    # Setup the queue runner.
    agent.call_api_method("setup_queue_runner")
    for _ in range(learn_updates):
        agent.update()

    #print("STATES:\n{}".format(agent.last_q_table["states"]))
    #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    #self.assertEqual(results["timesteps_executed"], time_steps)
    #self.assertEqual(results["env_frames"], time_steps)
    #self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
    #self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    #self.assertLessEqual(results["episodes_executed"], 350)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_impala_actor_compilation(self):
    """
    Tests IMPALA agent compilation (actor).
    """
    # Test is currently disabled via this early return.
    return
    if get_backend() == "pytorch":
        return
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env_spec = dict(level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
    dummy_env = DeepmindLabEnv.from_spec(env_spec)
    agent = IMPALAAgent.from_spec(
        agent_config,
        type="actor",
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=False),
        environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
        # Disable monitoring so session-creation does not hang in docker.
        execution_spec=dict(
            session_config=dict(
                type="monitored-training-session",
                auto_start=False
            ),
            disable_monitoring=True
        )
    )
    # Start the Specifiable Server with the Env manually (monitoring is disabled).
    agent.environment_stepper.environment_server.start_server()
    print("Compiled {}".format(agent))
    agent.environment_stepper.environment_server.stop_server()
    agent.terminate()
def test_impala_learner_compilation(self):
    """
    Tests IMPALA agent compilation (learner).
    """
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env = DeepmindLabEnv(
        level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    learner_agent = IMPALAAgent.from_spec(
        agent_config,
        type="learner",
        state_space=env.state_space,
        action_space=env.action_space,
        internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
    )
    print("Compiled IMPALA type=learner agent.")
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    cluster_spec_config_path = os.path.join(os.getcwd(), FLAGS.cluster_spec)
    with open(cluster_spec_config_path, 'rt') as fp:
        cluster_spec = json.load(fp)

    # Environment options.
    env_spec = {
        "type": "deepmind-lab",
        "level_id": FLAGS.level,
        "frameskip": 4,
        "observations": ["RGB_INTERLEAVED", "INSTR"]
    }

    # Verbose usage errors.
    if FLAGS.actor and FLAGS.learner:
        print("Please only use either --actor or --learner, not both.")
        sys.exit(1)

    # We dynamically update the distributed spec according to the job and task index.
    if FLAGS.actor:
        agent_type = 'actor'
        distributed_spec = dict(
            job='actor',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
        # Actors should only act on CPUs.
        agent_config['execution_spec']['gpu_spec'].update({
            "gpus_enabled": False,
            "max_usable_gpus": 0,
            "num_gpus": 0
        })
    elif FLAGS.learner:
        agent_type = 'learner'
        distributed_spec = dict(
            job='learner',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
    else:
        print("Please pass either --learner or --actor (or look at the CartPole example for single training mode).")
        sys.exit(1)

    # Set the sample size for the workers.
    worker_sample_size = 100

    # Since we dynamically update the cluster spec according to the job and task index, we need to
    # manually update the execution spec as well.
    execution_spec = agent_config['execution_spec']
    execution_spec.update(dict(
        mode="distributed",
        distributed_spec=distributed_spec
    ))

    # Now, create the environment.
    env = Environment.from_spec(env_spec)

    agent_spec = dict(
        type=agent_type,
        architecture="large",
        environment_spec=env_spec,
        worker_sample_size=worker_sample_size,
        state_space=env.state_space,
        action_space=env.action_space,
        # TODO: automate this (by lookup from NN).
        internal_states_space=IMPALAAgent.default_internal_states_space,
        execution_spec=execution_spec,
        # update_spec=dict(batch_size=2),
        # Summarize time-steps to have an overview of the env-stepping speed.
        summary_spec=dict(
            summary_regexp="time-step",
            directory="/tmp/impala_{}_{}/".format(agent_type, FLAGS.task)
        )
    )
    agent_config.update(agent_spec)

    agent = IMPALAAgent(**agent_config)

    if FLAGS.learner:
        print("Starting learner for {} updates.".format(FLAGS.updates))
        for _ in range(FLAGS.updates):
            start_time = time.perf_counter()
            results = agent.update()
    else:
        # Actor just acts.
        print("Starting actor. Terminate with SIGINT (Ctrl+C).")
        while True:
            agent.call_api_method("perform_n_steps_and_insert_into_fifo")  #.monitored_session.run([agent.enqueue_op])

    # A final short update loop with return logging (only reached on the learner path; the actor
    # loop above never returns).
    learn_updates = 100
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(1)
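# NOTE (editor's sketch, not part of the original file): main() above references FLAGS.config,
# FLAGS.cluster_spec, FLAGS.level, FLAGS.actor, FLAGS.learner, FLAGS.task and FLAGS.updates, but
# the flag definitions and entry point are not shown in this excerpt. A minimal sketch using
# absl.flags is given below; the default values and help strings are assumptions.
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string("config", "configs/impala_agent_for_deepmind_lab_env.json", "Path to the agent config JSON.")
flags.DEFINE_string("cluster_spec", "configs/impala_cluster_spec.json", "Path to the cluster spec JSON.")
flags.DEFINE_string("level", "seekavoid_arena_01", "DeepMind Lab level id to run.")
flags.DEFINE_boolean("actor", False, "Run this process as an IMPALA actor.")
flags.DEFINE_boolean("learner", False, "Run this process as the IMPALA learner.")
flags.DEFINE_integer("task", 0, "Task index of this process within its job.")
flags.DEFINE_integer("updates", 1000, "Number of learner updates to run.")

if __name__ == "__main__":
    main(sys.argv)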