def test_td3_exploration_and_with_random_prerun(self):
    """Tests TD3's Exploration (w/ random actions for n timesteps)."""
    config = td3.TD3_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["num_gpus"] = 1
    obs = np.array([0.0, 0.1, -0.1])

    # Test against all frameworks.
    for _ in framework_iterator(config):
        lcl_config = config.copy()
        # Default GaussianNoise setup.
        trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
        # Setting explore=False should always return the same action.
        a_ = trainer.compute_action(obs, explore=False)
        self.assertEqual(trainer.get_policy().global_timestep, 1)
        for i in range(50):
            a = trainer.compute_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, i + 2)
            check(a, a_)

        # explore=None (default: explore) should return different actions.
        actions = []
        for i in range(50):
            actions.append(trainer.compute_action(obs))
            self.assertEqual(trainer.get_policy().global_timestep, i + 52)
        check(np.std(actions), 0.0, false=True)
        trainer.stop()

        # Check randomness at beginning.
        lcl_config["exploration_config"] = {
            # Act randomly at beginning ...
            "random_timesteps": 30,
            # Then act very closely to deterministic actions thereafter.
            "stddev": 0.001,
            "initial_scale": 0.001,
            "final_scale": 0.001,
        }
        trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
        # ts=0 (get a deterministic action as per explore=False).
        deterministic_action = trainer.compute_action(obs, explore=False)
        self.assertEqual(trainer.get_policy().global_timestep, 1)
        # ts=1-29 (in random window).
        random_a = []
        for i in range(1, 30):
            random_a.append(trainer.compute_action(obs, explore=True))
            self.assertEqual(trainer.get_policy().global_timestep, i + 1)
            check(random_a[-1], deterministic_action, false=True)
        self.assertTrue(np.std(random_a) > 0.5)

        # ts > 30 (a=deterministic_action + scale * N[0,1])
        for i in range(50):
            a = trainer.compute_action(obs, explore=True)
            self.assertEqual(trainer.get_policy().global_timestep, i + 31)
            check(a, deterministic_action, rtol=0.1)

        # ts >> 30 (BUT: explore=False -> expect deterministic action).
        for i in range(50):
            a = trainer.compute_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, i + 81)
            check(a, deterministic_action)
        trainer.stop()
def test_td3_exploration_and_with_random_prerun(self):
    """Tests TD3's Exploration (w/ random actions for n timesteps)."""
    config = td3.TD3_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    obs = np.array([0.0, 0.1, -0.1])

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if fw != "tf":
            continue
        config["eager"] = True if fw == "eager" else False
        config["use_pytorch"] = True if fw == "torch" else False

        # Default GaussianNoise setup.
        trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
        # Setting explore=False should always return the same action.
        a_ = trainer.compute_action(obs, explore=False)
        for _ in range(50):
            a = trainer.compute_action(obs, explore=False)
            check(a, a_)

        # explore=None (default: explore) should return different actions.
        actions = []
        for _ in range(50):
            actions.append(trainer.compute_action(obs))
        check(np.std(actions), 0.0, false=True)

        # Check randomness at beginning.
        config["exploration_config"] = {
            # Act randomly at beginning ...
            "random_timesteps": 30,
            # Then act very closely to deterministic actions thereafter.
            "stddev": 0.001,
            "initial_scale": 0.001,
            "final_scale": 0.001,
        }
        trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
        # ts=1 (get a deterministic action as per explore=False).
        deterministic_action = trainer.compute_action(obs, explore=False)
        # ts=2-30 (in random window).
        random_a = []
        for _ in range(29):
            random_a.append(trainer.compute_action(obs, explore=True))
            check(random_a[-1], deterministic_action, false=True)
        self.assertTrue(np.std(random_a) > 0.5)

        # ts > 30 (a=deterministic_action + scale * N[0,1])
        for _ in range(50):
            a = trainer.compute_action(obs, explore=True)
            check(a, deterministic_action, rtol=0.1)

        # ts >> 30 (BUT: explore=False -> expect deterministic action).
        for _ in range(50):
            a = trainer.compute_action(obs, explore=False)
            check(a, deterministic_action)
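# Both exploration-test variants above lean on RLlib's `check` helper from
# ray.rllib.utils.test_utils. A minimal numpy-only stand-in is sketched here
# (an assumption, not the real implementation) so the assertions read in
# isolation: it asserts near-equality, or inequality when false=True.
import numpy as np

def check(x, y, rtol=1e-5, false=False):
    """Sketch of RLlib's check(): (in)equality assertion on array-likes."""
    close = np.allclose(np.asarray(x), np.asarray(y), rtol=rtol)
    assert close != false, f"check failed: {x} vs {y} (false={false})"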
def test_td3_compilation(self):
    """Test whether a TD3Trainer can be built with both frameworks."""
    config = td3.TD3_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.

    # Test against all frameworks.
    for _ in framework_iterator(config, frameworks=["tf"]):
        trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
        num_iterations = 2
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
def test_td3_compilation(self):
    """Test whether a TD3Trainer can be built with both frameworks."""
    config = td3.TD3_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.

    # Test against all frameworks.
    for _ in framework_iterator(config, with_eager_tracing=True):
        trainer = td3.TD3Trainer(config=config, env="Pendulum-v1")
        num_iterations = 1
        for i in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
        check_compute_single_action(trainer)
        trainer.stop()
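# The helpers used in the newer compilation test above come from RLlib's test
# utilities (Ray 1.x layout; stated as an assumption since the snippet's
# imports are not shown):
from ray.rllib.utils.test_utils import (
    check_compute_single_action,
    check_train_results,
    framework_iterator,
)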
def test_td3_compilation(self):
    """Test whether a TD3Trainer can be built with both frameworks."""
    config = td3.TD3_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if fw != "tf":
            continue
        config["eager"] = True if fw == "eager" else False
        config["use_pytorch"] = True if fw == "torch" else False

        trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
        num_iterations = 2
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
def test_td3_fake_multi_gpu_learning(self):
    """Test whether TD3Trainer can run SimpleEnv w/ faked multi-GPU."""
    config = td3.TD3_DEFAULT_CONFIG.copy()
    # Fake GPU setup.
    config["num_gpus"] = 2
    config["_fake_gpus"] = True

    env = "ray.rllib.agents.sac.tests.test_sac.SimpleEnv"
    config["env_config"] = {"config": {"repeat_delay": 0}}

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = td3.TD3Trainer(config=config, env=env)
        num_iterations = 2
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        trainer.stop()
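# The `_fake_gpus` flag simulates the multi-GPU code path on CPU, so the test
# above runs on machines without physical GPUs. A stand-alone sketch of the
# same setup (assumptions: Pendulum-v1 swapped in for the test-only SimpleEnv,
# Ray 1.x import layout):
import ray
from ray.rllib.agents.ddpg import td3

ray.init()
config = td3.TD3_DEFAULT_CONFIG.copy()
config["num_gpus"] = 2       # Two simulated GPU towers ...
config["_fake_gpus"] = True  # ... backed by CPU only.
trainer = td3.TD3Trainer(config=config, env="Pendulum-v1")
print(trainer.train()["episode_reward_mean"])
trainer.stop()
ray.shutdown()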
ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2) if algorithm == 'A2C': RLAgent = a2c.A2CTrainer(env=env_name, config=config) elif algorithm == 'APEX_DDPG': RLAgent = apex.ApexDDPGTrainer(env=env_name, config=config) elif algorithm == 'DDPG': RLAgent = ddpg.DDPGTrainer(env=env_name, config=config) elif algorithm == 'IMPALA': RLAgent = impala.ImpalaTrainer(env=env_name, config=config) elif algorithm == 'PPO': RLAgent = ppo.PPOTrainer(env=env_name, config=config) elif algorithm == 'SAC': RLAgent = sac.SACTrainer(env=env_name, config=config) elif algorithm == 'TD3': RLAgent = td3.TD3Trainer(env=env_name, config=config) RLAgent.restore(checkpoint_path) num_runs = 50 totalRewards = np.empty((num_runs, )) for j in range(num_runs): observations = env.reset() rewards, action_dict = {}, {} for agent_id in env.agent_ids: assert isinstance(agent_id, int), "Error: agent_ids are not ints." action_dict = dict( zip(env.agent_ids, [env.action_space_dict[i].sample() for i in env.agent_ids])) rewards[agent_id] = 0
# Module-level imports assumed by this function (Ray 1.x layout; not shown in
# the original). RBFModel, MLP and Linear are project-local custom models.
import json
import os
import pickle
import re
import time

import gym
import numpy as np
import ray
from ray.rllib.agents import a3c, ars, ddpg, es, pg, ppo, sac
from ray.rllib.agents.ddpg import td3
from ray.rllib.models import ModelCatalog


def render(checkpoint, home_path):
    """Renders pybullet and mujoco environments."""
    alg = re.match('.+?(?=_)',
                   os.path.basename(os.path.normpath(home_path))).group(0)
    current_env = re.search("(?<=_).*?(?=_)",
                            os.path.basename(os.path.normpath(home_path))).group(0)
    checkpoint_path = (home_path + "checkpoint_" + str(checkpoint) +
                       "/checkpoint-" + str(checkpoint))
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))

    ray.shutdown()
    import pybullet_envs  # Registers the Bullet envs with gym on import.
    ray.init()

    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    if alg == "SAC":
        trainer = sac.SACTrainer(config)
    if alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    if alg == "PG":
        trainer = pg.PGTrainer(config)
    if alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    if alg == "TD3":
        trainer = td3.TD3Trainer(config)
    if alg == "ES":
        trainer = es.ESTrainer(config)
    if alg == "ARS":
        trainer = ars.ARSTrainer(config)
    # "normalize_actions": true,

    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    # env.unwrapped.reset_model = det_reset_model
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    m_act_hist = []
    state_hist = []
    obs_hist = []
    reward_hist = []

    done = False
    step = 0

    for t in range(10000):
        # For some algorithms you can get the sample mean out; change the
        # slice index to match your env for now:
        # mean_actions = out_dict['behaviour_logits'][:17]
        # sampled_actions, _, out_dict = trainer.compute_action(obs.flatten(), full_fetch=True)
        sampled_actions = trainer.compute_action(obs.flatten())
        actions = sampled_actions

        obs, reward, done, _ = env.step(np.asarray(actions))
        # env.camera_adjust()
        env.render(mode='human')
        time.sleep(0.01)
        # env.render(mode='rgb_array', close=True)
        # p.computeViewMatrix(cameraEyePosition=[0, 10, 5], cameraTargetPosition=[0, 0, 0], cameraUpVector=[0, 0, 0])
        # if step % 1000 == 0:
        #     env.reset()
        # step += 1

        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()

    # print(sum(reward_hist))
    # plt.plot(action_hist)
    # plt.figure()
    # plt.plot(obs_hist)
    # plt.figure()

    # Reminder: the behaviour logits that come out are the mean and logstd
    # (not log mean, despite the name "logit").
    # trainer.compute_action(obs, full_fetch=True)
    trainer.compute_action(obs)
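# Hypothetical invocation of render(). The directory layout (an "alg_env_..."
# folder containing params.json, params.pkl and checkpoint subfolders) is
# inferred from the regex parsing above; the path and checkpoint number are
# placeholders only:
render(checkpoint=500, home_path="./results/TD3_HopperBulletEnv-v0_0/")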
# Imports assumed by this script (not shown in the snippet). DEBUG, SEED,
# logger_creator and the registered "env" id are defined elsewhere; a sketch
# of the train/test helpers follows below.
import logging

import ray
from ray.rllib.agents.ddpg import td3

# General options
config = td3.TD3_DEFAULT_CONFIG.copy()
config["framework"] = "tf"
config["eager_tracing"] = True
config["log_level"] = logging.DEBUG if DEBUG else logging.ERROR
config["seed"] = SEED

# Environment options
config["horizon"] = 1000

# TD3-specific options
config["learning_starts"] = 10000
config["exploration_config"]["random_timesteps"] = 20000

# ====================== Run the optimization ======================
train_agent = td3.TD3Trainer(config, "env", logger_creator)
checkpoint_path = train(train_agent, max_timesteps=1000000)

# ===================== Enjoy a trained agent ======================
test_agent = td3.TD3Trainer(config, "env")
test_agent.restore(checkpoint_path)
test(test_agent, explore=False)

# =================== Terminate the Ray backend ====================
train_agent.stop()
test_agent.stop()
ray.shutdown()
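# Minimal sketches of the train/test helpers assumed above. These are
# illustrative stand-ins, not the original script's code: train() iterates
# until a timestep budget is reached and returns the last checkpoint path;
# test() rolls out the (optionally greedy) policy. "Pendulum-v1" stands in
# for the registered "env" id.
import gym

def train(agent, max_timesteps):
    checkpoint_path = None
    while agent.train()["timesteps_total"] < max_timesteps:
        checkpoint_path = agent.save()
    return checkpoint_path

def test(agent, explore=False, num_episodes=5):
    env = gym.make("Pendulum-v1")
    for _ in range(num_episodes):
        obs, done, episode_reward = env.reset(), False, 0.0
        while not done:
            action = agent.compute_action(obs, explore=explore)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        print("episode reward:", episode_reward)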