def test_dqn_compilation(self): """Test whether a DQNTrainer can be built with both frameworks.""" config = dqn.DEFAULT_CONFIG.copy() config["num_workers"] = 0 # Run locally. num_iterations = 2 for _ in framework_iterator(config, frameworks=["tf", "eager"]): # Rainbow. rainbow_config = config.copy() rainbow_config["num_atoms"] = 10 rainbow_config["noisy"] = True rainbow_config["double_q"] = True rainbow_config["dueling"] = True rainbow_config["n_step"] = 5 trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() print(results) # double-dueling DQN. plain_config = config.copy() trainer = dqn.DQNTrainer(config=plain_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() print(results)
def test_dqn_compilation(self): """Test whether a DQNTrainer can be built on all frameworks.""" config = dqn.DEFAULT_CONFIG.copy() config["num_workers"] = 2 num_iterations = 1 for _ in framework_iterator(config): # Double-dueling DQN. print("Double-dueling") plain_config = config.copy() trainer = dqn.DQNTrainer(config=plain_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() print(results) check_compute_single_action(trainer) trainer.stop() # Rainbow. print("Rainbow") rainbow_config = config.copy() rainbow_config["num_atoms"] = 10 rainbow_config["noisy"] = True rainbow_config["double_q"] = True rainbow_config["dueling"] = True rainbow_config["n_step"] = 5 trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() print(results) check_compute_single_action(trainer) trainer.stop()
def test_dqn_compilation(self): """Test whether a DQNTrainer can be built on all frameworks.""" config = dqn.DEFAULT_CONFIG.copy() config["num_workers"] = 0 # Run locally. num_iterations = 2 for fw in framework_iterator(config): # double-dueling DQN. plain_config = config.copy() trainer = dqn.DQNTrainer(config=plain_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() print(results) # Rainbow. # TODO(sven): Add torch once DQN-torch supports distributional-Q. if fw == "torch": continue rainbow_config = config.copy() rainbow_config["num_atoms"] = 10 rainbow_config["noisy"] = True rainbow_config["double_q"] = True rainbow_config["dueling"] = True rainbow_config["n_step"] = 5 trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() print(results)
def test_dqn_compilation(self): """Test whether a DQNTrainer can be built on all frameworks.""" num_iterations = 1 config = dqn.dqn.DQNConfig().rollouts(num_rollout_workers=2) for _ in framework_iterator(config, with_eager_tracing=True): # Double-dueling DQN. print("Double-dueling") plain_config = deepcopy(config) trainer = dqn.DQNTrainer(config=plain_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) print(results) check_compute_single_action(trainer) trainer.stop() # Rainbow. print("Rainbow") rainbow_config = deepcopy(config).training(num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5) trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0") for i in range(num_iterations): results = trainer.train() check_train_results(results) print(results) check_compute_single_action(trainer) trainer.stop()
def main(params):
    for key, value in params.items():
        print("Parameter {} is set to {}".format(key, value))

    if not params["use_gym_env"]:
        register_env(params["env_name"], get_env_creator(params["env_name"]))

    if params["model"] == "DQN":
        from ray.rllib.agents import dqn
        ray.init()
        config = dqn.DEFAULT_CONFIG.copy()
        config["framework"] = params["framework"]
        env = str(params["env_name"])
        trainer = dqn.DQNTrainer(config=config, env=env)
        for i in range(100):
            print(trainer.train()["episode_reward_mean"])

    if params["model"] == "PPO":
        ray.init()
        ModelCatalog.register_custom_model("my_model", TorchCustomModel)
        trainer = get_trainer_from_params(params)

    if params["train"]:
        for i in range(params["num_training_iters"]):
            print("starting training iteration {}".format(i))
            trainer.train()
            if i == params["num_training_iters"] - 1:
                checkpoint_path = trainer.save()
                print(checkpoint_path)
def test_evaluation_option(self):
    config = dqn.DEFAULT_CONFIG.copy()
    config.update({
        "env": "CartPole-v0",
        "evaluation_interval": 2,
        "evaluation_num_episodes": 2,
        "evaluation_config": {
            "gamma": 0.98,
        },
        # Use a custom callback that asserts that we are running the
        # configured exact number of episodes per evaluation.
        "callbacks": AssertNumEvalEpisodesCallback,
    })

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = dqn.DQNTrainer(config=config)
        # Given evaluation_interval=2, r0, r2, r4 should not contain
        # evaluation metrics, while r1, r3 should.
        r0 = trainer.train()
        print(r0)
        r1 = trainer.train()
        print(r1)
        r2 = trainer.train()
        print(r2)
        r3 = trainer.train()
        print(r3)
        trainer.stop()

        self.assertFalse("evaluation" in r0)
        self.assertTrue("evaluation" in r1)
        self.assertFalse("evaluation" in r2)
        self.assertTrue("evaluation" in r3)
        self.assertTrue("episode_reward_mean" in r1["evaluation"])
        self.assertNotEqual(r1["evaluation"], r3["evaluation"])
def create_agent(args): """Create DQN agent. Args: args (argparse.Namespace): argparse arguments. Returns: agent (ray.rllib.agents.trainer_template.DQN): DQN agent. """ # Custom configuration config = dqn.DEFAULT_CONFIG.copy() config["double_q"] = True config["dueling"] = True config["framework"] = "torch" config["horizon"] = 1150 config["num_gpus"] = 1 config["num_workers"] = 19 config["train_batch_size"] = 128 # Agent creation agent = dqn.DQNTrainer(env=MissileCommand, config=config) # To optionally load a checkpoint if args.checkpoint: agent.restore(args.checkpoint) # Print model if args.verbose > 0: model = agent.get_policy().model if config["framework"] == "tf": print(type(model.base_model.summary())) elif config["framework"] == "torch": print(model) return agent
def test_traj_view_normal_case(self): """Tests, whether Model and Policy return the correct ViewRequirements. """ config = dqn.DEFAULT_CONFIG.copy() for _ in framework_iterator(config): trainer = dqn.DQNTrainer( config, env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv") policy = trainer.get_policy() view_req_model = policy.model.inference_view_requirements view_req_policy = policy.view_requirements assert len(view_req_model) == 1, view_req_model assert len(view_req_policy) == 8, view_req_policy for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS, SampleBatch.DONES, SampleBatch.NEXT_OBS, SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, "weights", ]: assert key in view_req_policy # None of the view cols has a special underlying data_col, # except next-obs. if key != SampleBatch.NEXT_OBS: assert view_req_policy[key].data_col is None else: assert view_req_policy[key].data_col == SampleBatch.OBS assert view_req_policy[key].shift == 1 trainer.stop()
def test_evaluation_option_always_attach_eval_metrics(self):
    config = dqn.DEFAULT_CONFIG.copy()
    config.update(
        {
            "env": "CartPole-v0",
            "evaluation_interval": 2,
            "evaluation_duration": 2,
            "evaluation_duration_unit": "episodes",
            "evaluation_config": {
                "gamma": 0.98,
            },
            "always_attach_evaluation_results": True,
            # Use a custom callback that asserts that we are running the
            # configured exact number of episodes per evaluation.
            "callbacks": AssertEvalCallback,
        }
    )
    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = dqn.DQNTrainer(config=config)
        # Should always see latest available eval results.
        r0 = trainer.train()
        r1 = trainer.train()
        r2 = trainer.train()
        r3 = trainer.train()
        trainer.stop()

        # Eval results are not available at step 0.
        # But step 3 should still have it, even though no eval was
        # run during that step.
        self.assertTrue("evaluation" in r0)
        self.assertTrue("evaluation" in r1)
        self.assertTrue("evaluation" in r2)
        self.assertTrue("evaluation" in r3)
def test_evaluation_option(self):
    config = dqn.DEFAULT_CONFIG.copy()
    config.update({
        "env": "CartPole-v0",
        "evaluation_interval": 2,
        "evaluation_num_episodes": 2,
        "evaluation_config": {
            "gamma": 0.98,
        },
    })

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = dqn.DQNTrainer(config=config)
        # Given evaluation_interval=2, r0, r2, r4 should not contain
        # evaluation metrics, while r1, r3 should.
        r0 = trainer.train()
        print(r0)
        r1 = trainer.train()
        print(r1)
        r2 = trainer.train()
        print(r2)
        r3 = trainer.train()
        print(r3)
        trainer.stop()

        self.assertFalse("evaluation" in r0)
        self.assertTrue("evaluation" in r1)
        self.assertFalse("evaluation" in r2)
        self.assertTrue("evaluation" in r3)
        self.assertTrue("episode_reward_mean" in r1["evaluation"])
        self.assertNotEqual(r1["evaluation"], r3["evaluation"])
def test_dqn_fake_multi_gpu_learning(self):
    """Test whether DQNTrainer can learn CartPole w/ faked multi-GPU."""
    config = copy.deepcopy(dqn.DEFAULT_CONFIG)
    # Fake GPU setup.
    config["num_gpus"] = 2
    config["_fake_gpus"] = True
    # Double batch size (2 GPUs).
    config["train_batch_size"] = 64
    # Mimic tuned_example for DQN CartPole.
    config["n_step"] = 3
    config["model"]["fcnet_hiddens"] = [64]
    config["model"]["fcnet_activation"] = "linear"

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
        num_iterations = 200
        learnt = False
        for i in range(num_iterations):
            results = trainer.train()
            print("reward={}".format(results["episode_reward_mean"]))
            if results["episode_reward_mean"] > 65.0:
                learnt = True
                break
        assert learnt, \
            "DQN multi-GPU (with fake-GPUs) did not learn CartPole!"
        trainer.stop()
def get_dqn_car_trainer():
    ModelCatalog.register_custom_model("my_model", TorchCustomModel)
    config = {
        "env": StoppingCar,
        # "model": {"custom_model": "my_model", "fcnet_hiddens": [16],
        #           "fcnet_activation": "relu"},  # model config
        # "vf_share_layers": False,  # try different lrs
        # "vf_clip_param": 100,
        "lr": 0.001,
        # "clip_rewards": False,  # 500*1000,
        "grad_clip": 2500,
        # "worker_side_prioritization": True,
        "num_workers": 8,  # parallelism
        # "batch_mode": "complete_episodes",
        "batch_mode": "truncate_episodes",
        "rollout_fragment_length": 2000,
        "num_envs_per_worker": 10,
        "train_batch_size": 4000,
        "hiddens": [16],
        "framework": "torch",
        "horizon": 8000,
        "evaluation_config": {
            # Example: overriding env_config, exploration, etc.:
            # "env_config": {...},
            "explore": False,
        },
    }
    trainer = dqn.DQNTrainer(config=config)
    return trainer, config
def test_on_sub_environment_created_with_remote_envs(self):
    config = {
        "env": "CartPole-v1",
        # Make each sub-environment a ray actor.
        "remote_worker_envs": True,
        # Create 4 sub-environments (ray remote actors) per remote
        # worker.
        "num_envs_per_worker": 4,
        # Create 2 remote workers.
        "num_workers": 2,
        "callbacks": OnSubEnvironmentCreatedCallback,
    }

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = dqn.DQNTrainer(config=config)
        # Fake the counter on the local worker (doesn't have an env) and
        # set it to -1 so the below `foreach_worker()` won't fail.
        trainer.workers.local_worker().sum_sub_env_vector_indices = -1

        # Get sub-env vector index sums from the 2 remote workers:
        sum_sub_env_vector_indices = trainer.workers.foreach_worker(
            lambda w: w.sum_sub_env_vector_indices)
        # Local worker has no environments -> Expect the -1 special
        # value returned by the above lambda.
        self.assertTrue(sum_sub_env_vector_indices[0] == -1)
        # Both remote workers (index 1 and 2) have a vector index counter
        # of 6 (sum of vector indices: 0 + 1 + 2 + 3).
        self.assertTrue(sum_sub_env_vector_indices[1] == 6)
        self.assertTrue(sum_sub_env_vector_indices[2] == 6)
        trainer.stop()
def trainDqn(numIter): """ train """ ray.shutdown() ray.init() config = createConfig() trainer = dqn.DQNTrainer(config=config, env=HiLoPricingEnv) for i in range(numIter): print("\n**** next iteration " + str(i)) HiLoPricingEnv.count = 0 result = trainer.train() print(pretty_print(result)) print("env reset count " + str(HiLoPricingEnv.count)) policy = trainer.get_policy() weights = policy.get_weights() #print("policy weights") #print(weights) model = policy.model #summary = model.base_model.summary() #print("model summary") #print(weights) return trainer
def __new__(cls, config={}):
    name = config.pop("agent", None)
    if name == "DQN":
        return dqn.DQNTrainer(config=config)
    elif name == "PPO":
        return ppo.APPOTrainer(config=config)
    else:
        raise Exception("{} agent is not supported".format(name))
def test_dqn_compilation(self): """Test whether a DQNTrainer can be built with both frameworks.""" config = dqn.DEFAULT_CONFIG.copy() config["num_workers"] = 0 # Run locally. # tf. config["eager"] = True trainer = dqn.DQNTrainer(config=config, env="CartPole-v0") num_iterations = 2 for i in range(num_iterations): results = trainer.train() print(results) config["eager"] = False trainer = dqn.DQNTrainer(config=config, env="CartPole-v0") num_iterations = 2 for i in range(num_iterations): results = trainer.train() print(results)
def loadTrainer(path): """ load trainer from checkpoint """ ray.shutdown() ray.init() config = createConfig() trainer = dqn.DQNTrainer(config=config, env=HiLoPricingEnv) trainer.restore(path) return trainer
def test_dqn_compilation(self): """Test whether a DQNTrainer can be built with both frameworks.""" config = dqn.DEFAULT_CONFIG.copy() config["num_workers"] = 0 # Run locally. # Rainbow. rainbow_config = config.copy() rainbow_config["eager"] = False rainbow_config["num_atoms"] = 10 rainbow_config["noisy"] = True rainbow_config["double_q"] = True rainbow_config["dueling"] = True rainbow_config["n_step"] = 5 trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0") num_iterations = 2 for i in range(num_iterations): results = trainer.train() print(results) # tf. tf_config = config.copy() tf_config["eager"] = False trainer = dqn.DQNTrainer(config=tf_config, env="CartPole-v0") num_iterations = 1 for i in range(num_iterations): results = trainer.train() print(results) # Eager. eager_config = config.copy() eager_config["eager"] = True eager_ctx = eager_mode() eager_ctx.__enter__() trainer = dqn.DQNTrainer(config=eager_config, env="CartPole-v0") num_iterations = 1 for i in range(num_iterations): results = trainer.train() print(results) eager_ctx.__exit__(None, None, None)
def test_leaky_policy(self): """Tests, whether our diagnostics tools can detect leaks in a policy.""" config = dqn.DEFAULT_CONFIG.copy() # Make sure we have an env to test on the local worker. # Otherwise, `check_memory_leaks` will complain. config["create_env_on_driver"] = True config["env"] = "CartPole-v0" config["multiagent"]["policies"] = { "default_policy": PolicySpec(policy_class=MemoryLeakingPolicy), } trainer = dqn.DQNTrainer(config=config) results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300) assert results["policy"] trainer.stop()
def _discrete_run(self):
    import ray
    from ray import tune
    from ray.rllib.agents import dqn
    from ray.rllib.agents.dqn import DEFAULT_CONFIG

    DEFAULT_CONFIG["framework"] = "torch"
    # `mode` selects between plain DQN and double DQN.
    if self.configs["mode"]:
        DEFAULT_CONFIG["double_q"] = False
    else:
        DEFAULT_CONFIG["double_q"] = True

    # Both entries map to the DQNTrainer class; `tune.run` expects a
    # trainer class (or registered string), not a trainer instance.
    AGENT_CONFIG = {
        "dqn": dqn.DQNTrainer,
        "ddqn": dqn.DQNTrainer,
    }
    agent = AGENT_CONFIG[self.configs["algorithm"]]
    tune.run(agent, config={"env": "CartPole-v0", "framework": "torch"})
def train_rllib_policy(config): """Trains a DQNTrainer on MsPacman-v0 for n iterations. Saves the trained Trainer to disk and returns the checkpoint path. Returns: str: The saved checkpoint to restore the trainer DQNTrainer from. """ # Create trainer from config. trainer = dqn.DQNTrainer(config=config) # Train for n iterations, then save. for _ in range(args.train_iters): print(trainer.train()) return trainer.save()
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise Exception("Not valid agent name")
    return agent
def test_traj_view_normal_case(self): """Tests, whether Model and Policy return the correct ViewRequirements. """ config = dqn.DEFAULT_CONFIG.copy() config["num_envs_per_worker"] = 10 config["rollout_fragment_length"] = 4 for _ in framework_iterator(config): trainer = dqn.DQNTrainer( config, env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv") policy = trainer.get_policy() view_req_model = policy.model.inference_view_requirements view_req_policy = policy.view_requirements assert len(view_req_model) == 1, view_req_model assert len(view_req_policy) == 8, view_req_policy for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS, SampleBatch.DONES, SampleBatch.NEXT_OBS, SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX, "weights", ]: assert key in view_req_policy # None of the view cols has a special underlying data_col, # except next-obs. if key != SampleBatch.NEXT_OBS: assert view_req_policy[key].data_col is None else: assert view_req_policy[key].data_col == SampleBatch.OBS assert view_req_policy[key].data_rel_pos == 1 rollout_worker = trainer.workers.local_worker() sample_batch = rollout_worker.sample() expected_count = \ config["num_envs_per_worker"] * \ config["rollout_fragment_length"] assert sample_batch.count == expected_count for v in sample_batch.data.values(): assert len(v) == expected_count trainer.stop()
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    return agent
def create_agent(args): """Create XXX agent. Args: args (argparse.Namespace): argparse arguments. Returns: agent (ray.rllib.agents.trainer_template.XXX): XXX agent. """ # Custom configuration config = dqn.DEFAULT_CONFIG.copy() config["double_q"] = True config["dueling"] = True config["framework"] = "torch" config["lr"] = 5e-4 config["num_gpus"] = 1 config["num_workers"] = 1 config["train_batch_size"] = 128 # Custom model config["model"]["fcnet_activation"] = "relu" config["model"]["fcnet_hiddens"] = [64, 64] # Agent creation agent = dqn.DQNTrainer(env=GymEnv, config=config) # To optionally load a checkpoint if args.checkpoint: agent.restore(args.checkpoint) # Print model if args.verbose > 0: model = agent.get_policy().model if config["framework"] == "tf": print(type(model.base_model.summary())) elif config["framework"] == "torch": print(model) return agent
def test_dqn_exploration_and_soft_q_config(self):
    """Tests whether a DQN Agent outputs exploration/softmaxed actions."""
    config = dqn.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
    obs = np.array(0)

    # Test against all frameworks.
    for _ in framework_iterator(config):
        # Default EpsilonGreedy setup.
        trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
        # Setting explore=False should always return the same action.
        a_ = trainer.compute_action(obs, explore=False)
        for _ in range(50):
            a = trainer.compute_action(obs, explore=False)
            check(a, a_)
        # explore=None (default: explore) should return different actions.
        actions = []
        for _ in range(50):
            actions.append(trainer.compute_action(obs))
        check(np.std(actions), 0.0, false=True)

        # Low softmax temperature. Behaves like argmax
        # (but no epsilon exploration).
        config["exploration_config"] = {
            "type": "SoftQ",
            "temperature": 0.000001,
        }
        trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
        # Due to the low temp, always expect the same action.
        actions = [trainer.compute_action(obs)]
        for _ in range(50):
            actions.append(trainer.compute_action(obs))
        check(np.std(actions), 0.0, decimals=3)

        # Higher softmax temperature.
        config["exploration_config"]["temperature"] = 1.0
        trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
        # Even with the higher temperature, if we set explore=False, we
        # should expect the same actions always.
        a_ = trainer.compute_action(obs, explore=False)
        for _ in range(50):
            a = trainer.compute_action(obs, explore=False)
            check(a, a_)
        # Due to the higher temp, expect different actions avg'ing
        # around 1.5.
        actions = []
        for _ in range(300):
            actions.append(trainer.compute_action(obs))
        check(np.std(actions), 0.0, false=True)

        # With Random exploration.
        config["exploration_config"] = {"type": "Random"}
        config["explore"] = True
        trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
        actions = []
        for _ in range(300):
            actions.append(trainer.compute_action(obs))
        check(np.std(actions), 0.0, false=True)
def test_dqn_parameter_noise_exploration(self):
    """Tests whether a DQN Agent works with ParameterNoise."""
    obs = np.array(0)
    core_config = dqn.DEFAULT_CONFIG.copy()
    core_config["num_workers"] = 0  # Run locally.
    core_config["env_config"] = {"is_slippery": False, "map_name": "4x4"}

    # Test against all frameworks.
    for fw in framework_iterator(core_config):
        config = core_config.copy()

        # DQN with ParameterNoise exploration (config["explore"]=True).
        # ----
        config["exploration_config"] = {"type": "ParameterNoise"}
        config["explore"] = True

        trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
        policy = trainer.get_policy()
        p_sess = getattr(policy, "_sess", None)
        self.assertFalse(policy.exploration.weights_are_currently_noisy)
        noise_before = self._get_current_noise(policy, fw)
        check(noise_before, 0.0)
        initial_weights = self._get_current_weight(policy, fw)

        # Pseudo-start an episode and compare the weights before and after.
        policy.exploration.on_episode_start(policy, tf_sess=p_sess)
        self.assertFalse(policy.exploration.weights_are_currently_noisy)
        noise_after_ep_start = self._get_current_noise(policy, fw)
        weights_after_ep_start = self._get_current_weight(policy, fw)
        # Should be the same, as we don't do anything at the beginning of
        # the episode, only one step later.
        check(noise_after_ep_start, noise_before)
        check(initial_weights, weights_after_ep_start)

        # Setting explore=False should always return the same action.
        a_ = trainer.compute_action(obs, explore=False)
        self.assertFalse(policy.exploration.weights_are_currently_noisy)
        noise = self._get_current_noise(policy, fw)
        # We sampled the first noise (not zero anymore).
        check(noise, 0.0, false=True)
        # But still not applied b/c explore=False.
        check(self._get_current_weight(policy, fw), initial_weights)
        for _ in range(10):
            a = trainer.compute_action(obs, explore=False)
            check(a, a_)
            # Noise never gets applied.
            check(self._get_current_weight(policy, fw), initial_weights)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)

        # Explore=None (default: True) should return different actions.
        # However, this is only due to the underlying epsilon-greedy
        # exploration.
        actions = []
        current_weight = None
        for _ in range(10):
            actions.append(trainer.compute_action(obs))
            self.assertTrue(policy.exploration.weights_are_currently_noisy)
            # Now, noise actually got applied (explore=True).
            current_weight = self._get_current_weight(policy, fw)
            check(current_weight, initial_weights, false=True)
            check(current_weight, initial_weights + noise)
        check(np.std(actions), 0.0, false=True)

        # Pseudo-end the episode and compare weights again.
        # Make sure they are the original ones.
        policy.exploration.on_episode_end(policy, tf_sess=p_sess)
        weights_after_ep_end = self._get_current_weight(policy, fw)
        check(current_weight - noise, weights_after_ep_end, decimals=5)

        # DQN with ParameterNoise exploration (config["explore"]=False).
        # ----
        config = core_config.copy()
        config["exploration_config"] = {"type": "ParameterNoise"}
        config["explore"] = False

        trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
        policy = trainer.get_policy()
        p_sess = getattr(policy, "_sess", None)
        self.assertFalse(policy.exploration.weights_are_currently_noisy)
        initial_weights = self._get_current_weight(policy, fw)

        # Noise before anything (should be 0.0, no episode started yet).
        noise = self._get_current_noise(policy, fw)
        check(noise, 0.0)

        # Pseudo-start an episode and compare the weights before and after
        # (they should be the same).
        policy.exploration.on_episode_start(policy, tf_sess=p_sess)
        self.assertFalse(policy.exploration.weights_are_currently_noisy)
        # Should be the same, as we don't do anything at the beginning of
        # the episode, only one step later.
        noise = self._get_current_noise(policy, fw)
        check(noise, 0.0)
        noisy_weights = self._get_current_weight(policy, fw)
        check(initial_weights, noisy_weights)

        # Setting explore=False or None should always return the same
        # action.
        a_ = trainer.compute_action(obs, explore=False)
        # Now we have re-sampled.
        noise = self._get_current_noise(policy, fw)
        check(noise, 0.0, false=True)
        for _ in range(5):
            a = trainer.compute_action(obs, explore=None)
            check(a, a_)
            a = trainer.compute_action(obs, explore=False)
            check(a, a_)

        # Pseudo-end the episode and compare weights again.
        # Make sure they are the original ones (no noise permanently
        # applied throughout the episode).
        policy.exploration.on_episode_end(policy, tf_sess=p_sess)
        weights_after_episode_end = self._get_current_weight(policy, fw)
        check(initial_weights, weights_after_episode_end)
        # Noise should still be the same (re-sampling only happens at
        # beginning of episode).
        noise_after = self._get_current_noise(policy, fw)
        check(noise, noise_after)

        # Switch off EpsilonGreedy underlying exploration.
        # ----
        config = core_config.copy()
        config["exploration_config"] = {
            "type": "ParameterNoise",
            "sub_exploration": {
                "type": "EpsilonGreedy",
                "action_space": trainer.get_policy().action_space,
                "initial_epsilon": 0.0,  # <- no randomness whatsoever
            },
        }
        config["explore"] = True
        trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
        # Now, when we act - even with explore=True - we would expect
        # the same action for the same input (parameter noise is
        # deterministic).
        policy = trainer.get_policy()
        p_sess = getattr(policy, "_sess", None)
        policy.exploration.on_episode_start(policy, tf_sess=p_sess)
        a_ = trainer.compute_action(obs)
        for _ in range(10):
            a = trainer.compute_action(obs, explore=True)
            check(a, a_)
def main():
    args = parser.parse_args()
    ray.init()

    if args.agent not in ["DQN", "SlateQ"]:
        raise ValueError(args.agent)

    env_config = {
        "slate_size": args.env_slate_size,
        "seed": args.env_seed,
        "convert_to_discrete_action_space": args.agent == "DQN",
    }

    if args.use_tune:
        time_signature = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
        name = f"SlateQ/{args.agent}-seed{args.env_seed}-{time_signature}"
        if args.agent == "DQN":
            tune.run(
                "DQN",
                stop={"timesteps_total": 4000000},
                name=name,
                config={
                    "env": recsim_env_name,
                    "num_gpus": args.num_gpus,
                    "num_workers": args.num_workers,
                    "env_config": env_config,
                },
                num_samples=args.tune_num_samples,
                verbose=1,
            )
        else:
            tune.run(
                "SlateQ",
                stop={"timesteps_total": 4000000},
                name=name,
                config={
                    "env": recsim_env_name,
                    "num_gpus": args.num_gpus,
                    "num_workers": args.num_workers,
                    "slateq_strategy": tune.grid_search(ALL_SLATEQ_STRATEGIES),
                    "env_config": env_config,
                },
                num_samples=args.tune_num_samples,
                verbose=1,
            )
    else:
        # Directly run using the trainer interface (good for debugging).
        if args.agent == "DQN":
            config = dqn.DEFAULT_CONFIG.copy()
            config["num_gpus"] = 0
            config["num_workers"] = 0
            config["env_config"] = env_config
            trainer = dqn.DQNTrainer(config=config, env=recsim_env_name)
        else:
            config = slateq.DEFAULT_CONFIG.copy()
            config["num_gpus"] = 0
            config["num_workers"] = 0
            config["slateq_strategy"] = args.strategy
            config["env_config"] = env_config
            trainer = slateq.SlateQTrainer(config=config, env=recsim_env_name)
        for i in range(10):
            result = trainer.train()
            print(pretty_print(result))

    ray.shutdown()
def main():
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    env_config = {
        "num_candidates": args.env_num_candidates,
        "resample_documents": not args.env_dont_resample_documents,
        "slate_size": args.env_slate_size,
        "seed": args.env_seed,
        "convert_to_discrete_action_space": args.run == "DQN",
    }

    config = {
        "env": (InterestEvolutionRecSimEnv
                if args.env == "interest-evolution"
                else InterestExplorationRecSimEnv
                if args.env == "interest-exploration"
                else LongTermSatisfactionRecSimEnv),
        "framework": args.framework,
        "num_gpus": args.num_gpus,
        "num_workers": args.num_workers,
        "env_config": env_config,
        "learning_starts": args.learning_starts,
    }

    # Perform a test run on the env with a random agent to see what
    # the random baseline reward is.
    if args.random_test_episodes:
        print(
            f"Running {args.random_test_episodes} episodes to get a random "
            "agent's baseline reward ...")
        env = config["env"](config=env_config)
        env.reset()
        num_episodes = 0
        episode_rewards = []
        episode_reward = 0.0
        while num_episodes < args.random_test_episodes:
            action = env.action_space.sample()
            _, r, d, _ = env.step(action)
            episode_reward += r
            if d:
                num_episodes += 1
                episode_rewards.append(episode_reward)
                episode_reward = 0.0
                env.reset()
        print(
            f"Ran {args.random_test_episodes} episodes with a random agent "
            "reaching a mean episode return of "
            f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}.")

    if args.use_tune:
        stop = {
            "training_iteration": args.stop_iters,
            "timesteps_total": args.stop_timesteps,
            "episode_reward_mean": args.stop_reward,
        }
        if args.run == "SlateQ":
            config.update({
                "slateq_strategy": args.slateq_strategy,
            })
        results = tune.run(
            args.run,
            stop=stop,
            config=config,
            num_samples=args.tune_num_samples,
            verbose=2,
        )
        if args.as_test:
            check_learning_achieved(results, args.stop_reward)
    else:
        # Directly run using the trainer interface (good for debugging).
        if args.run == "DQN":
            trainer = dqn.DQNTrainer(config=config)
        else:
            config.update({
                "slateq_strategy": args.slateq_strategy,
            })
            trainer = slateq.SlateQTrainer(config=config)
        for i in range(10):
            result = trainer.train()
            print(pretty_print(result))

    ray.shutdown()
mc = max(checkpoint_numbers)
checkpoint_path = path_to_results + "/" + "checkpoint_{}/checkpoint-{}".format(mc, mc)
print("found {} checkpoints".format(len(checkpoint_numbers)))
print("restoring " + checkpoint_path)

# ============================================================== #
# evaluation {{{
# ============================================================== #
# ray.init()
# You may need to change the temp directory in case it runs on a cluster
# or shared machine.
ray.init(temp_dir=tmpdir + "/ray")

if config["optimizer_class"] == "AsyncReplayOptimizer":
    trainer = dqn.ApexTrainer(config=config, env=CodeEnv)
else:
    trainer = dqn.DQNTrainer(config=config, env=CodeEnv)
trainer.restore(checkpoint_path)

env = CodeEnv(env_config)
n = env.n
dB_len = len(dB_range)
BitErr = np.zeros([dB_len], dtype=int)
CwErr = np.zeros([dB_len], dtype=int)
totCw = np.zeros([dB_len], dtype=int)
totBit = np.zeros([dB_len], dtype=int)

for i in range(dB_len):
    print("\n--------\nSimulating EbNo = {} dB".format(dB_range[i]))
    env.set_EbNo_dB(dB_range[i])
    while (CwErr[i] < minCwErr and totCw[i] + 1 <= maxCw):