def test_cartpole_lstm(self):
    with ray_start_client_server():
        assert ray.util.client.ray.is_connected()
        config = dict(
            {
                "num_sgd_iter": 5,
                "vf_loss_coeff": 0.0001,
            },
            **{
                "env": StatelessCartPole,
                # Use GPUs iff `RLLIB_NUM_GPUS` env var is set to > 0.
                "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
                # NOTE: Keep all model settings in this one dict. A second
                # top-level "model" key in the first dict would be silently
                # overridden by this one.
                "model": {
                    "vf_share_layers": True,
                    "use_lstm": True,
                    "lstm_cell_size": 256,
                    "lstm_use_prev_action": False,
                    "lstm_use_prev_reward": False,
                },
                "framework": "tf",
                # Set to True to run with tracing enabled for tfe/tf2.
                "eager_tracing": False,
            })
        stop = {
            "training_iteration": 200,
            "timesteps_total": 100000,
            "episode_reward_mean": 150.0,
        }
        results = tune.run("PPO", config=config, stop=stop, verbose=2)
        check_learning_achieved(results, 150.0)
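# Every snippet in this collection funnels into `check_learning_achieved`
# (RLlib's utility of that name lives in `ray.rllib.utils.test_utils`).
# For reference, a paraphrased sketch of the check, NOT the verbatim
# implementation: scan all trials' final results and fail if the best
# mean episode reward stays below the threshold.
def check_learning_achieved_sketch(tune_results, min_reward, evaluation=False):
    best_reward = max(
        trial.last_result["evaluation"]["episode_reward_mean"] if evaluation
        else trial.last_result["episode_reward_mean"]
        for trial in tune_results.trials)
    if best_reward < min_reward:
        raise ValueError(f"`stop-reward` of {min_reward} not reached!")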
def test_curiosity_on_partially_observable_domain(self):
    config = ppo.DEFAULT_CONFIG.copy()
    config["env"] = "mini-grid"
    config["env_config"] = {
        # Also works with:
        # - MiniGrid-MultiRoom-N4-S5-v0
        # - MiniGrid-MultiRoom-N2-S4-v0
        "name": "MiniGrid-Empty-8x8-v0",
        "framestack": 1,  # seems to work even w/o framestacking
    }
    # Make it impossible to reach the goal by chance.
    config["horizon"] = 15
    config["num_envs_per_worker"] = 4
    config["model"]["fcnet_hiddens"] = [256, 256]
    config["model"]["fcnet_activation"] = "relu"
    config["num_sgd_iter"] = 8
    config["num_workers"] = 0
    config["exploration_config"] = {
        "type": "Curiosity",
        # For the feature NN, use a non-LSTM fcnet (same as the one
        # in the policy model).
        "eta": 0.1,
        "lr": 0.0003,  # 0.0005 seems to work fine as well.
        "feature_dim": 64,
        # No actual feature net: map directly from observations to the
        # feature vector (linearly).
        "feature_net_config": {
            "fcnet_hiddens": [],
            "fcnet_activation": "relu",
        },
        "sub_exploration": {
            "type": "StochasticSampling",
        },
    }
    min_reward = 0.001
    stop = {
        "training_iteration": 25,
        "episode_reward_mean": min_reward,
    }
    for _ in framework_iterator(config, frameworks="torch"):
        # To replay:
        # trainer = ppo.PPOTrainer(config=config)
        # trainer.restore("[checkpoint file]")
        # env = env_maker(config["env_config"])
        # s = env.reset()
        # for _ in range(10000):
        #     s, r, d, _ = env.step(trainer.compute_action(s))
        #     if d:
        #         s = env.reset()
        #     env.render()
        results = tune.run("PPO", config=config, stop=stop, verbose=1)
        check_learning_achieved(results, min_reward)
        iters = results.trials[0].last_result["training_iteration"]
        print("Reached in {} iterations.".format(iters))
def run_same_policy(args, stop):
    """Use the same policy for both agents (trivial case)."""
    config = {
        "env": RockPaperScissors,
        "framework": "torch" if args.torch else "tf",
    }
    results = tune.run("PG", config=config, stop=stop, verbose=1)
    if args.as_test:
        # Check vs 0.0, as we are playing a zero-sum game.
        check_learning_achieved(results, 0.0)
def train_model(self, run, config, stop, output_folder, checkpoint_freq,
                save_folder, latest_checkpoint_path, as_test, stop_reward):
    """Train the model via RLlib's `tune.run` command.

    :param run: Deep reinforcement learning algorithm to use.
    :param config: Training configuration.
    :param stop: Stop condition(s) for the training.
    :param output_folder: Path under which checkpoints are saved.
    :param checkpoint_freq: Frequency (in iterations) at which checkpoints
        are saved.
    :param save_folder: Name of the experiment subdirectory for the
        checkpoints.
    :param latest_checkpoint_path: Path to the latest checkpoint from which
        to resume training. If None, a new session is created.
    :param as_test: Whether to check after training that `stop_reward` was
        reached.
    :param stop_reward: Reward threshold used for the `as_test` check.
    """
    if stop is None:
        raise ValueError(
            "No stop condition was set for the training! You need at least one.")
    results = tune.run(run,
                       config=config,
                       stop=stop,
                       checkpoint_at_end=True,
                       local_dir=output_folder,
                       checkpoint_freq=checkpoint_freq,
                       name=save_folder,
                       restore=latest_checkpoint_path)
    if as_test:
        check_learning_achieved(results, stop_reward)
    ray.shutdown()
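# Hypothetical usage of `train_model` (all values invented for
# illustration; `agent` stands for whatever object hosts the method):
agent.train_model(
    run="PPO",
    config={"env": "CartPole-v0", "num_workers": 2, "framework": "torch"},
    stop={"training_iteration": 50, "episode_reward_mean": 150.0},
    output_folder="~/ray_results",
    checkpoint_freq=10,
    save_folder="ppo_cartpole",
    latest_checkpoint_path=None,  # None -> start a fresh session
    as_test=True,
    stop_reward=150.0,
)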
"evaluation_num_workers": args.evaluation_num_workers, # Evaluate every other training iteration (together # with every other call to Trainer.train()). "evaluation_interval": args.evaluation_interval, # Run for n episodes/timesteps (properly distribute load amongst # all eval workers). The longer it takes to evaluate, the more sense # it makes to use `evaluation_parallel_to_training=True`. # Use "auto" to run evaluation for roughly as long as the training # step takes. "evaluation_duration": args.evaluation_duration, # "episodes" or "timesteps". "evaluation_duration_unit": args.evaluation_duration_unit, # Use a custom callback that asserts that we are running the # configured exact number of episodes per evaluation OR - in auto # mode - run at least as many episodes as we have eval workers. "callbacks": AssertEvalCallback, } stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } results = tune.run(args.run, config=config, stop=stop, verbose=2) if args.as_test: check_learning_achieved(results, args.stop_reward) ray.shutdown()
def main(): args = parser.parse_args() ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) env_config = { "num_candidates": args.env_num_candidates, "resample_documents": not args.env_dont_resample_documents, "slate_size": args.env_slate_size, "seed": args.env_seed, "convert_to_discrete_action_space": args.run == "DQN", } config = { "env": (InterestEvolutionRecSimEnv if args.env == "interest-evolution" else InterestExplorationRecSimEnv if args.env == "interest-exploration" else LongTermSatisfactionRecSimEnv), "framework": args.framework, "num_gpus": args.num_gpus, "num_workers": args.num_workers, "env_config": env_config, "learning_starts": args.learning_starts, } # Perform a test run on the env with a random agent to see, what # the random baseline reward is. if args.random_test_episodes: print(f"Running {args.random_test_episodes} episodes to get a random " "agent's baseline reward ...") env = config["env"](config=env_config) env.reset() num_episodes = 0 episode_rewards = [] episode_reward = 0.0 while num_episodes < args.random_test_episodes: action = env.action_space.sample() _, r, d, _ = env.step(action) episode_reward += r if d: num_episodes += 1 episode_rewards.append(episode_reward) episode_reward = 0.0 env.reset() print(f"Ran {args.random_test_episodes} episodes with a random agent " "reaching a mean episode return of " f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}.") if args.use_tune: stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } if args.run == "SlateQ": config.update({ "slateq_strategy": args.slateq_strategy, }) results = tune.run( args.run, stop=stop, config=config, num_samples=args.tune_num_samples, verbose=2, ) if args.as_test: check_learning_achieved(results, args.stop_reward) else: # Directly run using the trainer interface (good for debugging). if args.run == "DQN": trainer = dqn.DQNTrainer(config=config) else: config.update({ "slateq_strategy": args.slateq_strategy, }) trainer = slateq.SlateQTrainer(config=config) for i in range(10): result = trainer.train() print(pretty_print(result)) ray.shutdown()
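# The random-baseline measurement above also appears verbatim in the
# second `main` further below; it could be factored into a small helper.
# A sketch, assuming the standard gym `step` API:
import numpy as np
from scipy.stats import sem


def run_random_baseline(env, num_episodes):
    """Return mean and standard error of a random agent's episode returns."""
    episode_rewards = []
    episode_reward = 0.0
    env.reset()
    while len(episode_rewards) < num_episodes:
        _, r, d, _ = env.step(env.action_space.sample())
        episode_reward += r
        if d:
            episode_rewards.append(episode_reward)
            episode_reward = 0.0
            env.reset()
    return np.mean(episode_rewards), sem(episode_rewards)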
"evaluation_num_workers": 2, # Optional custom eval function. "custom_eval_function": eval_fn, # Enable evaluation, once per training iteration. "evaluation_interval": 1, # Run 10 episodes each time evaluation runs. "evaluation_duration": 10, # Override the env config for evaluation. "evaluation_config": { "env_config": { # Evaluate using LONGER corridor than trained on. "corridor_length": 5, }, }, "framework": args.framework, } stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } results = tune.run("PG", config=config, stop=stop, verbose=1) # Check eval results (from eval workers using the custom function), # not results from the regular workers. if args.as_test: check_learning_achieved(results, args.stop_reward, evaluation=True) ray.shutdown()
"multiagent": { "policies_to_train": list(learned_planner_policy.keys()), "policies": { **citizen_policies, **learned_planner_policy }, "policy_mapping_fn": policy_mapping_fn, }, }) # training loop citizen_results = tune.run(args.run, config=citizen_config, stop=stop) # stop after some criteria, then train the government (some convergence criteria of the policies?) planner_results = tune.run(args.run, config=planner_config, stop=stop) if args.as_test: check_learning_achieved(citizen_results, args.stop_reward) check_learning_achieved(planner_results, args.stop_reward) else: # initialize policies policies = { "planner_policy": (PPOTFPolicy, env.planner_observation_space, env.planner_action_space, { "model": { "custom_model": "action_mask_model", "use_lstm": True, "custom_model_config": { "actual_obs_space": env.planner_observation_space["actual_obs"] }
def main(): args = parser.parse_args() ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) env_config = { "num_candidates": args.env_num_candidates, "resample_documents": not args.env_dont_resample_documents, "slate_size": args.env_slate_size, "seed": args.env_seed, "convert_to_discrete_action_space": args.run == "DQN", } config = { "env": (InterestEvolutionRecSimEnv if args.env == "interest-evolution" else InterestExplorationRecSimEnv if args.env == "interest-exploration" else LongTermSatisfactionRecSimEnv), "hiddens": [ 1024, 1024, ], "num_gpus": args.num_gpus, "num_workers": args.num_workers, "env_config": env_config, "lr_choice_model": 0.003, "lr_q_model": 0.003, "rollout_fragment_length": 4, "exploration_config": { "epsilon_timesteps": 50000, "final_epsilon": 0.02, }, "target_network_update_freq": 1, "tau": 5e-3, "evaluation_interval": 1, "evaluation_num_workers": 4, "evaluation_duration": 200, "evaluation_duration_unit": "episodes", "evaluation_parallel_to_training": True, } # Perform a test run on the env with a random agent to see, what # the random baseline reward is. if args.random_test_episodes: print(f"Running {args.random_test_episodes} episodes to get a random " "agent's baseline reward ...") env = config["env"](config=env_config) env.reset() num_episodes = 0 episode_rewards = [] episode_reward = 0.0 while num_episodes < args.random_test_episodes: action = env.action_space.sample() _, r, d, _ = env.step(action) episode_reward += r if d: num_episodes += 1 episode_rewards.append(episode_reward) episode_reward = 0.0 env.reset() print(f"Ran {args.random_test_episodes} episodes with a random agent " "reaching a mean episode return of " f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}.") if args.use_tune: stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } time_signature = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") name = f"SlateQ/{args.run}-seed{args.env_seed}-{time_signature}" if args.run == "SlateQ": config.update({ "slateq_strategy": args.slateq_strategy, }) results = tune.run( args.run, stop=stop, name=name, config=config, num_samples=args.tune_num_samples, verbose=2, ) if args.as_test: check_learning_achieved(results, args.stop_reward) else: # Directly run using the trainer interface (good for debugging). if args.run == "DQN": trainer = dqn.DQNTrainer(config=config) else: config.update({ "slateq_strategy": args.slateq_strategy, }) trainer = slateq.SlateQTrainer(config=config) for i in range(10): result = trainer.train() print(pretty_print(result)) ray.shutdown()
def test_amtft_ipd():
    from marltoolbox.examples.rllib_api.amtft_various_env import main

    # Restart Ray defensively in case the Ray connection is lost.
    ray.shutdown()
    tune_analysis_per_welfare, analysis_metrics_per_mode = main(
        debug=False, train_n_replicates=1, filter_utilitarian=False)
    for welfare_name, tune_analysis in tune_analysis_per_welfare.items():
        check_learning_achieved(
            tune_results=tune_analysis, reward=-204, min=True)
def test_ltft_ipd():
    from marltoolbox.examples.rllib_api.ltft_ipd import main

    # Restart Ray defensively in case the Ray connection is lost.
    ray.shutdown()
    tune_analysis_self_play, tune_analysis_naive_opponent = main(debug=False)
    check_learning_achieved(
        tune_results=tune_analysis_self_play, reward=-42, min=True)
    check_learning_achieved(
        tune_results=tune_analysis_naive_opponent, reward=-78, max=True)
def test_ppo_asym_coin_game():
    from marltoolbox.examples.rllib_api.ppo_asymmetric_coin_game import main

    # Restart Ray defensively in case the Ray connection is lost.
    ray.shutdown()
    tune_analysis = main(debug=False, stop_iters=70)
    check_learning_achieved(tune_results=tune_analysis, reward=20, min=True)
def test_pg_ipd():
    from marltoolbox.examples.rllib_api.pg_ipd import main

    # Restart Ray defensively in case the Ray connection is lost.
    ray.shutdown()
    tune_analysis = main(debug=False)
    check_learning_achieved(tune_results=tune_analysis, reward=-75, max=True)
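# Note that marltoolbox's `check_learning_achieved` uses a keyword
# signature (`tune_results=`, `reward=`, `min=`/`max=`) that differs from
# RLlib's utility of the same name. A minimal sketch of a compatible
# helper, inferred purely from the call sites above (the real marltoolbox
# implementation may differ):
def check_learning_achieved(tune_results, reward, min=False, max=False):
    for trial in tune_results.trials:
        achieved = trial.last_result["episode_reward_mean"]
        if min:
            # `reward` is a lower bound the agents must reach.
            assert achieved >= reward, f"{achieved} < {reward}"
        if max:
            # `reward` is an upper bound the agents must stay below.
            assert achieved <= reward, f"{achieved} > {reward}"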
def main(algorithm: str, stop_iters: int, stop_timesteps: int,
         stop_reward: float, as_test: bool, baselines_path: str,
         num_samples: int, eval_amount: float, seed: int) -> None:
    algorithm = algorithm.upper()

    ModelCatalog.register_custom_model("bn_model", TorchBatchNormModel)
    ModelCatalog.register_custom_model("skip_model", TorchSkipConnectionModel)
    ModelCatalog.register_custom_model("custom_weights",
                                       TorchCustomWeightsModel)

    ray.init(include_dashboard=False)

    if baselines_path:
        baselines_path = Path(baselines_path)
    baseline_datas: List[pd.DataFrame] = get_baselines(baselines_path)
    all_protos: Set[str] = set(
        itertools.chain.from_iterable(
            baseline_data.index for baseline_data in baseline_datas))

    eval_baselines: List[pd.DataFrame] = list()
    if eval_amount:
        baseline_datas, eval_baselines = train_test_split(
            baseline_datas, test_size=eval_amount)

    action_space: str = 'multi_discrete'
    timed_thresholds: Optional[Sequence[Tuple[int, float]]] = None
    config: Dict[str, Any] = dict()

    if algorithm == 'A2C':
        config.update(a2c.A2C_DEFAULT_CONFIG)
        config["rollout_fragment_length"] = 20
        config["use_gae"] = False
        config['vf_loss_coeff'] = .25
        config["lr"] = 0.01
        config['model']['fcnet_hiddens'] = [1024, 512, 256]
        config['min_iter_time_s'] = 20
        timed_thresholds = [(int(1e4), -4.5), (int(4e4), -2), (int(6e4), -1),
                            (int(8e4), 0), (int(1e5), .5), (int(1.2e5), .6),
                            (int(1.5e5), .7), (int(2.5e5), .9)]
    elif algorithm == 'APEX':
        config.update(apex_dqn.APEX_DEFAULT_CONFIG)
        action_space = 'discrete'
    elif algorithm == 'PPO':
        config.update(ppo.DEFAULT_CONFIG)
        config["lr"] = 1e-5
        config['entropy_coeff'] = 0.01
        config['clip_param'] = .3
        config['model']['fcnet_hiddens'] = [1024, 512, 256]
        config["rollout_fragment_length"] = 10
    elif algorithm == 'DQN':
        config.update(dqn.DEFAULT_CONFIG)
        config['hiddens'] = tune.grid_search([[256], [256, 256]])
        config['grad_clip'] = tune.grid_search([.5, 40])
        action_space = 'discrete'
    elif algorithm == 'RAINBOW':
        # Rainbow = DQN + noisy nets, distributional Q, n-step returns.
        # Start from the DQN defaults; without this, the later accesses to
        # config["lr"] and config["evaluation_config"] would raise KeyError.
        algorithm = 'DQN'
        config.update(dqn.DEFAULT_CONFIG)
        config['n_step'] = tune.grid_search([2, 5, 10])
        config['noisy'] = True
        config['num_atoms'] = tune.grid_search([2, 5, 10])
        config['v_min'] = -5.0
        config['v_max'] = 1.0
        # config["sigma0"] = tune.grid_search([])
        config['hiddens'] = tune.grid_search([[256], [256, 256]])
        config['grad_clip'] = tune.grid_search([.5, 40])
        action_space = 'discrete'

    baselines_mutators = [
        sizes_mult_mutator(8),
        # rand_by_x_percent_mutator(0.01),
        # switch_2_protocols_mutation,
        # shuffle_protocols_mutation,
    ]
    inplace_mutations = [True] + [False] * (len(baselines_mutators) - 1)
    env_params = dict(action_space=action_space,
                      std_coef=None,
                      use_random_io_mask=True,
                      baselines_mutators=baselines_mutators,
                      inplace_mutations=inplace_mutations,
                      all_protos=all_protos)

    config.update({
        "env": RayNonLearningNetworkIoEnv,
        "env_config": env_config(baseline_datas, **env_params),
        "framework": "torch",
        "num_gpus": 0,
        "num_envs_per_worker": 4,
        'num_workers': 2,
    })

    if eval_baselines:
        config["evaluation_config"]["env_config"] = env_config(
            eval_baselines, **env_params)
        config["evaluation_num_episodes"] = 100
        config["evaluation_interval"] = 5

    lrs = sorted(
        c * 10**-i for i, c in itertools.product(range(2, 6), [1, 5]))
    # config["lr"] = tune.grid_search(lrs)
    # config['lr_schedule'] = tune.grid_search(
    #     [double_middle_drop_lr_sched(lr, stop_timesteps) for lr in lrs] +
    #     [[(0, lr)] for lr in lrs])
    config['lr_schedule'] = double_middle_drop_lr_sched(
        config["lr"], stop_timesteps)
    # config['lr_schedule'] = tune.grid_search([None, config['lr_schedule']])

    rand_seeds = [
        565853, 2104103493, 1593411166, 2062870887, 606544299,
        1297392272, 894118955, 759209631, 1951613876, 571931913,
        1991302785, 316008064, 1894618127, 234605346, 1972456995,
        1899998980, 1288798130, 1915494248, 1112988205, 311173854,
        1631390566, 910695991, 1991670774, 1725533340, 1743250890,
        1466896085, 322769861, 1922245188, 962566318, 446335427,
        1071978696, 1202354470, 1770330545, 788227602, 104452329,
        1431251508, 1898474473, 2145189166, 1469515549, 1517824005,
        1642986198, 1516401273, 1853918493, 136233403, 1289467510,
        1089288981, 1736900138, 2081164597, 1342690882, 9569014
    ]
    # NOTE: The grid search over `rand_seeds` overrides the single `seed`
    # argument assigned on the previous line.
    config['seed'] = seed
    config['seed'] = tune.grid_search(rand_seeds)

    # NOTE: The custom `Stopper` below replaces this plain stop dict.
    stop = {
        "training_iteration": stop_iters,
        "timesteps_total": stop_timesteps,
        # "episode_reward_mean": stop_reward,
    }
    stop = Stopper(
        stop_timesteps=stop_timesteps,
        stop_iters=stop_iters,
        # stop_reward=stop_reward,
        max_episodes_without_improvement=15,
        timed_thresholds=timed_thresholds)

    results = tune.run(algorithm,
                       config=config,
                       stop=stop,
                       num_samples=num_samples,
                       reuse_actors=True)
    if as_test:
        check_learning_achieved(results, stop_reward)
    ray.shutdown()
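# `double_middle_drop_lr_sched` is not shown in this excerpt. A
# hypothetical sketch of what it could return: RLlib's `lr_schedule`
# expects a list of `(timestep, lr)` anchor points with piecewise-linear
# interpolation in between, so a "double middle drop" (as in OpenAI
# baselines' schedules) can be approximated with two extra anchors. The
# exact drop factors below are assumptions.
def double_middle_drop_lr_sched(lr, total_timesteps):
    return [
        (0, lr),
        (total_timesteps // 2, lr * 0.25),       # first drop at the middle
        (3 * total_timesteps // 4, lr * 0.125),  # second drop at 3/4
        (total_timesteps, lr * 0.125),
    ]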