def run_experiment(params, optimize_hyperparams):
    """Main experiment function."""
    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2)
    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2)
    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
def test_discount_optimization():
    seeding.set_global_seed(42)

    class ValueIterationAgentToOptimize(ValueIterationAgent):
        @classmethod
        def sample_parameters(cls, trial):
            """
            Sample hyperparameters for hyperparam optimization using
            Optuna (https://optuna.org/).
            """
            gamma = trial.suggest_categorical('gamma', [0.1, 0.99])
            return {'gamma': gamma}

    env = GridWorld(nrows=3,
                    ncols=10,
                    reward_at={(1, 1): 0.1, (2, 9): 1.0},
                    walls=((1, 4), (2, 4), (1, 5)),
                    success_probability=0.9)

    vi_params = {'gamma': 0.1, 'epsilon': 1e-3}
    vi_stats = AgentStats(ValueIterationAgentToOptimize,
                          env,
                          eval_horizon=20,
                          init_kwargs=vi_params,
                          n_fit=4,
                          n_jobs=1)

    vi_stats.optimize_hyperparams(n_trials=5,
                                  timeout=30,
                                  n_sim=5,
                                  n_fit=1,
                                  n_jobs=1,
                                  sampler_method='random',
                                  pruner_method='none')

    assert vi_stats.best_hyperparams['gamma'] == 0.99
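# Hedged sketch (not from the original source): sample_parameters receives an
# Optuna ``trial`` object, so any of Optuna's suggest_* methods can be combined,
# e.g. a categorical discount together with a log-uniform learning rate.
# ``MyAgent`` below is a hypothetical agent class used only to illustrate the
# signature expected by optimize_hyperparams.
#
# class MyAgentToOptimize(MyAgent):
#     @classmethod
#     def sample_parameters(cls, trial):
#         gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.99])
#         learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
#         return {'gamma': gamma, 'learning_rate': learning_rate}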
def run_experiment(params, optimize_hyperparams, rlberry_seed):
    """Main experiment function."""
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)

    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            # timeout after 10 seconds
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparams and save results
    for agent_stats in agent_stats_list:
        agent_stats.fit()
        agent_stats.save_results()

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
def test_hyperparam_optim_random():
    # Define train env
    train_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent = AgentStats(DummyAgent,
                             train_env,
                             init_kwargs=params,
                             n_fit=4,
                             eval_horizon=10,
                             n_jobs=1)

    # test hyperparameter optimization with random sampler
    stats_agent.optimize_hyperparams(sampler_method="random")
def test_hyperparam_optim_tpe():
    # Define train env
    train_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent = AgentStats(DummyAgent,
                             train_env,
                             init_kwargs=params,
                             n_fit=4,
                             eval_horizon=10,
                             n_jobs=1)

    # test hyperparameter optimization with TPE sampler
    # using hyperopt default values
    sampler_kwargs = TPESampler.hyperopt_parameters()
    stats_agent.optimize_hyperparams(sampler_kwargs=sampler_kwargs)
def test_agent_stats_2():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent1 = AgentStats(DummyAgent,
                              train_env,
                              eval_env=eval_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10,
                              n_jobs=1)
    stats_agent2 = AgentStats(DummyAgent,
                              train_env,
                              eval_env=eval_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10,
                              n_jobs=1)
    agent_stats_list = [stats_agent1, stats_agent2]

    # set some writers
    stats_agent1.set_writer(1, None)
    stats_agent1.set_writer(2, None)

    # compare final policies
    compare_policies(agent_stats_list, n_sim=10, show=False)
    compare_policies(agent_stats_list, n_sim=10, show=False,
                     stationary_policy=False)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # check if fitted
    for agent_stats in agent_stats_list:
        assert len(agent_stats.fitted_agents) == 4
        for agent in agent_stats.fitted_agents:
            assert agent.fitted

    # test saving/loading
    dirname = stats_agent1.output_dir
    fname = dirname / 'stats'
    stats_agent1.save()
    loaded_stats = AgentStats.load(fname)
    assert stats_agent1.identifier == loaded_stats.identifier

    # delete file
    os.remove(fname.with_suffix('.pickle'))
    dirname.rmdir()

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams()
def test_hyperparam_optim_grid():
    # Define train env
    train_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}

    # Run AgentStats
    stats_agent = AgentStats(DummyAgent,
                             train_env,
                             init_kwargs=params,
                             n_fit=4,
                             eval_horizon=10,
                             n_jobs=1)

    # test hyperparameter optimization with grid sampler
    search_space = {"hyperparameter1": [1, 2, 3],
                    "hyperparameter2": [-5, 0, 5]}
    sampler_kwargs = {"search_space": search_space}
    stats_agent.optimize_hyperparams(n_trials=3 * 3,
                                     sampler_method="grid",
                                     sampler_kwargs=sampler_kwargs)
def test_agent_stats_1():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Check DummyAgent
    agent = DummyAgent(train_env, **params)
    agent.fit()
    agent.policy(None)

    # Run AgentStats
    stats_agent1 = AgentStats(DummyAgent,
                              train_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10)
    stats_agent2 = AgentStats(DummyAgent,
                              train_env,
                              init_kwargs=params,
                              n_fit=4,
                              eval_horizon=10)
    agent_stats_list = [stats_agent1, stats_agent2]

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    compare_policies(agent_stats_list, eval_env, eval_horizon=horizon,
                     n_sim=10, show=False)
    compare_policies(agent_stats_list, eval_env, eval_horizon=horizon,
                     n_sim=10, show=False, stationary_policy=False)

    # check if fitted
    for agent_stats in agent_stats_list:
        assert len(agent_stats.fitted_agents) == 4
        for agent in agent_stats.fitted_agents:
            assert agent.fitted

    # test saving/loading
    stats_agent1.save('test_agent_stats_file.pickle')
    loaded_stats = AgentStats.load('test_agent_stats_file.pickle')
    assert stats_agent1.identifier == loaded_stats.identifier

    # delete file
    os.remove('test_agent_stats_file.pickle')

    # test hyperparameter optimization
    loaded_stats.optimize_hyperparams()
    loaded_stats.optimize_hyperparams(continue_previous=True)
def test_agent_stats_seeding():
    sd.set_global_seed(3456)
    for env in [MountainCar(),
                (gym_make, {'env_name': 'MountainCar-v0'})]:
        agent_stats = AgentStats(RSUCBVIAgent,
                                 env,
                                 init_kwargs={'n_episodes': 2,
                                              'horizon': 10},
                                 n_fit=6)
        agent_stats.fit()

        for ii in range(2, agent_stats.n_fit):
            traj1 = get_env_trajectory(agent_stats.fitted_agents[ii - 2].env,
                                       horizon=10)
            traj2 = get_env_trajectory(agent_stats.fitted_agents[ii - 1].env,
                                       horizon=10)
            traj3 = get_env_trajectory(agent_stats.fitted_agents[ii].env,
                                       horizon=10)
            assert not compare_trajectories(traj1, traj2)
            assert not compare_trajectories(traj1, traj3)
            assert not compare_trajectories(traj2, traj3)
def test_agent_stats_partial_fit():
    # Define train and evaluation envs
    train_env = GridWorld()
    eval_env = GridWorld()

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Check DummyAgent
    agent = DummyAgent(train_env, **params)
    agent.fit()
    agent.policy(None)

    # Run AgentStats
    stats = AgentStats(DummyAgent,
                       train_env,
                       init_kwargs=params,
                       n_fit=4,
                       eval_horizon=10)

    # Run partial fit
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 0.6
    for _ in range(2):
        stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 1.0

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats], eval_env, eval_horizon=horizon,
                     n_sim=10, show=False)
        break
env.close()

#
# Training several agents and comparing different hyperparams
#
from rlberry.stats import AgentStats, MultipleStats, agent_stats, compare_policies

stats = AgentStats(
    A2CAgent,
    env,
    eval_horizon=200,
    agent_name='A2C baseline',
    init_kwargs={'policy': 'MlpPolicy', 'verbose': 1},
    fit_kwargs={'total_timesteps': 1000},
    policy_kwargs={'deterministic': True},
    n_fit=4,
    n_jobs=4,
    joblib_backend='loky')
# we might need 'threading' here, since stable baselines creates processes
# 'multiprocessing' does not work, 'loky' seems good

stats_alternative = AgentStats(
    A2CAgent,
    env,
    eval_horizon=200,
    agent_name='A2C high learning rate',
    init_kwargs={
        'policy': 'MlpPolicy',
    'gamma': 0.99
}

params_greedy = {
    'n_episodes': 500,
    'feature_map_fn': feature_map_fn,
    'horizon': 10,
    'bonus_scale_factor': 0.0,
    'gamma': 0.99
}

params_oracle = {'horizon': 10, 'gamma': 0.99}

stats = AgentStats(LSVIUCBAgent,
                   env,
                   eval_horizon=10,
                   init_kwargs=params,
                   n_fit=4)
stats_random = AgentStats(LSVIUCBAgent,
                          env,
                          eval_horizon=10,
                          init_kwargs=params_greedy,
                          n_fit=1,
                          agent_name='LSVI-random-expl')
oracle_stats = AgentStats(ValueIterationAgent,
                          env,
                          eval_horizon=10,
                          init_kwargs=params_oracle,
                          n_fit=1)
"bonus_scale_factor": BONUS_SCALE_FACTOR, "min_dist": MIN_DIST, "bandwidth": 0.1, "beta": 1.0, "kernel_type": "gaussian", } params_ppo = {"n_episodes": N_EPISODES, "gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} # ----------------------------- # Run AgentStats # ----------------------------- rsucbvi_stats = AgentStats(RSUCBVIAgent, train_env, init_kwargs=params, n_fit=4) rskernel_stats = AgentStats(RSKernelUCBVIAgent, train_env, init_kwargs=params_kernel, n_fit=4) ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4) agent_stats_list = [rsucbvi_stats, rskernel_stats, ppo_stats] # learning curves plot_episode_rewards(agent_stats_list, cumulative=True, show=False) # compare final policies output = compare_policies(agent_stats_list, eval_env, eval_horizon=HORIZON, n_sim=10) print(output)
BONUS_SCALE_FACTOR = 0.1
MIN_DIST = 0.1

params_ppo = {
    "n_episodes": N_EPISODES,
    "gamma": GAMMA,
    "horizon": HORIZON,
    "learning_rate": 0.0003
}

# -------------------------------
# Run AgentStats and save results
# -------------------------------
ppo_stats = AgentStats(PPOAgent,
                       train_env,
                       eval_horizon=HORIZON,
                       init_kwargs=params_ppo,
                       n_fit=4)

# hyperparam optim
best_trial, data = ppo_stats.optimize_hyperparams(
    n_trials=10,
    timeout=None,
    n_sim=5,
    n_fit=2,
    n_jobs=2,
    sampler_method='optuna_default')

initial_n_trials = len(ppo_stats.study.trials)

# save
def test_agent_stats_partial_fit_and_tuple_env():
    # Define train and evaluation envs
    train_env = (GridWorld, None)  # tuple (constructor, kwargs) must also work in AgentStats

    # Parameters
    params = {"n_episodes": 500}
    horizon = 20

    # Run AgentStats
    stats = AgentStats(DummyAgent,
                       train_env,
                       init_kwargs=params,
                       n_fit=4,
                       eval_horizon=10)
    stats2 = AgentStats(DummyAgent,
                        train_env,
                        init_kwargs=params,
                        n_fit=4,
                        eval_horizon=10)

    # set some writers
    stats.set_writer(0, None)
    stats.set_writer(3, None)

    # Run partial fit
    stats.partial_fit(0.1)
    stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 0.6
    for _ in range(2):
        stats.partial_fit(0.5)
    for agent in stats.fitted_agents:
        assert agent.fraction_fitted == 1.0

    # Run fit
    stats2.fit()

    # learning curves
    plot_episode_rewards([stats], cumulative=True, show=False)

    # compare final policies
    compare_policies([stats], eval_horizon=horizon, n_sim=10, show=False)
params_ppo_bonus = {
    'n_episodes': N_EPISODES,
    'gamma': GAMMA,
    'horizon': HORIZON,
    'batch_size': 16,
    'entr_coef': 8e-7,
    'k_epochs': 10,
    'eps_clip': 0.2,
    'learning_rate': 0.03,
    'use_bonus': True,
    'uncertainty_estimator_kwargs': {
        'uncertainty_estimator_fn': uncertainty_estimator_fn
    }
}

# -----------------------------
# Run AgentStats
# -----------------------------
ppo_stats = AgentStats(PPOAgent,
                       env,
                       eval_env=eval_env,
                       init_kwargs=params_ppo,
                       n_fit=4,
                       agent_name='PPO')
ppo_bonus_stats = AgentStats(PPOAgent,
                             env,
                             eval_env=eval_env,
                             init_kwargs=params_ppo_bonus,
                             n_fit=4,
                             agent_name='PPO-Bonus')

agent_stats_list = [ppo_bonus_stats, ppo_stats]

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list, eval_horizon=HORIZON, n_sim=20)
print(output)
def env_constructor(n_envs=4):
    env = make_atari_env('MontezumaRevenge-v0', n_envs=n_envs)
    env = VecFrameStack(env, n_stack=4)
    return env


#
# Training several agents and comparing different hyperparams
#
stats = AgentStats(A2CAgent,
                   (env_constructor, None),
                   eval_horizon=200,
                   agent_name='A2C baseline',
                   init_kwargs={'policy': 'CnnPolicy', 'verbose': 10},
                   fit_kwargs={'total_timesteps': 1000},
                   policy_kwargs={'deterministic': True},
                   n_fit=4,
                   n_jobs=4,
                   joblib_backend='threading')

stats_alternative = AgentStats(A2CAgent,
                               (env_constructor, None),
                               eval_horizon=200,
                               agent_name='A2C high learning rate',
                               init_kwargs={
                                   'policy': 'CnnPolicy',
                                   'verbose': 10,
                                   'learning_rate': 0.01
                               },
                               fit_kwargs={'total_timesteps': 1000},
def load_experiment_results(output_dir, experiment_name):
    """
    Parameters
    ----------
    output_dir : str or Path, or list
        directory (or list of directories) where experiment results are stored
        (command line argument --output_dir when running the experiment)
    experiment_name : str or Path, or list
        name of yaml file describing the experiment.

    Returns
    -------
    output_data: dict
        dictionary such that

        output_data['experiment_dirs'] = list of paths to experiment directory (output_dir/experiment_name)
        output_data['agent_list'] = list containing the names of the agents in the experiment
        output_data['stats'][agent_name] = fitted AgentStats for agent_name
        output_data['dataframes'][agent_name] = dict of pandas data frames from the last run of the experiment
        output_data['data_dir'][agent_name] = directory from which the results were loaded
    """
    output_data = {}
    output_data['agent_list'] = []
    output_data['stats'] = {}
    output_data['dataframes'] = {}
    output_data['data_dir'] = {}

    # preprocess input
    if not isinstance(output_dir, list):
        output_dir = [output_dir]
    if not isinstance(experiment_name, list):
        experiment_name = [experiment_name]
    ndirs = len(output_dir)

    if ndirs > 1:
        assert len(experiment_name) == ndirs, \
            "Number of experiment names must match the number of output_dirs"
    else:
        output_dir = len(experiment_name) * output_dir

    results_dirs = []
    for dd, exper in zip(output_dir, experiment_name):
        results_dirs.append(Path(dd) / Path(exper).stem)
    output_data['experiment_dirs'] = results_dirs

    # Subdirectories with data for each agent
    subdirs = []
    for dd in results_dirs:
        subdirs.extend([f for f in dd.iterdir() if f.is_dir()])

    # Create dictionary dict[agent_name] = most recent result dir
    data_dirs = {}
    for dd in subdirs:
        data_dirs[dd.name] = _get_most_recent_path(
            [f for f in dd.iterdir() if f.is_dir()])

    # Load data from each subdir
    for agent_name in data_dirs:
        output_data['agent_list'].append(agent_name)

        # store data_dir
        output_data['data_dir'][agent_name] = data_dirs[agent_name]

        # store AgentStats
        output_data['stats'][agent_name] = None
        fname = data_dirs[agent_name] / 'stats.pickle'
        try:
            output_data['stats'][agent_name] = AgentStats.load(fname)
        except Exception:
            pass
        logger.info("... loaded " + str(fname))

        # store data frames
        dataframes = {}
        csv_files = [f for f in data_dirs[agent_name].iterdir()
                     if f.suffix == '.csv']
        for ff in csv_files:
            dataframes[ff.stem] = pd.read_csv(ff)
            logger.info("... loaded " + str(ff))
        output_data['dataframes'][agent_name] = dataframes

    return output_data
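# Hedged usage sketch (paths below are illustrative, not from the original source):
#
# data = load_experiment_results('results', 'experiment.yaml')
# for agent_name in data['agent_list']:
#     stats = data['stats'][agent_name]        # fitted AgentStats, or None if loading failed
#     dfs = data['dataframes'][agent_name]     # dict of DataFrames keyed by csv file stem
#     print(agent_name, data['data_dir'][agent_name], list(dfs.keys()))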
env = GridWorld(nrows=5, ncols=10)

params = {}
params['ucbvi'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'stage_dependent': True,
    'gamma': GAMMA,
    'real_time_dp': True,
    'bonus_scale_factor': 1.0,
}
params['optql'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'gamma': GAMMA,
    'bonus_scale_factor': 1.0,
}

mstats = MultipleStats()
mstats.append(AgentStats(UCBVIAgent, env, init_kwargs=params['ucbvi']))
mstats.append(AgentStats(OptQLAgent, env, init_kwargs=params['optql']))

mstats.run()

plot_episode_rewards(mstats.allstats, cumulative=True)
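# Hedged note (not from the original source): MultipleStats simply collects the
# appended AgentStats instances; run() fits each of them, and the fitted
# instances stay accessible through mstats.allstats, e.g.:
#
# for stats in mstats.allstats:
#     print(len(stats.fitted_agents), "instances fitted")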
N_EPISODES = 100
GAMMA = 0.99
HORIZON = 50
BONUS_SCALE_FACTOR = 0.1
MIN_DIST = 0.1

params_ppo = {"n_episodes": N_EPISODES,
              "gamma": GAMMA,
              "horizon": HORIZON,
              "learning_rate": 0.0003}

# -------------------------------
# Run AgentStats and save results
# -------------------------------
ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4)
ppo_stats.fit()  # fit the 4 agents
ppo_stats.save('ppo_stats')
del ppo_stats

# -------------------------------
# Load and plot results
# -------------------------------
ppo_stats = AgentStats.load('ppo_stats')
agent_stats_list = [ppo_stats]

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list, eval_env,
"horizon": HORIZON } params_ppo = { "n_episodes": N_EPISODES, "gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003 } # ----------------------------- # Run AgentStats # ----------------------------- oracle_stats = AgentStats(MBQVIAgent, d_train_env, init_kwargs=params_oracle, n_fit=4, agent_name="Oracle") ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4, agent_name="PPO") agent_stats_list = [oracle_stats, ppo_stats] # learning curves plot_episode_rewards(agent_stats_list, cumulative=True, show=False) # compare final policies output = compare_policies(agent_stats_list, eval_horizon=HORIZON, n_sim=10)
}

params['rsucbvi'] = {
    'n_episodes': N_EP,
    'horizon': HORIZON,
    'gamma': 1.0,
    'bonus_scale_factor': 1.0,
    'min_dist': 0.05,
    'max_repr': 800
}

mstats = MultipleStats()
mstats.append(
    AgentStats(AdaptiveQLAgent,
               env,
               init_kwargs=params['adaql'],
               n_fit=4,
               n_jobs=4))
mstats.append(
    AgentStats(RSUCBVIAgent, env, init_kwargs=params['rsucbvi'], n_fit=2))

mstats.run(save=False)

plot_episode_rewards(mstats.allstats, cumulative=True)

for stats in mstats.allstats:
    agent = stats.fitted_agents[0]
    try:
        agent.Qtree.plot(0, 25)
from rlberry.agents.ppo import PPOAgent
from rlberry.envs.benchmarks.ball_exploration import PBall2D
from rlberry.seeding import seeding
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies

seeding.set_global_seed(1223)

env = PBall2D()
n_episodes = 400
horizon = 100

ppo_params = {}
ppo_params['n_episodes'] = n_episodes
ppo_params['horizon'] = horizon
ppo_params['gamma'] = 0.99
ppo_params['learning_rate'] = 0.001
ppo_params['eps_clip'] = 0.2
ppo_params['k_epochs'] = 4

ppo_stats = AgentStats(PPOAgent,
                       env,
                       eval_horizon=horizon,
                       init_kwargs=ppo_params,
                       n_fit=2)

ppo_stats.partial_fit(0.3)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=False)

ppo_stats.partial_fit(0.2)
plot_episode_rewards([ppo_stats], show=False, cumulative=True)
compare_policies([ppo_stats], show=True)
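# Hedged note (not from the original source): successive partial_fit calls
# accumulate, so the two calls above train 0.3 + 0.2 = 50% of the n_episodes
# budget in total; this lets plotting and policy comparison be interleaved
# with training without refitting from scratch.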
# -----------------------------
N_EPISODES = 500
GAMMA = 0.99
HORIZON = 50

params_ppo = {
    "n_episodes": N_EPISODES,
    "gamma": GAMMA,
    "horizon": HORIZON,
    "learning_rate": 0.0003
}

# -----------------------------
# Run AgentStats
# -----------------------------
ppo_stats = AgentStats(PPOAgent, train_env, init_kwargs=params_ppo, n_fit=4)

ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={'comment': 'worker_0'})
ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={'comment': 'worker_1'})

agent_stats_list = [ppo_stats]

agent_stats_list[0].fit()
agent_stats_list[0].save()  # after fit, writers are set to None to avoid pickle problems.

# learning curves
plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

# compare final policies
output = compare_policies(agent_stats_list,
def parse_experiment_config(
    path: Path,
    n_fit: int = 4,
    n_jobs: int = 4,
    output_base_dir: str = 'results',
    joblib_backend: str = 'loky'
) -> Generator[Tuple[int, AgentStats], None, None]:
    """
    Read .yaml files, set the global seed and convert to AgentStats instances.

    Example of experiment config:

    ```experiment.yaml
    description: 'My cool experiment'
    seed: 42
    n_episodes: 1000
    horizon: 50
    train_env: 'env_train.yaml'   # see read_env_config()
    eval_env: 'env_eval.yaml'
    agents:
    - 'agent1.yaml'               # see read_agent_config()
    - 'agent2.yaml'
    ```

    Parameters
    ----------
    path : Path
        Path to an experiment config
    n_fit : int
        Number of instances of each agent to fit
    n_jobs : int
        Number of parallel jobs
    output_base_dir : str
        Directory where to save AgentStats results.
    joblib_backend : str
        Backend used by joblib to run agent instances in parallel.

    Yields
    ------
    seed: int
        global seed
    agent_stats: AgentStats
        the AgentStats to fit
    """
    with path.open() as file:
        config = yaml.safe_load(file)

    train_env = read_env_config(config["train_env"])
    eval_env = read_env_config(config["eval_env"])

    for agent_path in config["agents"]:
        # set seed before creating AgentStats
        seed = config["seed"]
        set_global_seed(seed)

        agent_name = Path(agent_path).stem
        agent_class, agent_config = read_agent_config(agent_path)

        # Process output dir, avoid erasing previous results
        output_dir = Path(output_base_dir) / path.stem / agent_name
        last = 0

        try:
            subdirs = [f for f in output_dir.iterdir() if f.is_dir()]
        except FileNotFoundError:
            subdirs = []
        for dd in subdirs:
            try:
                idx = int(dd.stem)
            except ValueError:
                continue
            if idx > last:
                last = idx

        # kwargs
        init_kwargs = agent_config['init_kwargs']
        fit_kwargs = agent_config['fit_kwargs']
        policy_kwargs = agent_config['policy_kwargs']

        # check if there are global kwargs
        # (each global dict updates its corresponding per-agent dict)
        if 'global_init_kwargs' in config:
            init_kwargs.update(config['global_init_kwargs'])
        if 'global_fit_kwargs' in config:
            fit_kwargs.update(config['global_fit_kwargs'])
        if 'global_policy_kwargs' in config:
            policy_kwargs.update(config['global_policy_kwargs'])

        # check eval_horizon
        if 'eval_horizon' in config:
            eval_horizon = config['eval_horizon']
        else:
            eval_horizon = None

        # append run index to dir
        output_dir = output_dir / str(last + 1)

        yield seed, AgentStats(agent_class=agent_class,
                               init_kwargs=init_kwargs,
                               fit_kwargs=fit_kwargs,
                               policy_kwargs=policy_kwargs,
                               agent_name=agent_name,
                               train_env=train_env,
                               eval_env=eval_env,
                               eval_horizon=eval_horizon,
                               n_fit=n_fit,
                               n_jobs=n_jobs,
                               output_dir=output_dir,
                               joblib_backend=joblib_backend)
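# Hedged usage sketch (file name is illustrative, not from the original source):
#
# from pathlib import Path
# for seed, agent_stats in parse_experiment_config(Path('experiment.yaml'),
#                                                  n_fit=4, n_jobs=4):
#     agent_stats.fit()
#     agent_stats.save_results()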