def main():
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    if not USE_LOADED_MODEL:
        model = PPO2(MlpPolicy, env, verbose=1)

        # before training
        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=50)
        print("Mean reward: {0}, Std reward: {1}".format(mean_reward, std_reward))

        model.learn(total_timesteps=5000)

        # save model
        model.save("cartpole_v1_ppo2")

    loaded_model = PPO2.load("cartpole_v1_ppo2")
    loaded_model.set_env(env)

    # after training
    mean_reward, std_reward = evaluate_policy(loaded_model, env, n_eval_episodes=50)
    print("Mean reward: {0} +/- {1}".format(mean_reward, std_reward))
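# A minimal sketch of the module-level setup the snippet above assumes.
# The imports, the USE_LOADED_MODEL flag, and the env are not shown in the
# original; 'CartPole-v1' is inferred from the save name and is an assumption.
import gym
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy

USE_LOADED_MODEL = False
env = DummyVecEnv([lambda: gym.make('CartPole-v1')])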
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
                        action_noise=action_noise, buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
def train():
    def callback(_locals, _globals):
        # Save model
        _locals['self'].save(MODEL_NAME)

    envs = [create_env_headless for _ in range(ENV_COUNT)]
    vec_envs = SubprocVecEnv(envs)
    model = PPO2('CnnPolicy', vec_envs, verbose=1, ent_coef=0.0001, n_steps=256)

    if not os.path.isfile(MODEL_NAME):
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Run again to train")
    else:
        model.learn(total_timesteps=TIMESTEPS, callback=callback)
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Training Done")

        # Evaluation
        print("Evaluation")
        vec_env = create_env_headless()
        vec_env = DummyVecEnv([lambda: vec_env])
        model = PPO2.load(MODEL_NAME)
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        vec_env.close()
def test_evaluate_policy():
    model = A2C('MlpPolicy', 'Pendulum-v0', seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_['model'].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(model, model.get_env(), n_eval_episodes,
                                         deterministic=True, render=False,
                                         callback=dummy_callback,
                                         reward_threshold=None,
                                         return_episode_rewards=True)

    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model, model.get_env(), n_eval_episodes, reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model, model.get_env(), n_eval_episodes,
                                         return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_steps = {SAC: 700, TD3: 500, DDPG: 2000}[model_class]

    # buffer_size should be an int (1e5 is a float literal)
    kwargs = dict(seed=0, gamma=0.95, buffer_size=int(1e5))
    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.05 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise

    if model_class == DDPG:
        kwargs["actor_lr"] = 1e-3
        kwargs["batch_size"] = 100

    model = model_class("MlpPolicy", env, **kwargs)
    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None

    dataset = ExpertDataset(traj_data=traj_data, expert_path=expert_path,
                            traj_limitation=10, sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92,
                 max_kl=0.001, expert_dataset=dataset,
                 hidden_size_adversary=64, verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action)

    :param model_name: (str) Name of the RL model
    """
    env = DummyVecEnv([lambda: IdentityEnv(18, 18, 60)])
    model = LEARN_FUNC_DICT[model_name](env)
    print('Training finished')

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=None)
    obs = env.reset()

    assert model.action_probability(obs).shape == (1, 18), \
        "Error: action_probability not returning correct shape"
    action = env.action_space.sample()
    action_prob = model.action_probability(obs, actions=action)
    assert np.prod(action_prob.shape) == 1, "Error: not scalar probability"
    action_logprob = model.action_probability(obs, actions=action, logp=True)
    assert np.allclose(action_prob, np.exp(action_logprob)), (action_prob, action_logprob)

    # Free memory
    del model, env
def eval_model(model, test_env_id):
    global eval_step, THRESHOLD
    test_success, curriculum_success = True, True
    performance_data[eval_step] = {}

    for env_id in env_ids:
        write_out("[MODEL EVAL]\tTesting learner on env: {}".format(env_id))
        env, eval_env, eval_callback = init_env(env_id)

        fresh_model = A2C(CnnPolicy, env, verbose=verbose)
        fresh_model.learn(total_timesteps=max_steps, callback=eval_callback)

        fresh_mean, fresh_std = evaluate_policy(fresh_model, eval_env, n_eval_episodes=100)
        model_mean, model_std = evaluate_policy(model, eval_env, n_eval_episodes=100)

        performance_data[eval_step][env_id] = {
            'baseline_mean': fresh_mean,
            'baseline_std': fresh_std,
            'model_mean': model_mean,
            'model_std': model_std,
            'baseline_training_steps': max_steps,
            'eval_episodes': 100
        }

        write_out("[MODEL EVAL: LEARNER] \t env_id: {}, Mean Reward: {}, std_dev: {}"
                  .format(env_id, model_mean, model_std))
        write_out("[MODEL EVAL: BASELINE]\t env_id: {}, Mean Reward: {}, std_dev: {}"
                  .format(env_id, fresh_mean, fresh_std))

        pass_test = round(model_mean - model_std, 3) >= round(fresh_mean - fresh_std, 3)
        diff = abs(round(model_mean - model_std, 3) - round(fresh_mean - fresh_std, 3))
        if pass_test:
            write_out("[TEST RESULT]\tmodel out-performs fresh model for env: {}, diff: {}"
                      .format(env_id, diff))
        else:
            write_out("[TEST RESULT]\tmodel DID NOT out-perform fresh model for env: {}, diff: {}"
                      .format(env_id, diff))
            if env_id == test_env_id:
                test_success = False

    curriculum_success = sum([
        performance_data[eval_step][env_id]['baseline_mean'] > THRESHOLD
        for env_id in env_ids
    ]) == len(env_ids)

    eval_step += 1
    return test_success, curriculum_success
def evaluate():
    vec_env = create_env_headless()
    vec_env = DummyVecEnv([lambda: vec_env])
    model = PPO2.load(MODEL_NAME)

    print("After Training evaluation")
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    vec_env.close()
def check_shape(make_env, model_class, shape_1, shape_2):
    model = model_class(policy="MlpPolicy", env=DummyVecEnv([make_env]))

    env0 = make_env()
    env1 = DummyVecEnv([make_env])

    for env, expected_shape in [(env0, shape_1), (env1, shape_2)]:
        def callback(locals_, _globals):
            assert np.array(locals_['action']).shape == expected_shape

        evaluate_policy(model, env, n_eval_episodes=5, callback=callback)
def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL Training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time
        model.save("cartpole_v1_acktr")

        # Single Process RL Training
        # (the timing comparison only makes sense when training from scratch,
        # so it stays inside this branch; total_time_multi would otherwise
        # be undefined when loading a saved model)
        single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
        start_time = time.time()
        single_process_model.learn(n_timesteps)
        total_time_single = time.time() - start_time

        print("Single-process: {0}s, Multi-process: {1}s".format(
            total_time_single, total_time_multi))

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # create separate clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model, eval_env, n_eval_episodes=10)
    print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')
def LunarLander_v2_DQN():
    # TODO: this snippet raised an error (original note was in Chinese)
    # Create environment
    env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
    # Train the agent
    model.learn(total_timesteps=100000)
    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading

    # Load the trained agent
    model = DQN.load("dqn_lunar")

    # Evaluate the agent
    # Note: after load() without an env, model.get_env() returns None, which
    # makes evaluate_policy fail -- evaluate on the env created above instead
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
    print(mean_reward, std_reward)

    # Enjoy trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def train():
    env = LunarLander()
    # env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = PPO2(
        'MlpPolicy',
        env,
        learning_rate=0.001,
        # prioritized_replay=True,
        verbose=1)

    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

    # Train the agent
    timesteps = os.environ.get('TIMESTEPS')
    timesteps = int(float(timesteps)) if timesteps is not None else 1e6
    print('timesteps %s' % timesteps)
    model.learn(total_timesteps=int(timesteps), log_interval=10)

    # Save the agent
    model.save("trained_models/latest")
    now = datetime.now()
    dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
    # model.save("trained_models/lunar_climber-%s" % dt_string)
    model.save("/opt/ml/model/lunar_climber-%s" % dt_string)

    # Plot training progress
    # plt.plot(env.all_rewards)
    # plt.ylabel('Reward')
    # plt.xlabel('Timesteps')
    # plt.savefig('figures/stats-%s.png' % dt_string)

    print("Model trained!")
def run_train(self, load_rand_state: bool = False) -> TrainResult:
    if load_rand_state:
        load_numpy_rand_state(self.rand_state_path)

    self.print("=" * 60)
    self.print(f"MODEL NAME:\t{self.model_name}")
    self.print("=" * 60)

    callbacks: List[BaseCallback] = self.create_callbacks(self.eval_env)
    self.model.learn(self.total_train_steps, tb_log_name=self.model_name,
                     callback=callbacks)

    if self.use_eval_callback:
        self.model = type(self.model).load(
            os.path.join('non_learning_io_logs', self.model_name, "best_model.zip"))

    eval_pbar = tqdm(desc="Evaluation Episodes Progress",
                     total=self.eval_episodes, file=sys.stdout)
    eval_callback = EvalPbarCallback(eval_pbar, self.verbose)

    mean_reward: float
    std_reward: float
    mean_reward, std_reward = evaluate_policy(
        self.model,
        self.eval_env,
        self.eval_episodes,
        callback=eval_callback,
    )
    eval_pbar.close()

    result = TrainResult(mean_reward, std_reward,
                         eval_callback.num_successes, self.eval_episodes)
    self.print(result.res_msg())
    return result
def _eval_policy(self, eval_freq, eval_env, n_eval_episodes,
                 timesteps_since_eval, deterministic=True):
    """
    Evaluate the current policy on a test environment.

    :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
    :param eval_freq: (int) Evaluate the agent every `eval_freq` timesteps
        (this may vary a little)
    :param n_eval_episodes: (int) Number of episodes to evaluate the agent
    :param timesteps_since_eval: (int) Number of timesteps since last evaluation
    :param deterministic: (bool) Whether to use deterministic or stochastic actions
    :return: (int) Number of timesteps since last evaluation
    """
    if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
        timesteps_since_eval %= eval_freq
        # Synchronise the normalization stats if needed
        sync_envs_normalization(self.env, eval_env)
        mean_reward, std_reward = evaluate_policy(
            self, eval_env, n_eval_episodes, deterministic=deterministic)
        if self.verbose > 0:
            print("Eval num_timesteps={}, "
                  "episode_reward={:.2f} +/- {:.2f}".format(
                      self.num_timesteps, mean_reward, std_reward))
            print("FPS: {:.2f}".format(
                self.num_timesteps / (time.time() - self.start_time)))
    return timesteps_since_eval
def _on_step(self, plot=True) -> bool:
    """Evaluate the current policy for self.eval_episodes, then take a render
    and report all stats to W&B

    Args:
        plot: Enable matplotlib plotting behavior. Should be set to True
            unless testing. Defaults to True.

    Returns:
        True, as per API requirements
    """
    mean_rewards, std_rewards = evaluate_policy(
        self.model, self.env, n_eval_episodes=self.eval_episodes)

    images = []
    rewards = []
    actions = []
    obses = []
    step_cnt = 0
    done, state = False, None
    obs = self.env.reset()
    while not done:
        if step_cnt % self.render_freq == 0:
            images.append(self.env.render(mode='rgb_array'))
        action, state = self.model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _ = self.env.step(action)
        rewards.append(reward)
        actions.append(action)
        obses.append(obs)
        step_cnt += 1

    render = np.array(images)
    render = np.transpose(render, (0, 3, 1, 2))
    actions = np.array(actions).flatten()
    observes = np.array(obses).flatten()
    rewards = np.array(rewards)

    if plot:
        plt.clf()
        plt.plot(np.arange(len(rewards)), rewards)
        plt.xlabel('timesteps')
        plt.ylabel('rewards')
        plt.title('Timestep {}'.format(self.num_timesteps))

    wandb.log({
        'test_reward_mean': mean_rewards,
        'test_reward_std': std_rewards,
        'render': wandb.Video(render, format='gif', fps=self.fps),
        'global_step': self.num_timesteps,
        'evaluations': self.n_calls,
        'reward_distribution': wandb.Histogram(rewards),
        'action_distribution': wandb.Histogram(actions),
        'observation_distribution': wandb.Histogram(observes),
        'reward_vs_time': plot and wandb.Image(plt),
    }, step=self.num_timesteps)

    return True
def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1, seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{subenv_dict[base_index]}/model.zip")
    base_parameter_dict = base_agent.get_parameters()

    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())

    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()
    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)
    aligned_agent.load_parameters(base_parameter_dict)

    avg_reward, reward_std = evaluate_policy(aligned_agent, base_env, n_eval_episodes=100)
    print(f"base {base_index}, weight {w} done")
    return (avg_reward, reward_std)
def __call__(self, locals_, globals_):
    """
    :param locals_: (dict)
    :param globals_: (dict)
    :return: (bool)
    """
    self.n_calls += 1
    self.model = locals_['self']
    self.num_timesteps = self.model.num_timesteps

    if self.n_calls % self.eval_freq == 0:
        episode_rewards, _ = evaluate_policy(
            self.model, self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            deterministic=self.deterministic,
            return_episode_rewards=True)
        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)

        if self.verbose > 0:
            print("Eval num_timesteps={}, "
                  "episode_reward={:.2f} +/- {:.2f}".format(
                      self.num_timesteps, mean_reward, std_reward))

        if mean_reward > self.best_mean_reward:
            if self.best_model_save_path is not None:
                print("Saving best model")
                self.model.save(self.best_model_save_path)
            self.best_mean_reward = mean_reward
    return True
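# Hypothetical usage sketch for the callable evaluation callback above.
# The class name ("EvalCallback"), its constructor signature, and the
# attribute names are assumptions -- the original only shows __call__;
# only model.learn(callback=...) is the real stable-baselines 2 API.
import gym
from stable_baselines import PPO2

eval_env = gym.make('CartPole-v1')
eval_callback = EvalCallback(eval_env=eval_env, eval_freq=1000,
                             n_eval_episodes=5, deterministic=True,
                             best_model_save_path='./best_model', verbose=1)
model = PPO2('MlpPolicy', 'CartPole-v1')
# Functional callbacks in stable-baselines 2 are called as callback(locals_, globals_)
model.learn(total_timesteps=10000, callback=eval_callback)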
def test_cnn_lstm_policy(request, policy):
    model_fname = './test_model_{}.zip'.format(request.node.name)

    try:
        env = make_env(0)
        model = PPO2(policy, env, nminibatches=1)
        model.learn(total_timesteps=15)
        env = model.get_env()
        evaluate_policy(model, env, n_eval_episodes=5)
        # saving
        model.save(model_fname)
        del model, env
        # loading
        _ = PPO2.load(model_fname, policy=policy)
    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
def retrain(mean_reward, target_reward, count, env, model):
    if mean_reward < target_reward:
        count = count + 1
        model.learn(50)
        # With return_episode_rewards=False, evaluate_policy returns
        # (mean_reward, std_reward), not a step count
        mean_reward, std_reward = evaluate_policy(model,
                                                  env,
                                                  n_eval_episodes=1,
                                                  deterministic=False,
                                                  render=False,
                                                  callback=None,
                                                  reward_threshold=None,
                                                  return_episode_rewards=False)
        if mean_reward < target_reward:
            retrain(mean_reward, target_reward, count, env, model)
    return True
def run(self, n_eval=5, verbose=True):
    # TODO add parameter to build ensemble
    for algo in self.algos_list:
        for env_name, env in self.envs.items():
            mean_reward, std_reward = evaluate_policy(self.models[env_name][algo],
                                                      env=env,
                                                      n_eval_episodes=n_eval)
            # rewards[env_name][algo] = (mean_reward, std_reward)
            self.rewards[env_name][algo] = (mean_reward, std_reward)
            if verbose:
                print("============ Finished evaluation of " + algo +
                      " on " + env_name + " ============")
                print(f"mean_reward={mean_reward}\n\n")
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000)

    evaluate_policy(model, env, n_eval_episodes=5)

    obs = env.reset()
    assert model.action_probability(obs).shape == (1, 10), \
        "Error: action_probability not returning correct shape"
    assert np.prod(model.action_probability(obs, actions=env.action_space.sample()).shape) == 1, \
        "Error: not scalar probability"
def _test(self, model):
    env = gym.make("gym_game2048:game2048-v0", **self._env_kwargs)
    self._env = DummyVecEnv([lambda: env])
    mean_reward, _ = evaluate_policy(model, self._env, self._eval_episodes,
                                     deterministic=True)
    return mean_reward
def _eval_env(
        env_name: str,
        total_timesteps: int,
) -> Tuple[float, float]:  # pragma: no cover
    """Train PPO2 for `total_timesteps` on `env_name` and evaluate returns."""
    env = gym.make(env_name)
    model = PPO2(MlpPolicy, env)
    model.learn(total_timesteps=total_timesteps)
    res = evaluate_policy(model, env)
    assert isinstance(res[0], float)
    return res
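# Illustrative call to _eval_env above; the env name and the step count are
# assumptions for demonstration, not values from the original source:
mean_reward, std_reward = _eval_env("CartPole-v1", total_timesteps=10000)
print("Mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))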
def evaluate_agent(agent_factory: BaseAgentFactory, n_eval_episodes=10):
    tugger_env = TuggerEnv()
    tugger_env.reset()
    agent = agent_factory.create_agent(tugger_env)
    # return_episode_rewards=True is needed here: without it evaluate_policy
    # returns (mean_reward, std_reward), not episode step counts
    episode_rewards, episode_lengths = evaluate_policy(
        agent, tugger_env, n_eval_episodes=n_eval_episodes,
        return_episode_rewards=True)
    mean_reward = sum(episode_rewards) / n_eval_episodes
    total_steps = sum(episode_lengths)
    print("\nevaluation finished.")
    print("mean reward: %s" % (mean_reward))
    print("average number of gym-steps/episode: %s" % (total_steps / n_eval_episodes))
def run_illegal_move_training(exp_name, exp_path, basicdate,
                              model_type='PPO2',
                              n_eval_episodes=10,
                              training_intervals=100,
                              max_steps=10000,
                              reward_margin=10,
                              log_to_tb=False,
                              pelican_agent_filepath=False):
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'Illegal_move_prevention_training'
    else:
        writer = None
        tb_log_name = None

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        # env = plark_env_illegal_move.PlarkEnvIllegalMove(
        #     config_file_path='/Components/plark-game/plark_game/game_config/10x10/balanced.json')
        env = gym.make('plark-env-illegal-move-v0')

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the env and model
        env = gym.make('plark-env-illegal-move-v0')
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps,
                model_type, basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    # With return_episode_rewards=False, evaluate_policy returns
    # (mean_reward, std_reward) -- the second value is not a step count
    mean_reward, std_reward = evaluate_policy(model, env,
                                              n_eval_episodes=n_eval_episodes,
                                              deterministic=False,
                                              render=False,
                                              callback=None,
                                              reward_threshold=None,
                                              return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Std of Reward is ' + str(std_reward))
def run_sonobuoy_training(exp_name, exp_path, basicdate,
                          model_type='PPO2',
                          n_eval_episodes=10,
                          training_intervals=100,
                          max_steps=10000,
                          reward_margin=10,
                          log_to_tb=False,
                          pelican_agent_filepath=False):
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'sonobuoy_training'
    else:
        writer = None
        tb_log_name = None

    env = gym.make('plark-env-v0',
                   panther_agent_filepath='/data/agents/models/PPO2_20200429_073132_panther/')

    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)
    else:
        # Instantiate the env and model
        model = PPO2('CnnPolicy', env)

    # Start training
    train_agent(exp_path, model, env, training_intervals, max_steps,
                model_type, basicdate, writer, tb_log_name, reward_margin)

    # Evaluate
    # As above, the second return value is the reward std, not a step count
    mean_reward, std_reward = evaluate_policy(model, env,
                                              n_eval_episodes=n_eval_episodes,
                                              deterministic=False,
                                              render=False,
                                              callback=None,
                                              reward_threshold=None,
                                              return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Std of Reward is ' + str(std_reward))
def _search_hparams(env_id, agent, total_steps, trial):
    if agent == 'ppo2':
        model_params = ppo2_params(trial)
    elif agent == 'a2c':
        model_params = a2c_params(trial)

    envs, model = _train(env_id, agent, model_params, total_steps, True)
    mean_reward, _ = evaluate_policy(model, envs, n_eval_episodes=10)
    envs.close()
    # Negate the reward because Optuna minimizes the loss.
    return -mean_reward
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        print("----EVALUATION-----")
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards, episode_lengths, episode_success = evaluate_policy(
            self.model, self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True)

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)
            np.savez(self.log_path,
                     timesteps=self.evaluations_timesteps,
                     results=self.evaluations_results,
                     ep_lengths=self.evaluations_length)

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.success_rate = np.mean(episode_success)
        # Keep track of the last evaluation, useful for classes that derive from this callback
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print("Eval num_timesteps={}, "
                  "episode_reward={:.2f} +/- {:.2f}".format(
                      self.num_timesteps, mean_reward, std_reward))
            print("Episode length: {:.2f} +/- {:.2f}".format(
                mean_ep_length, std_ep_length))
            print("Success Rate: {}".format(self.success_rate))

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, 'best_model'))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True
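# The three-value unpacking in the callback above implies a locally modified
# evaluate_policy: the stock stable-baselines helper returns only rewards and
# lengths. A minimal sketch of such a variant for a plain (non-vectorized)
# gym.Env, assuming the env reports an 'is_success' key in its final info dict
# (both the function name and the info key are assumptions):
def evaluate_policy_with_success(model, env, n_eval_episodes=10,
                                 deterministic=True, render=False):
    episode_rewards, episode_lengths, episode_success = [], [], []
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done, state = False, None
        total_reward, length, info = 0.0, 0, {}
        while not done:
            action, state = model.predict(obs, state=state,
                                          deterministic=deterministic)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            length += 1
            if render:
                env.render()
        episode_rewards.append(total_reward)
        episode_lengths.append(length)
        # Success flag from the last info dict of the episode
        episode_success.append(float(info.get('is_success', 0.0)))
    return episode_rewards, episode_lengths, episode_success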
def main():
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    model = PPO2(MlpPolicy, env, verbose=1)

    # before training
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
    print("Mean reward: {0} +/- {1}".format(mean_reward, std_reward))

    model.learn(total_timesteps=5000)

    # after training
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
    print("Mean reward: {0} +/- {1}".format(mean_reward, std_reward))

    # Visualise trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()