def main():
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor

    if not USE_LOADED_MODEL:
        model = PPO2(MlpPolicy, env, verbose=1)

        # before training
        mean_reward, std_reward = evaluate_policy(model,
                                                  env,
                                                  n_eval_episodes=50)
        print("Mean reward: {0}, Std reward: {1}".format(
            mean_reward, std_reward))

        model.learn(total_timesteps=5000)

        # save model
        model.save("cartpole_v1_ppo2")

    loaded_model = PPO2.load("cartpole_v1_ppo2")
    loaded_model.set_env(env)

    # after training
    mean_reward, std_reward = evaluate_policy(loaded_model,
                                              env,
                                              n_eval_episodes=50)
    print("Mean reward: {0} +/- {1}".format(mean_reward, std_reward))
Example #2
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy",
                        env,
                        gamma=0.1,
                        seed=0,
                        action_noise=action_noise,
                        buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
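
IdentityEnvBox used above rewards the agent for echoing its observation back as the action, within an eps tolerance. A rough, self-contained sketch of such an environment (illustrative only; this is not the actual stable-baselines test helper):

import numpy as np
import gym
from gym import spaces

class ToyIdentityBoxEnv(gym.Env):
    """Toy continuous identity task: reward 1 when the action lands within
    eps of the previous observation, 0 otherwise (illustrative only)."""
    def __init__(self, eps=0.5, ep_length=100):
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.action_space = self.observation_space
        self.eps = eps
        self.ep_length = ep_length
        self.current_step = 0
        self.state = None

    def reset(self):
        self.current_step = 0
        self.state = self.observation_space.sample()
        return self.state

    def step(self, action):
        # Reward the agent for reproducing the observation it just saw
        reward = 1.0 if np.linalg.norm(self.state - action) < self.eps else 0.0
        self.state = self.observation_space.sample()
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.state, reward, done, {}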
Example #3
def train():
    def callback(_locals, _globals):
        # Save model
        _locals['self'].save(MODEL_NAME)

    envs = [create_env_headless for _ in range(ENV_COUNT)]
    vec_envs = SubprocVecEnv(envs)
    model = PPO2('CnnPolicy',
                 vec_envs,
                 verbose=1,
                 ent_coef=0.0001,
                 n_steps=256)

    if not os.path.isfile(MODEL_NAME):
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Run again to train")
    else:
        model.learn(total_timesteps=TIMESTEPS, callback=callback)
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Training Done")

        # Evaluation
        print("Evaluation")
        vec_env = create_env_headless()
        vec_env = DummyVecEnv([lambda: vec_env])
        model = PPO2.load(MODEL_NAME)
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        vec_env.close()
Example #4
def test_evaluate_policy():
    model = A2C('MlpPolicy', 'Pendulum-v0', seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_['model'].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(model,
                                         model.get_env(),
                                         n_eval_episodes,
                                         deterministic=True,
                                         render=False,
                                         callback=dummy_callback,
                                         reward_threshold=None,
                                         return_episode_rewards=True)

    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model,
                        model.get_env(),
                        n_eval_episodes,
                        reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model,
                                         model.get_env(),
                                         n_eval_episodes,
                                         return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
Example #5
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_steps = {SAC: 700, TD3: 500, DDPG: 2000}[model_class]

    kwargs = dict(seed=0, gamma=0.95, buffer_size=1e5)
    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.05 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise

    if model_class == DDPG:
        kwargs["actor_lr"] = 1e-3
        kwargs["batch_size"] = 100

    model = model_class("MlpPolicy", env, **kwargs)
    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
Example #6
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data,
                            expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
Example #7
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    :param model_name: (str) Name of the RL model
    """

    env = DummyVecEnv([lambda: IdentityEnv(18, 18, 60)])

    model = LEARN_FUNC_DICT[model_name](env)
    print('Training finished')
    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=None)
    obs = env.reset()
    assert model.action_probability(obs).shape == (
        1,
        18,
    ), "Error: action_probability not returning correct shape"
    action = env.action_space.sample()
    action_prob = model.action_probability(obs, actions=action)
    assert np.prod(action_prob.shape) == 1, "Error: not scalar probability"
    action_logprob = model.action_probability(obs, actions=action, logp=True)
    assert np.allclose(action_prob, np.exp(action_logprob)), (
        action_prob,
        action_logprob,
    )

    # Free memory
    del model, env
Example #8
    def eval_model(model, test_env_id):
        global eval_step, THRESHOLD
        test_success, curriculum_success = True, True
        performance_data[eval_step] = {}
        for env_id in env_ids:
            write_out(
                "[MODEL EVAL]\tTesting learner on env: {}".format(env_id))
            env, eval_env, eval_callback = init_env(env_id)

            fresh_model = A2C(CnnPolicy, env, verbose=verbose)
            fresh_model.learn(total_timesteps=max_steps,
                              callback=eval_callback)

            fresh_mean, fresh_std = evaluate_policy(fresh_model,
                                                    eval_env,
                                                    n_eval_episodes=100)
            model_mean, model_std = evaluate_policy(model,
                                                    eval_env,
                                                    n_eval_episodes=100)
            performance_data[eval_step][env_id] = {
                'baseline_mean': fresh_mean,
                'baseline_std': fresh_std,
                'model_mean': model_mean,
                'model_std': model_std,
                'baseline_training_steps': max_steps,
                'eval_episodes': 100
            }
            write_out(
                "[MODEL EVAL: LEARNER] \t env_id: {}, Mean Reward: {}, std_dev: {}"
                .format(env_id, model_mean, model_std))
            write_out(
                "[MODEL EVAL: BASELINE]\t env_id: {}, Mean Reward: {}, std_dev: {}"
                .format(env_id, fresh_mean, fresh_std))

            pass_test = round(model_mean - model_std, 3) >= round(
                fresh_mean - fresh_std, 3)
            diff = abs(
                round(model_mean - model_std, 3) -
                round(fresh_mean - fresh_std, 3))
            if pass_test:
                write_out(
                    "[TEST RESULT]\tmodel out-performs fresh model for env: {}, diff: {}"
                    .format(env_id, diff))
            else:
                write_out(
                    "[TEST RESULT]\tmodel DID NOT out-perform fresh model for env: {}, diff: {}"
                    .format(env_id, diff))
                if env_id == test_env_id:
                    test_success = False

        curriculum_success = sum([
            performance_data[eval_step][env_id]['baseline_mean'] > THRESHOLD
            for env_id in env_ids
        ]) == len(env_ids)
        eval_step += 1
        return test_success, curriculum_success
Example #9
def evaluate():
    vec_env = create_env_headless()
    vec_env = DummyVecEnv([lambda: vec_env])
    model = PPO2.load(MODEL_NAME)
    print("After Training evaluation")
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    vec_env.close()
Example #10
def check_shape(make_env, model_class, shape_1, shape_2):
    model = model_class(policy="MlpPolicy", env=DummyVecEnv([make_env]))

    env0 = make_env()
    env1 = DummyVecEnv([make_env])

    for env, expected_shape in [(env0, shape_1), (env1, shape_2)]:

        def callback(locals_, _globals):
            assert np.array(locals_['action']).shape == expected_shape

        evaluate_policy(model, env, n_eval_episodes=5, callback=callback)
def main():
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    total_time_multi = None
    if not USE_LOADED_MODEL:
        model = ACKTR('MlpPolicy', env, verbose=1)

        # Multiprocessed RL Training
        start_time = time.time()
        model.learn(total_timesteps=n_timesteps, log_interval=10)
        total_time_multi = time.time() - start_time

        model.save("cartpole_v1_acktr")

    loaded_model = ACKTR.load("cartpole_v1_acktr")
    loaded_model.set_env(env)

    # Single Process RL Training
    single_process_model = ACKTR('MlpPolicy', env_id, verbose=1)
    start_time = time.time()
    single_process_model.learn(n_timesteps)
    total_time_single = time.time() - start_time

    print("Single-process: {0}s, Multi-process: {1}s".format(
        total_time_single, total_time_multi))

    # create separate clean environment for evaluation
    eval_env = gym.make(env_id)
    mean_reward, std_reward = evaluate_policy(loaded_model,
                                              eval_env,
                                              n_eval_episodes=10)
    print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
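
The make_env helper used above is not shown in this snippet; a typical version (a sketch following the stable-baselines multiprocessing example; the seed default is an assumption) returns a picklable thunk so SubprocVecEnv can build one seeded env per worker:

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    """Return a thunk creating a single seeded env (one per SubprocVecEnv worker)."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init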
def LunarLander_v2_DQN(): # TODO: raises an error
    # Create environment
    env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
    # Train the agent
    model.learn(total_timesteps=100000)
    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading

    # Load the trained agent
    model = DQN.load("dqn_lunar")

    # Evaluate the agent
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print(mean_reward, std_reward)

    # Enjoy trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #13
def train():
    env = LunarLander()
    # env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = PPO2(
        'MlpPolicy',
        env,
        learning_rate=0.001,
        # prioritized_replay=True,
        verbose=1)

    # Evaluate the untrained agent as a baseline
    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)
    print("Before training: mean reward {:.2f} +/- {:.2f}".format(
        mean_reward, std_reward))
    # Train the agent
    timesteps = os.environ.get('TIMESTEPS')
    timesteps = int(float(timesteps)) if timesteps is not None else 1e6
    print('timesteps %s' % timesteps)
    model.learn(total_timesteps=int(timesteps), log_interval=10)
    # Save the agent
    model.save("trained_models/latest")

    now = datetime.now()
    dt_string = now.strftime("%Y-%m-%d_%H-%M-%S")
    # model.save("trained_models/lunar_climber-%s" % dt_string)
    model.save("/opt/ml/model/lunar_climber-%s" % dt_string)

    # Plot training progress
    # plt.plot(env.all_rewards)
    # plt.ylabel('Reward')
    # plt.xlabel('Timesteps')
    # plt.savefig('figures/stats-%s.png' % dt_string)

    print("Model trained!")
Example #14
    def run_train(self, load_rand_state: bool = False) -> TrainResult:
        if load_rand_state:
            load_numpy_rand_state(self.rand_state_path)

        self.print("=" * 60)
        self.print(f"MODEL NAME:\t{self.model_name}")
        self.print("=" * 60)

        callbacks: List[BaseCallback] = self.create_callbacks(self.eval_env)

        self.model.learn(self.total_train_steps, tb_log_name=self.model_name, callback=callbacks)

        if self.use_eval_callback:
            self.model = type(self.model).load(os.path.join('non_learning_io_logs', self.model_name, "best_model.zip"))

        eval_pbar = tqdm(desc="Evaluation Episodes Progress", total=self.eval_episodes, file=sys.stdout)

        eval_callback = EvalPbarCallback(eval_pbar, self.verbose)

        mean_reward: float
        std_reward: float
        mean_reward, std_reward = evaluate_policy(
            self.model, self.eval_env, self.eval_episodes,
            callback=eval_callback,
        )
        eval_pbar.close()

        result = TrainResult(mean_reward, std_reward, eval_callback.num_successes, self.eval_episodes)
        self.print(result.res_msg())

        return result
Example #15
    def _eval_policy(self,
                     eval_freq,
                     eval_env,
                     n_eval_episodes,
                     timesteps_since_eval,
                     deterministic=True):
        """
        Evaluate the current policy on a test environment.

        :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
        :param eval_freq: (int) Evaluate the agent every `eval_freq` timesteps (this may vary a little)
        :param n_eval_episodes: (int) Number of episodes to evaluate the agent
        :param timesteps_since_eval: (int) Number of timesteps since last evaluation
        :param deterministic: (bool) Whether to use deterministic or stochastic actions
        :return: (int) Number of timesteps since last evaluation
        """
        if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
            timesteps_since_eval %= eval_freq
            # Synchronise the normalization stats if needed
            sync_envs_normalization(self.env, eval_env)
            mean_reward, std_reward = evaluate_policy(
                self, eval_env, n_eval_episodes, deterministic=deterministic)
            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(
                          self.num_timesteps, mean_reward, std_reward))
                print("FPS: {:.2f}".format(self.num_timesteps /
                                           (time.time() - self.start_time)))
        return timesteps_since_eval
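
sync_envs_normalization above only has an effect when the training env is wrapped in VecNormalize; a minimal sketch of such a setup (assuming stable-baselines is installed; CartPole-v1 and the keyword values are illustrative):

import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize, sync_envs_normalization

# Training env with running observation/reward normalization, plus a separate
# eval env that should reuse those statistics without updating them.
train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]),
                        training=False, norm_reward=False)

# Copy the running statistics from the training env into the eval env
# before evaluating, so both see observations on the same scale.
sync_envs_normalization(train_env, eval_env)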
Example #16
  def _on_step(self, plot=True) -> bool:
    """Evaluate the current policy for self.eval_episodes, then take a render
    and report all stats to W&B

    Args:
      plot: Enable matplotlib plotting behavior. Should be set to True unless 
        testing. Defaults to True.

    Returns:
      True, as per API requirements
    """
    mean_rewards, std_rewards = evaluate_policy(
      self.model, self.env, n_eval_episodes=self.eval_episodes)
    
    images = []
    rewards = []
    actions = []
    obses = []
    step_cnt = 0
    done, state = False, None
    obs = self.env.reset()
    while not done:
      if step_cnt % self.render_freq == 0:
        images.append(self.env.render(mode='rgb_array'))

      action, state = self.model.predict(obs, state=state, deterministic=True)
      obs, reward, done, _ = self.env.step(action)

      rewards.append(reward)
      actions.append(action)
      obses.append(obs)
      step_cnt += 1

    render = np.array(images)
    render = np.transpose(render, (0, 3, 1, 2))

    actions = np.array(actions).flatten()
    observes = np.array(obses).flatten()

    rewards = np.array(rewards)
    if plot:
      plt.clf()
      plt.plot(np.arange(len(rewards)), rewards)
      plt.xlabel('timesteps')
      plt.ylabel('rewards')
      plt.title('Timestep {}'.format(self.num_timesteps))

    wandb.log({
      'test_reward_mean': mean_rewards, 
      'test_reward_std': std_rewards,
      'render': wandb.Video(render, format='gif', fps=self.fps),
      'global_step': self.num_timesteps,
      'evaluations': self.n_calls,
      'reward_distribution': wandb.Histogram(rewards),
      'action_distribution': wandb.Histogram(actions),
      'observation_distribution': wandb.Histogram(observes),
      'reward_vs_time': plot and wandb.Image(plt),
    }, step=self.num_timesteps)

    return True
Example #17
def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1,
                            seed=seed)
    base_agent = ACKTR.load(
        f"./base_agent/{subenv_dict[base_index]}/model.zip")
    base_parameter_dict = base_agent.get_parameters()

    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(
            f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())

    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()

    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)

    aligned_agent.load_parameters(base_parameter_dict)
    avg_reward, reward_std = evaluate_policy(aligned_agent,
                                             base_env,
                                             n_eval_episodes=100)

    print(f"base {base_index}, weight {w} done")
    return (avg_reward, reward_std)
Example #18
    def __call__(self, locals_, globals_):
        """
        :param locals_: (dict)
        :param globals_: (dict)
        :return: (bool)
        """
        self.n_calls += 1
        self.model = locals_['self']
        self.num_timesteps = self.model.num_timesteps

        if self.n_calls % self.eval_freq == 0:
            episode_rewards, _ = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                deterministic=self.deterministic,
                return_episode_rewards=True)

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(
                          self.num_timesteps, mean_reward, std_reward))

            if mean_reward > self.best_mean_reward:
                if self.best_model_save_path is not None:
                    print("Saving best model")
                    self.model.save(self.best_model_save_path)
                self.best_mean_reward = mean_reward

        return True
def test_cnn_lstm_policy(request, policy):
    model_fname = './test_model_{}.zip'.format(request.node.name)

    try:
        env = make_env(0)
        model = PPO2(policy, env, nminibatches=1)
        model.learn(total_timesteps=15)
        env = model.get_env()
        evaluate_policy(model, env, n_eval_episodes=5)
        # saving
        model.save(model_fname)
        del model, env
        # loading
        _ = PPO2.load(model_fname, policy=policy)

    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
Example #20
def retrain(mean_reward, target_reward, count, env, model):
    if mean_reward < target_reward:
        count = count + 1
        model.learn(50)
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=1, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
        if mean_reward < target_reward:
            retrain(mean_reward, target_reward, count, env, model)

    return True
Example #21
    def run(self, n_eval=5, verbose=True):  # TODO add parameter to build ensemble
        for algo in self.algos_list:
            for env_name, env in self.envs.items():
                mean_reward, std_reward = evaluate_policy(self.models[env_name][algo], env=env,
                                                          n_eval_episodes=n_eval)
                # rewards[env_name][algo] = (mean_reward, std_reward)
                self.rewards[env_name][algo] = (mean_reward, std_reward)
                if verbose:
                    print("============ Finished evaluating " + algo + " on " + env_name + " ============")
                    print(f"mean_reward={mean_reward}\n\n")
Example #22
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000)
    evaluate_policy(model, env, n_eval_episodes=5)
    obs = env.reset()

    assert model.action_probability(obs).shape == (1, 10), \
        "Error: action_probability not returning correct shape"
    assert np.prod(model.action_probability(obs, actions=env.action_space.sample()).shape) == 1, \
        "Error: not scalar probability"
Example #23
    def _test(self, model):

        env = gym.make("gym_game2048:game2048-v0", **self._env_kwargs)
        self._env = DummyVecEnv([lambda: env for i in range(1)])

        mean_reward, _ = evaluate_policy(model,
                                         self._env,
                                         self._eval_episodes,
                                         deterministic=True)

        return mean_reward
Example #24
def _eval_env(
    env_name: str,
    total_timesteps: int,
) -> Tuple[float, float]:  # pragma: no cover
    """Train PPO2 for `total_timesteps` on `env_name` and evaluate returns."""
    env = gym.make(env_name)
    model = PPO2(MlpPolicy, env)
    model.learn(total_timesteps=total_timesteps)
    res = evaluate_policy(model, env)
    assert isinstance(res[0], float)
    return res
Example #25
def evaluate_agent(agent_factory: BaseAgentFactory, n_eval_episodes=10):
    tugger_env = TuggerEnv()
    tugger_env.reset()
    agent = agent_factory.create_agent(tugger_env)
    # Request per-episode results so episode lengths are actually available
    episode_rewards, episode_lengths = evaluate_policy(agent,
                                                       tugger_env,
                                                       n_eval_episodes=n_eval_episodes,
                                                       return_episode_rewards=True)
    print("\nevaluation finished.")
    print("mean reward: %s" % (sum(episode_rewards) / n_eval_episodes))
    print("average number of gym-steps/episode: %s" %
          (sum(episode_lengths) / n_eval_episodes))
Example #26
def run_illegal_move_training(
                    exp_name,exp_path,
                    basicdate,
                    model_type='PPO2',
                    n_eval_episodes=10,
                    training_intervals=100,
                    max_steps=10000,
                    reward_margin=10,
                    log_to_tb=False,
                    pelican_agent_filepath=False):
    
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'Illegal_move_prevention_training'
    else:
        writer = None
        tb_log_name = None
    
    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        # env = plark_env_illegal_move.PlarkEnvIllegalMove( config_file_path='/Components/plark-game/plark_game/game_config/10x10/balanced.json')
        env = gym.make('plark-env-illegal-move-v0')

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
            
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)

    else:   
        # Instantiate the env and model
        env = gym.make('plark-env-illegal-move-v0')
        model = PPO2('CnnPolicy', env)

    # Start training 
    train_agent(exp_path,model,env,training_intervals,max_steps,model_type,basicdate,writer,tb_log_name,reward_margin)
                
    # Evaluate
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Reward standard deviation is ' + str(std_reward))
Example #27
def run_sonobuoy_training(
                    exp_name,exp_path,
                    basicdate,
                    model_type='PPO2',
                    n_eval_episodes=10,
                    training_intervals=100,
                    max_steps=10000,
                    reward_margin=10,
                    log_to_tb=False,
                    pelican_agent_filepath=False):

    # set up logging 
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'sonobuoy_training'
    else:
        writer = None
        tb_log_name = None

        
    env = gym.make('plark-env-v0', panther_agent_filepath='/data/agents/models/PPO2_20200429_073132_panther/')
    
    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
            
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)

    else:   
        # Instantiate the env and model
        model = PPO2('CnnPolicy', env)

    # Start training 
    train_agent(exp_path,model,env,training_intervals,max_steps,model_type,basicdate,writer,tb_log_name,reward_margin)
                
    # Evaluate
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Reward standard deviation is ' + str(std_reward))
Example #28
def _search_hparams(env_id, agent, total_steps, trial):
    if agent == 'ppo2':
        model_params = ppo2_params(trial)
    elif agent == 'a2c':
        model_params = a2c_params(trial)

    envs, model = _train(env_id, agent, model_params, total_steps, True)
    mean_reward, _ = evaluate_policy(model, envs, n_eval_episodes=10)

    envs.close()
    # Negate the reward because Optuna minimizes the objective (loss) by default.
    return -mean_reward
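
The objective above is meant to be handed to an Optuna study; a minimal usage sketch (assuming optuna is installed; the env id, agent name, step budget, and trial count are placeholders):

import optuna
from functools import partial

# Optuna minimizes the returned value by default, which is why the
# objective above returns the negated mean reward.
study = optuna.create_study()
study.optimize(partial(_search_hparams, "CartPole-v1", "ppo2", 100000),
               n_trials=20)
print(study.best_params)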
    def _on_step(self) -> bool:

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            print("----EVALUATION-----")
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            episode_rewards, episode_lengths, episode_success = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True)

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)
                np.savez(self.log_path,
                         timesteps=self.evaluations_timesteps,
                         results=self.evaluations_results,
                         ep_lengths=self.evaluations_length)

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
                episode_lengths)
            self.success_rate = np.mean(episode_success)
            # Keep track of the last evaluation, useful for classes that derive from this callback
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(
                          self.num_timesteps, mean_reward, std_reward))
                print("Episode length: {:.2f} +/- {:.2f}".format(
                    mean_ep_length, std_ep_length))
                print("Success Rate: {}".format(self.success_rate))

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, 'best_model'))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
def main():
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor

    model = PPO2(MlpPolicy, env, verbose=1)

    # before training
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
    print("Mean reward: {0} +/- {1}".format(mean_reward, std_reward))

    model.learn(total_timesteps=5000)

    # after training
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
    print("Mean reward: {0} +/- {1}".format(mean_reward, std_reward))

    # Visualise trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()