Example #1
    def run(self):
        n = self.env.n
        i = 1 if self.env.obs_type == 'visual' else 0
        state = [np.full((n, 0), []), np.full((n, 0), [])]
        sma = SMA(100)
        total_step = 0
        episode = 0

        while True:
            if episode % self.pull_interval == 0:
                self.model.set_worker_params(self.callback_func())
                logger.info('pulled parameters successfully.')
            episode += 1
            self.model.reset()
            state[i] = self.env.reset()
            dones_flag = np.zeros(self.env.n)
            step = 0
            rets = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                # env.render(record=False)
                action = self.model.choose_action(s=state[0], visual_s=state[1])
                _, reward, done, info, state[i] = self.env.step(action)
                rets += (1 - dones_flag) * reward
                dones_flag = np.sign(dones_flag + done)
                self.model.partial_reset(done)
                total_step += 1
                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    break

                if step >= 200:
                    break

            sma.update(rets)
            self.model.writer_summary(
                episode,
                reward_mean=rets.mean(),
                reward_min=rets.min(),
                reward_max=rets.max(),
                step=last_done_step,
                **sma.rs
            )
            logger.info(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
            time.sleep(self.episode_sleep)
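All four examples accumulate per-environment episode returns with the same masking trick: rets += (1 - dones_flag) * reward stops adding reward for sub-environments that have already terminated, and dones_flag = np.sign(dones_flag + done) latches each done flag at 1. A minimal, self-contained sketch of that pattern follows; the per-step rewards and dones are fabricated purely for illustration.

import numpy as np

n = 3                         # number of parallel sub-environments
rets = np.zeros(n)            # per-env episode returns
dones_flag = np.zeros(n)      # becomes 1.0 once a sub-env has terminated

# fabricated per-step (reward, done) pairs, for illustration only
steps = [
    (np.array([1.0, 1.0, 1.0]), np.array([0, 0, 0])),
    (np.array([1.0, 2.0, 1.0]), np.array([0, 1, 0])),  # env 1 terminates here
    (np.array([5.0, 9.9, 1.0]), np.array([1, 0, 1])),  # env 1's later reward is masked out
]

for reward, done in steps:
    rets += (1 - dones_flag) * reward        # only still-running envs accumulate reward
    dones_flag = np.sign(dones_flag + done)  # latch: once done, always done

print(rets)        # [7. 3. 3.]
print(dones_flag)  # [1. 1. 1.]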
Example #2
    def evaluate(env, model):
        n = env.n
        i = 1 if env.obs_type == 'visual' else 0
        state = [np.full((n, 0), []), np.full((n, 0), [])]
        sma = SMA(100)
        total_step = 0
        episode = 0

        while True:
            episode += 1
            model.reset()
            state[i] = env.reset()
            dones_flag = np.zeros(env.n)
            step = 0
            rets = np.zeros(env.n)
            last_done_step = -1
            while True:
                step += 1
                # env.render(record=False)
                action = model.choose_action(s=state[0], visual_s=state[1])
                _, reward, done, info, state[i] = env.step(action)
                rets += (1 - dones_flag) * reward
                dones_flag = np.sign(dones_flag + done)
                model.partial_reset(done)
                total_step += 1
                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    break

                if step >= 200:
                    break

            sma.update(rets)
            model.writer_summary(
                episode,
                reward_mean=rets.mean(),
                reward_min=rets.min(),
                reward_max=rets.max(),
                step=last_done_step,
                **sma.rs
            )
            print(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
            time.sleep(5)
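Both examples smooth the logged returns with SMA(100), calling sma.update(rets) each episode and unpacking sma.rs into writer_summary. The real SMA class ships with RLs; purely as a sketch of the interface used here (the key names inside rs are an assumption), it could look like:

from collections import deque

import numpy as np

class SMA:
    """Simple moving average over recent episode returns (interface sketch, not the RLs implementation)."""

    def __init__(self, capacity: int = 100):
        self._buffer = deque(maxlen=capacity)
        self.rs = {}  # summary dict, unpacked via writer_summary(**sma.rs)

    def update(self, rets: np.ndarray) -> None:
        self._buffer.extend(np.asarray(rets).ravel().tolist())
        window = np.array(self._buffer)
        self.rs = {
            'sma_reward_mean': window.mean(),
            'sma_reward_min': window.min(),
            'sma_reward_max': window.max(),
        }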
Example #3
File: gym.py Project: ncepuwwy97/RLs
def gym_train(env, model, print_func: Callable[[str], None],
              begin_train_step: int, begin_frame_step: int, begin_episode: int,
              render: bool, render_episode: int, save_frequency: int,
              max_step_per_episode: int, max_train_episode: int,
              eval_while_train: bool, max_eval_episode: int,
              off_policy_step_eval_episodes: int,
              off_policy_train_interval: int, policy_mode: str,
              moving_average_episode: int, add_noise2buffer: bool,
              add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int, off_policy_eval_interval: int,
              max_train_step: int, max_frame_step: int) -> NoReturn:
    """
    TODO: Annotation
    """

    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        rets = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(
                action)
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            model.store_data(s=state[0],
                             visual_s=state[1],
                             a=action,
                             r=reward,
                             s_=new_state[0],
                             visual_s_=new_state[1],
                             done=done)
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step,
                                          episode=episode,
                                          frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step,
                                  off_policy_step_eval_episodes,
                                  max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(rets)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
        model.writer_summary(episode,
                             reward_mean=rets.mean(),
                             reward_min=rets.min(),
                             reward_max=rets.max(),
                             step=last_done_step,
                             **sma.rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(rets, 2)}'
        )

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env,
                      model,
                      pre_fill_steps=add_noise2buffer_steps,
                      print_func=print_func,
                      prefill_choose=False,
                      desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if rets.max() >= env.reward_threshold:
                print_func(
                    f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                )
                gym_evaluate(env, model, max_step_per_episode,
                             max_eval_episode, print_func)
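Unlike Examples #1 and #2, this version hides the state setup behind init_variables(env). Based on the manual setup in those two examples (obs_type selects the vector or visual slot, and both slots start as empty arrays), a plausible sketch of that helper is the following; this is an inference from the surrounding code, not the RLs source.

import numpy as np

def init_variables(env):
    """Return the active observation index plus empty state/new_state buffers (inferred from Examples #1/#2)."""
    n = env.n
    # slot 0 holds vector observations, slot 1 holds visual observations
    i = 1 if env.obs_type == 'visual' else 0
    state = [np.full((n, 0), []), np.full((n, 0), [])]
    new_state = [np.full((n, 0), []), np.full((n, 0), [])]
    return i, state, new_state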
Example #4
def gym_train(env, model,
              print_func: Callable[[str], None],
              begin_train_step: int,
              begin_frame_step: int,
              begin_episode: int,
              render: bool,
              render_episode: int,
              save_frequency: int,
              max_step_per_episode: int,
              max_train_episode: int,
              eval_while_train: bool,
              max_eval_episode: int,
              off_policy_step_eval_episodes: int,
              off_policy_train_interval: int,
              policy_mode: str,
              moving_average_episode: int,
              add_noise2buffer: bool,
              add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int,
              off_policy_eval_interval: int,
              max_train_step: int,
              max_frame_step: int) -> NoReturn:
    """
    TODO: Annotation
    """

    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        obs = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        returns = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(obs=obs)
            ret = env.step(action)
            model.store_data(BatchExperiences(obs=obs,
                                              action=action,
                                              reward=ret.reward[:, np.newaxis],  # [B, ] => [B, 1]
                                              obs_=ret.obs,
                                              done=ret.done[:, np.newaxis]))
            model.partial_reset(ret.done)
            returns += (1 - dones_flag) * ret.reward
            dones_flag = np.sign(dones_flag + ret.done)
            obs = ret.corrected_obs

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step, off_policy_step_eval_episodes, max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(returns)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
        model.writer_summary(
            episode,
            reward_mean=returns.mean(),
            reward_min=returns.min(),
            reward_max=returns.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(returns, 2)}', out_time=True)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps, prefill_choose=False, desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if returns.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step_per_episode, max_eval_episode, print_func)
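This newer variant packs each transition into a BatchExperiences record before handing it to model.store_data. The actual definition lives in RLs and may carry more structure; judging only from the fields used above, a minimal stand-in could be a NamedTuple like the following (field types are assumptions).

from typing import NamedTuple

import numpy as np

class BatchExperiences(NamedTuple):
    """Transition batch as used in Example #4 (sketch; the real RLs class may differ)."""
    obs: np.ndarray     # observations at time t
    action: np.ndarray  # actions taken at time t
    reward: np.ndarray  # shape [B, 1]; note the [:, np.newaxis] in the caller
    obs_: np.ndarray    # observations at time t + 1
    done: np.ndarray    # shape [B, 1] terminal flags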