# Assumed module-level imports for the routines in this section; SMA,
# arrprint, logger, BatchExperiences and the gym_* helpers are
# project-internal and imported from elsewhere in the repo.
import time
from copy import deepcopy
from typing import Callable, NoReturn

import numpy as np


def run(self):
    n = self.env.n
    # Slot 0 holds vector observations, slot 1 visual observations.
    i = 1 if self.env.obs_type == 'visual' else 0
    state = [np.full((n, 0), []), np.full((n, 0), [])]
    sma = SMA(100)
    total_step = 0
    episode = 0
    while True:
        # Periodically pull the latest parameters from the trainer.
        if episode % self.pull_interval == 0:
            self.model.set_worker_params(self.callback_func())
            logger.info('pull parameters from remote successfully.')
        episode += 1
        self.model.reset()
        state[i] = self.env.reset()
        dones_flag = np.zeros(self.env.n)
        step = 0
        rets = np.zeros(self.env.n)
        last_done_step = -1
        while True:
            step += 1
            # env.render(record=False)
            action = self.model.choose_action(s=state[0], visual_s=state[1])
            _, reward, done, info, state[i] = self.env.step(action)
            # Accumulate reward only for sub-environments that are still running.
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            self.model.partial_reset(done)
            total_step += 1
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                break
            if step >= 200:  # hard cap on rollout length
                break
        sma.update(rets)
        self.model.writer_summary(
            episode,
            reward_mean=rets.mean(),
            reward_min=rets.min(),
            reward_max=rets.max(),
            step=last_done_step,
            **sma.rs
        )
        logger.info(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
        time.sleep(self.episode_sleep)
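# The SMA helper is not defined in this section. The sketch below is a
# hypothetical reconstruction, assuming it keeps a sliding window over the
# last `capacity` episode returns and exposes summary statistics through
# `.rs` for writer_summary(**sma.rs). The real class may track different
# statistics under different keys.
from collections import deque


class SMA:
    """Sliding moving average over recent episode returns (minimal sketch)."""

    def __init__(self, capacity: int = 100):
        self.buffer = deque(maxlen=capacity)

    def update(self, rets) -> None:
        # Store a copy of one episode's per-environment returns.
        self.buffer.append(np.asarray(rets, dtype=np.float64).copy())

    @property
    def rs(self) -> dict:
        # Keys are prefixed so they cannot collide with the reward_mean /
        # reward_min / reward_max keywords passed alongside **sma.rs.
        if not self.buffer:
            return {}
        data = np.stack(list(self.buffer))
        return {'sma_mean': float(data.mean()),
                'sma_min': float(data.min()),
                'sma_max': float(data.max())}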
def evaluate(env, model):
    n = env.n
    i = 1 if env.obs_type == 'visual' else 0
    state = [np.full((n, 0), []), np.full((n, 0), [])]
    sma = SMA(100)
    total_step = 0
    episode = 0
    while True:
        episode += 1
        model.reset()
        state[i] = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        rets = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            # env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            _, reward, done, info, state[i] = env.step(action)
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            model.partial_reset(done)
            total_step += 1
            if all(dones_flag):
                # Remember the step at which every sub-environment had finished.
                if last_done_step == -1:
                    last_done_step = step
                break
            if step >= 200:  # hard cap on episode length
                break
        sma.update(rets)
        model.writer_summary(
            episode,
            reward_mean=rets.mean(),
            reward_min=rets.min(),
            reward_max=rets.max(),
            step=last_done_step,
            **sma.rs
        )
        print(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
        time.sleep(5)
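# arrprint is another project-internal helper used only for logging. A
# plausible stand-in, assuming it renders a 1-D array of per-environment
# returns with a fixed number of decimals:
def arrprint(arr, precision: int = 2) -> str:
    """Format an array of returns as a compact string (assumed behavior)."""
    return ' '.join(f'{x:.{precision}f}' for x in np.asarray(arr).ravel())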
def gym_train(env, model,
              print_func: Callable[[str], None],
              begin_train_step: int, begin_frame_step: int, begin_episode: int,
              render: bool, render_episode: int,
              save_frequency: int,
              max_step_per_episode: int, max_train_episode: int,
              eval_while_train: bool, max_eval_episode: int,
              off_policy_step_eval_episodes: int, off_policy_train_interval: int,
              policy_mode: str, moving_average_episode: int,
              add_noise2buffer: bool, add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int,
              off_policy_eval_interval: int,
              max_train_step: int, max_frame_step: int) -> NoReturn:
    """Train `model` on a vectorized gym-style environment.

    Off-policy models learn every `off_policy_train_interval` environment
    steps; on-policy models learn once per episode. Checkpoints, step-wise
    evaluation and optional noise injection are handled along the way.
    """
    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0
    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        rets = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(action)
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            model.store_data(s=state[0], visual_s=state[1],
                             a=action, r=reward,
                             s_=new_state[0], visual_s_=new_state[1],
                             done=done)
            model.partial_reset(done)
            state[i] = correct_new_state
            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step,
                                  off_policy_step_eval_episodes, max_step_per_episode)
            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                # On-policy training keeps stepping until the step cap so a
                # full-length trajectory is collected before learning.
                if policy_mode == 'off-policy':
                    break
            if step >= max_step_per_episode:
                break
        sma.update(rets)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
        model.writer_summary(episode,
                             reward_mean=rets.mean(),
                             reward_min=rets.min(),
                             reward_max=rets.max(),
                             step=last_done_step,
                             **sma.rs)
        print_func('-' * 40, out_time=True)
        print_func(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(rets, 2)}')
        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps,
                      print_func=print_func, prefill_choose=False, desc='adding noise')
        if eval_while_train and env.reward_threshold is not None:
            if rets.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step_per_episode, max_eval_episode, print_func)
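# init_variables is not shown in this section; judging from run() and
# evaluate() above, it most likely selects the active observation slot and
# pre-allocates the [vector, visual] state pairs. A hypothetical
# reconstruction of that bookkeeping:
def init_variables(env):
    """Return the active observation index and empty state placeholders."""
    i = 1 if env.obs_type == 'visual' else 0
    state = [np.full((env.n, 0), []), np.full((env.n, 0), [])]
    new_state = [np.full((env.n, 0), []), np.full((env.n, 0), [])]
    return i, state, new_state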
# Variant of gym_train that routes observations through a single `obs`
# object and stores transitions as BatchExperiences records instead of
# separate s/visual_s arrays.
def gym_train(env, model,
              print_func: Callable[[str], None],
              begin_train_step: int, begin_frame_step: int, begin_episode: int,
              render: bool, render_episode: int,
              save_frequency: int,
              max_step_per_episode: int, max_train_episode: int,
              eval_while_train: bool, max_eval_episode: int,
              off_policy_step_eval_episodes: int, off_policy_train_interval: int,
              policy_mode: str, moving_average_episode: int,
              add_noise2buffer: bool, add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int,
              off_policy_eval_interval: int,
              max_train_step: int, max_frame_step: int) -> NoReturn:
    """Train `model` on a vectorized gym-style environment (BatchExperiences API)."""
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0
    for episode in range(begin_episode, max_train_episode):
        model.reset()
        obs = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        returns = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(obs=obs)
            ret = env.step(action)
            model.store_data(BatchExperiences(obs=obs,
                                              action=action,
                                              reward=ret.reward[:, np.newaxis],  # [B, ] => [B, 1]
                                              obs_=ret.obs,
                                              done=ret.done[:, np.newaxis]))
            model.partial_reset(ret.done)
            returns += (1 - dones_flag) * ret.reward
            dones_flag = np.sign(dones_flag + ret.done)
            obs = ret.corrected_obs
            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step,
                                  off_policy_step_eval_episodes, max_step_per_episode)
            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break
            if step >= max_step_per_episode:
                break
        sma.update(returns)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
        model.writer_summary(episode,
                             reward_mean=returns.mean(),
                             reward_min=returns.min(),
                             reward_max=returns.max(),
                             step=last_done_step,
                             **sma.rs)
        print_func(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(returns, 2)}', out_time=True)
        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps,
                      prefill_choose=False, desc='adding noise')
        if eval_while_train and env.reward_threshold is not None:
            if returns.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step_per_episode, max_eval_episode, print_func)
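# BatchExperiences is imported from the repo's data-structure module. A
# plausible definition, inferred purely from the store_data call above;
# the real class may carry extra fields (e.g. visual observations):
from typing import NamedTuple


class BatchExperiences(NamedTuple):
    obs: np.ndarray      # [B, *obs_shape] observations at time t
    action: np.ndarray   # [B, *act_shape] actions taken
    reward: np.ndarray   # [B, 1] rewards, expanded from [B, ]
    obs_: np.ndarray     # [B, *obs_shape] observations at time t+1
    done: np.ndarray     # [B, 1] episode-termination flags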