def train(self, n_episodes=100, annealing_episodes=None, done_reward=None, every_episode=None):
    """Train the tabular Q-learning agent.

    Args:
        n_episodes (int): total number of episodes to run.
        annealing_episodes (int|None): episodes over which epsilon is
            annealed from `self.epsilon` down to `self.epsilon_final`;
            defaults to `n_episodes`.
        done_reward (float|None): if set, replaces the reward on the
            terminal transition of each episode.
        every_episode (int|None): report progress every this many
            episodes; no periodic logging when None.
    """
    reward_history = []
    reward_averaged = []
    step = 0
    alpha = self.alpha
    eps = self.epsilon

    # Linear epsilon annealing schedule.
    annealing_episodes = annealing_episodes or n_episodes
    eps_drop = (self.epsilon - self.epsilon_final) / annealing_episodes

    for n_episode in range(n_episodes):
        ob = self.env.reset()
        done = False
        reward = 0.

        while not done:
            a = self.act(ob, eps)
            new_ob, r, done, info = self.env.step(a)
            if done and done_reward is not None:
                r = done_reward

            self._update_q_value(Transition(ob, a, r, new_ob, done))
            step += 1
            reward += r
            ob = new_ob

        reward_history.append(reward)
        reward_averaged.append(np.average(reward_history[-50:]))

        # NOTE(review): `alpha` is decayed here but never passed to
        # `_update_q_value`; presumably the update reads `self.alpha`
        # directly — confirm whether this local decay is intentional.
        alpha *= self.alpha_decay
        if eps > self.epsilon_final:
            # FIX: clamp so epsilon never undershoots the configured floor
            # (the unclamped subtraction could overshoot on the last drop).
            eps = max(self.epsilon_final, eps - eps_drop)

        if every_episode is not None and n_episode % every_episode == 0:
            # Report the performance every `every_episode` episodes.
            print(
                "[episode:{}|step:{}] best:{} avg:{:.4f}|{} alpha:{:.4f} eps:{:.4f} Qsize:{}"
                .format(n_episode, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), reward_history[-5:],
                        alpha, eps, len(self.Q)))

    print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".
          format(len(reward_history), np.max(reward_history),
                 np.mean(reward_history)))

    plot_learning_curve(self.name, {
        'reward': reward_history,
        'reward_avg50': reward_averaged
    }, xlabel='episode')
def train(self, config: TrainConfig):
    """Run tabular Q-learning training as described by `config`.

    Per episode: roll out with epsilon-greedy actions, apply a Q-value
    update on every transition, then decay `alpha` and anneal `eps`.
    Renders the environment once after training completes.
    """
    reward_history = []
    reward_averaged = []

    step = 0
    alpha = config.alpha
    eps = config.epsilon

    # Anneal epsilon linearly over the warm-up episodes.
    warmup_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (config.epsilon - config.epsilon_final) / warmup_episodes

    for n_episode in range(config.n_episodes):
        done = False
        episode_reward = 0.
        ob = self.env.reset()

        while not done:
            action = self.act(ob, eps)
            ob_next, r, done, info = self.env.step(action)
            if done and config.done_reward is not None:
                # Bonus (or penalty) applied on the terminal transition.
                r += config.done_reward

            self._update_q_value(
                Transition(ob, action, r, ob_next, done), alpha)

            step += 1
            episode_reward += r
            ob = ob_next

        reward_history.append(episode_reward)
        reward_averaged.append(np.average(reward_history[-50:]))

        # Decay the learning rate and exploration rate after each episode.
        alpha *= config.alpha_decay
        if eps > config.epsilon_final:
            eps = max(config.epsilon_final, eps - eps_drop)

        should_log = (config.log_every_episode is not None
                      and n_episode % config.log_every_episode == 0)
        if should_log:
            print(
                "[episode:{}|step:{}] best:{} avg:{:.4f} alpha:{:.4f} eps:{:.4f} Qsize:{}"
                .format(n_episode, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), alpha, eps,
                        len(self.Q)))

    print("[FINAL] Num. episodes: {}, Max reward: {}, Average reward: {}".
          format(len(reward_history), np.max(reward_history),
                 np.mean(reward_history)))

    data_dict = {'reward': reward_history, 'reward_avg50': reward_averaged}
    plot_learning_curve(self.name, data_dict, xlabel='episode')
    self.env.render()
def train(self, config: TrainConfig):
    """Train the agent for `config.n_steps` environment steps.

    Transitions are pushed into a replay buffer as they occur; once the
    buffer holds at least one batch, a gradient update runs on every
    environment step and the target networks are softly updated with
    rate `config.tau`. Epsilon decays linearly per step over
    `config.warmup_steps`. A checkpoint is saved once at the end.
    """
    # Construct the replay memory buffer.
    buffer = ReplayMemory(tuple_class=Transition)

    step = 0
    n_episode = 0
    episode_reward = 0.
    reward_history = []
    reward_averaged = []

    eps = config.epsilon
    # Linear per-step epsilon decay during warm-up.
    eps_drop_per_step = (eps - config.epsilon_final) / config.warmup_steps
    print("decrease `epsilon` per step:", eps_drop_per_step)

    env = self.env
    ob = env.reset()
    done = False

    while step < config.n_steps:
        while not done:
            a = self.act(ob, eps)
            ob_next, r, done, _ = env.step(a)
            step += 1
            episode_reward += r
            buffer.add(Transition(ob, a, r, ob_next, float(done)))
            ob = ob_next

            if eps > config.epsilon_final:
                eps = max(config.epsilon_final, eps - eps_drop_per_step)

            if reward_history and config.log_every_step and step % config.log_every_step == 0:
                # Report the performance every `log_every_step` steps
                print(
                    "[episodes:{}/step:{}], best(reward):{:.2f}, avg(reward):{:.2f}, eps:{:.4f}"
                    .format(n_episode, step, np.max(reward_history),
                            np.mean(reward_history[-10:]), eps))
                # self.save_checkpoint(step=step)

            if buffer.size >= config.batch_size:
                batch = buffer.pop(config.batch_size)
                _, q_loss, mu_loss, summ_str = self.sess.run(
                    [
                        self.train_ops, self.Q_loss, self.mu_loss,
                        self.merged_summary
                    ],
                    feed_dict={
                        self.lr_a: config.lr_a,
                        self.lr_c: config.lr_c,
                        self.done: batch['done'],
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.r: batch['r'],
                        self.s_next: batch['s_next'],
                        self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                # Soft-update the target networks towards the online ones.
                self.update_target_net(tau=config.tau)
                self.writer.add_summary(summ_str, step)

        # one trajectory is complete.
        # (FIX: removed the dead local `episode_step`, which was
        # incremented and reset but never read anywhere.)
        n_episode += 1
        ob = env.reset()
        done = False
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_reward = 0.

    self.save_checkpoint(step=step)

    print(
        "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
def train(self, config: TrainConfig):
    """Train the agent for `config.n_episodes` episodes.

    Transitions are buffered per step; whenever the buffer holds at
    least one batch, gradient updates run until it drains below the
    batch size. Learning rates and epsilon are annealed per episode;
    a checkpoint is saved once at the end.
    """
    buffer = ReplayMemory(tuple_class=Transition)

    step = 0
    episode_reward = 0.
    reward_history = []
    reward_averaged = []

    lr_c = config.lr_c
    lr_a = config.lr_a

    eps = config.epsilon
    warmup_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (eps - config.epsilon_final) / warmup_episodes
    print("Decrease epsilon per step:", eps_drop)

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        # NOTE(review): the result of this extra act() call is discarded;
        # presumably it warms up internal actor state — confirm intent.
        self.act(ob, eps)
        done = False

        while not done:
            a = self.act(ob, eps)
            ob_next, r, done, info = self.env.step(a)
            step += 1
            episode_reward += r

            record = Transition(self.obs_to_inputs(ob), a, r,
                                self.obs_to_inputs(ob_next), done)
            buffer.add(record)
            ob = ob_next

            while buffer.size >= config.batch_size:
                batch = buffer.pop(config.batch_size)
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary],
                    feed_dict={
                        self.lr_c: lr_c,
                        self.lr_a: lr_a,
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.r: batch['r'],
                        self.s_next: batch['s_next'],
                        self.done: batch['done'],
                        self.episode_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                self.writer.add_summary(summ_str, step)

        # One trajectory is complete!
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_reward = 0.

        # Anneal the learning rates and exploration rate per episode.
        lr_c *= config.lr_c_decay
        lr_a *= config.lr_a_decay
        if eps > config.epsilon_final:
            # FIX: clamp so epsilon never undershoots the configured floor
            # (the unclamped subtraction could overshoot on the last drop).
            eps = max(config.epsilon_final, eps - eps_drop)

        if (reward_history and config.log_every_episode
                and n_episode % config.log_every_episode == 0):
            # Report the performance every `log_every_episode` episodes.
            print(
                "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}"
                .format(
                    n_episode, step,
                    np.max(reward_history),
                    np.mean(reward_history[-10:]),
                    reward_history[-5:],
                    lr_c, lr_a, eps,
                ))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)

    print(
        "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
def train(self, config: TrainConfig):
    """Train a PPO-style agent for `config.n_iterations` iterations.

    Each iteration generates `config.n_rollout_workers` rollouts into a
    replay buffer, then runs `config.train_epoches` epochs of minibatch
    updates over the buffered records. The ratio clip range optionally
    decays linearly to zero across iterations. Saves one checkpoint at
    the end and plots the learning curve.
    """
    # Per-record fields: state, action, next state, reward, done flag,
    # the behavior policy's log-prob of the action, the value target,
    # and the advantage estimate.
    BufferRecord = namedtuple('BufferRecord', [
        's', 'a', 's_next', 'r', 'done', 'old_logp_actor', 'v_target', 'adv'
    ])
    buffer = ReplayMemory(tuple_class=BufferRecord)

    reward_history = []
    reward_averaged = []
    step = 0           # counts gradient updates, not env steps.
    total_rec = 0      # total transitions collected so far.

    clip = config.ratio_clip_range
    if config.ratio_clip_decay:
        # Linear decay reaching 0.0 at the final iteration.
        clip_delta = clip / config.n_iterations
    else:
        clip_delta = 0.0

    for n_iteration in range(config.n_iterations):
        # we should have multiple rollout_workers running in parallel.
        for _ in range(config.n_rollout_workers):
            episode_reward, n_rec = self._generate_rollout(buffer)
            # One trajectory is complete.
            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))
            total_rec += n_rec

        # now let's train the model for some steps.
        for batch in buffer.loop(config.batch_size, epoch=config.train_epoches):
            _, summ_str = self.sess.run(
                [self.train_ops, self.merged_summary],
                feed_dict={
                    self.lr_a: config.lr_a,
                    self.lr_c: config.lr_c,
                    self.clip_range: clip,
                    self.s: batch['s'],
                    self.a: batch['a'],
                    self.s_next: batch['s_next'],
                    self.r: batch['r'],
                    self.done: batch['done'],
                    self.old_logp_a: batch['old_logp_actor'],
                    self.v_target: batch['v_target'],
                    self.adv: batch['adv'],
                    self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                })
            self.writer.add_summary(summ_str, step)
            step += 1

        # Shrink the clip range, never below zero.
        clip = max(0.0, clip - clip_delta)

        if (reward_history and config.log_every_iteration
                and n_iteration % config.log_every_iteration == 0):
            # Report the performance every `log_every_iteration` steps
            print(
                "[iteration:{}/step:{}], best:{}, avg:{:.2f}, hist:{}, clip:{:.2f}; {} transitions."
                .format(
                    n_iteration, step, np.max(reward_history),
                    np.mean(reward_history[-10:]),
                    list(map(lambda x: round(x, 2), reward_history[-5:])),
                    clip, total_rec))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)

    print(
        "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
def train(self, config: TrainConfig):
    """Train the DQN agent for `config.n_episodes` episodes.

    Whole trajectories are added to the replay memory after each episode
    completes; per-step minibatch updates begin once the buffer holds at
    least `self.batch_size` samples. Supports double-Q action selection
    and an LSTM variant that replays fixed-length trajectory windows.
    """
    if self.model_type == 'lstm':
        buffer = ReplayTrajMemory(capacity=config.memory_capacity,
                                  step_size=self.step_size)
    else:
        buffer = ReplayMemory(capacity=config.memory_capacity)

    reward = 0.
    # Seeded with 0.0 so `reward_history[-1]` is valid in the first feed.
    reward_history = [0.0]
    reward_averaged = []

    lr = config.lr
    eps = config.epsilon
    annealing_episodes = config.warmup_episodes or config.n_episodes
    eps_drop = (config.epsilon - config.epsilon_final) / annealing_episodes
    print("eps_drop:", eps_drop)
    step = 0

    for n_episode in range(config.n_episodes):
        ob = self.env.reset()
        done = False
        traj = []

        while not done:
            a = self.act(self.obs_to_inputs(ob), eps)
            new_ob, r, done, info = self.env.step(a)
            step += 1
            reward += r

            traj.append(
                Transition(self.obs_to_inputs(ob), a, r,
                           self.obs_to_inputs(new_ob), done))
            ob = new_ob

            # No enough samples in the buffer yet.
            if buffer.size < self.batch_size:
                continue

            # Training with a mini batch of samples!
            batch_data = buffer.sample(self.batch_size)
            feed_dict = {
                self.learning_rate: lr,
                self.states: batch_data['s'],
                self.actions: batch_data['a'],
                self.rewards: batch_data['r'],
                self.states_next: batch_data['s_next'],
                self.done_flags: batch_data['done'],
                self.ep_reward: reward_history[-1],
            }

            if self.double_q:
                # Double DQN: pick next actions with the online network.
                actions_next = self.sess.run(
                    self.actions_selected_by_q,
                    {self.states: batch_data['s_next']})
                feed_dict.update({self.actions_next: actions_next})

            _, q_val, q_target_val, loss, summ_str = self.sess.run([
                self.optimizer, self.q, self.q_target, self.loss,
                self.merged_summary
            ], feed_dict)
            self.writer.add_summary(summ_str, step)

            # FIX: the original condition `if step % interval:` fired on
            # every step EXCEPT multiples of the interval; sync the target
            # network only once every `target_update_every_step` steps.
            if step % config.target_update_every_step == 0:
                self.update_target_q_net()

        # Add all the transitions of one trajectory into the replay memory.
        buffer.add(traj)

        # One episode is complete.
        reward_history.append(reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        reward = 0.

        # Annealing the learning and exploration rate after every episode.
        lr *= config.lr_decay
        if eps > config.epsilon_final:
            eps = max(eps - eps_drop, config.epsilon_final)

        if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0:
            # Report the performance every `log_every_episode` episodes.
            # FIX: the original format string had no placeholder for the
            # trailing `buffer.size` argument, which str.format silently
            # dropped; it is now printed as `buffer:{}`.
            print(
                "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}, eps:{:.4f}, buffer:{}"
                .format(n_episode, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), reward_history[-5:],
                        lr, eps, buffer.size))
            # self.save_checkpoint(step=step)

    self.save_checkpoint(step=step)

    print(
        "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
def train(self, n_episodes, annealing_episodes=None, every_episode=None, done_rewards=None):
    """Train the actor-critic agent with bootstrapped TD targets.

    Args:
        n_episodes (int): number of episodes to run.
        annealing_episodes (int|None): episodes over which epsilon anneals
            down to `self.epsilon_final`; defaults to `n_episodes`.
        every_episode (int|None): logging interval in episodes.
        done_rewards (float|None): next-state value used on terminal
            transitions (falls back to 0.0 when None or falsy).
    """
    step = 0
    episode_reward = 0.
    reward_history = []
    reward_averaged = []

    lr_c = self.lr_c
    lr_a = self.lr_a

    eps = self.epsilon
    annealing_episodes = annealing_episodes or n_episodes
    eps_drop = (eps - self.epsilon_final) / annealing_episodes
    print("eps_drop:", eps_drop)

    for n_episode in range(n_episodes):
        ob = self.env.reset()
        # NOTE(review): the result of this extra act() call is discarded;
        # presumably it warms up internal actor state — confirm intent.
        self.act(ob, eps)
        done = False

        while not done:
            a = self.act(ob, eps)
            ob_next, r, done, info = self.env.step(a)
            step += 1
            episode_reward += r

            # Bootstrap the TD target with the critic's value of the next
            # state, or `done_rewards` (default 0.0) on termination.
            if done:
                next_state_value = done_rewards or 0.0
            else:
                with self.sess.as_default():
                    next_state_value = self.critic.eval(
                        {self.states: [self.obs_to_inputs(ob_next)]})[0][0]
            td_target = r + self.gamma * next_state_value

            self.memory.add(Record(self.obs_to_inputs(ob), a, r, td_target))
            ob = ob_next

            while self.memory.size >= self.batch_size:
                batch = self.memory.pop(self.batch_size)
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary],
                    feed_dict={
                        self.learning_rate_c: lr_c,
                        self.learning_rate_a: lr_a,
                        self.states: batch['s'],
                        self.actions: batch['a'],
                        self.rewards: batch['r'],
                        self.td_targets: batch['td_target'],
                        self.ep_reward: reward_history[-1] if reward_history else 0.0,
                    })
                self.writer.add_summary(summ_str, step)

        # One trajectory is complete!
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_reward = 0.

        # Anneal the learning rates and exploration rate per episode.
        lr_c *= self.lr_c_decay
        lr_a *= self.lr_a_decay
        if eps > self.epsilon_final:
            # FIX: clamp so epsilon never undershoots the configured floor
            # (the unclamped subtraction could overshoot on the last drop).
            eps = max(self.epsilon_final, eps - eps_drop)

        if reward_history and every_episode and n_episode % every_episode == 0:
            # Report the performance every `every_episode` episodes.
            print(
                "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}"
                .format(
                    n_episode, step,
                    np.max(reward_history),
                    np.mean(reward_history[-10:]),
                    reward_history[-5:],
                    lr_c, lr_a, eps,
                ))
            # self.save_model(step=step)

    self.save_model(step=step)

    print(
        "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')
def train(self, n_episodes, every_episode):
    """Train the agent with Monte-Carlo policy gradients (REINFORCE).

    Args:
        n_episodes (int): number of episodes to run.
        every_episode (int): report progress every this many episodes.
    """
    step = 0
    episode_reward = 0.
    reward_history = []
    reward_averaged = []
    lr = self.lr

    for n_episode in range(n_episodes):
        ob = self.env.reset()
        done = False

        obs, actions, rewards = [], [], []

        # Roll out one full trajectory under the current policy.
        while not done:
            a = self.act(ob)
            new_ob, r, done, info = self.env.step(a)
            step += 1
            episode_reward += r

            obs.append(self.obs_to_inputs(ob))
            actions.append(a)
            rewards.append(r)
            ob = new_ob

        # One trajectory is complete!
        reward_history.append(episode_reward)
        reward_averaged.append(np.mean(reward_history[-10:]))
        episode_reward = 0.
        lr *= self.lr_decay

        # Discounted returns, accumulated from the end of the episode.
        returns = []
        g = 0.0
        for r in reversed(rewards):
            g = self.gamma * g + r
            returns.append(g)
        returns.reverse()

        _, summ_str = self.sess.run(
            [self.train_ops, self.merged_summary],
            feed_dict={
                self.learning_rate: lr,
                self.states: np.array(obs),
                self.actions: np.array(actions),
                self.returns: np.array(returns),
                self.ep_reward: reward_history[-1],
            })
        self.writer.add_summary(summ_str, step)

        if reward_history and every_episode and n_episode % every_episode == 0:
            # Periodic progress report.
            print(
                "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}".
                format(
                    n_episode, step,
                    np.max(reward_history),
                    np.mean(reward_history[-10:]),
                    reward_history[-5:],
                    lr,
                ))
            # self.save_model(step=step)

    self.save_model(step=step)

    print(
        "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))

    data_dict = {
        'reward': reward_history,
        'reward_smooth10': reward_averaged,
    }
    plot_learning_curve(self.model_name, data_dict, xlabel='episode')