def evaluate(self, env=None, num_episodes=None):
    """
    Evaluation with same procedure as the training
    """
    # log our activity only if default call
    if num_episodes is None:
        self.logger.info("Evaluating...")

    # arguments defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env
    env.state.is_render_image = self.config.render_test

    # replay memory to play
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history)
    rewards = []

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        goal_state = env.teacher.goal_obs_onehot_state  # h x w x c
        h_state = (np.zeros([1, self.config.h_size]),
                   np.zeros([1, self.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0
        for j in range(50):
            if self.config.render_test:
                env.render()

            # store last state in replay buffer
            idx = replay_buffer.store_frame(state, goal_state)
            q_input = replay_buffer.encode_recent_observation()

            action, action_q, h_state = self.get_action(
                [q_input], goal_state[None][None], h_state, slen, [action])

            # perform action in env
            new_state, reward, done = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes > 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
            avg_reward, sigma_reward)
        self.logger.info(msg)

    return avg_reward
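
# Hedged usage sketch (not part of the original code): `evaluate` can also be
# run standalone on a held-out environment. `agent` and `test_env` are
# hypothetical names for an instance of this class and a compatible
# environment; the keyword arguments mirror the signature of `evaluate` above.
def _example_evaluate(agent, test_env, n_episodes=20):
    """Illustrative only: run a standalone evaluation and return the average reward."""
    # Passing an explicit env overrides agent.env, and passing num_episodes
    # overrides config.num_episodes_test (see the defaults handling above).
    return agent.evaluate(env=test_env, num_episodes=n_episodes)
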
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    #scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()  # h x w x c
        goal_state = self.env.teacher.goal_obs_onehot_state  # h x w x c
        h_state = (np.zeros([1, self.config.h_size]),
                   np.zeros([1, self.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0
        for i in range(200):
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # store last state in replay buffer
            idx = replay_buffer.store_frame(state, goal_state)
            q_input = replay_buffer.encode_recent_observation()

            # choose action according to current Q and exploration
            # (use q_vals locally so the q_values deque is not shadowed)
            best_action, q_vals, h_state = self.get_best_action(
                [q_input], goal_state[None][None], h_state, slen, [action])
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values,
                                     scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1,
                                exact=[("Loss", loss_eval),
                                       ("Avg R", self.avg_reward),
                                       ("Max R", np.max(rewards)),
                                       ("eps", exp_schedule.epsilon),
                                       ("Grads", grad_eval),
                                       ("Max Q", self.max_q),
                                       ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (
                    t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (
                last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate()]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
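
# Hedged wiring sketch (assumption, not from the original repo): `train` only
# relies on a small interface from its schedule arguments, namely
# exp_schedule.get_action / .update / .epsilon and lr_schedule.update /
# .epsilon. The minimal stand-in classes below implement exactly that
# interface; the real project may ship its own schedule implementations.
import random


class _ConstantLRSchedule(object):
    """Illustrative only: a fixed learning rate exposed as `epsilon`."""

    def __init__(self, lr=1e-4):
        self.epsilon = lr

    def update(self, t):
        # A real schedule would anneal the rate with the step count t.
        pass


class _EpsilonGreedyExploration(object):
    """Illustrative only: epsilon-greedy exploration with a fixed epsilon."""

    def __init__(self, num_actions, eps=0.1):
        self.num_actions = num_actions
        self.epsilon = eps

    def get_action(self, best_action):
        # With probability epsilon take a random action, otherwise the greedy one.
        if random.random() < self.epsilon:
            return random.randrange(self.num_actions)
        return best_action

    def update(self, t):
        # A real schedule would decay epsilon over the first training steps.
        pass


# Example wiring, assuming `agent` is an instance of this class and the action
# count below is a placeholder:
#     exp_schedule = _EpsilonGreedyExploration(num_actions=4, eps=0.1)
#     lr_schedule = _ConstantLRSchedule(lr=1e-4)
#     agent.train(exp_schedule, lr_schedule)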