def generate_cam_video(self, sess, time_per_step, global_t, folder,
                       demo_memory_cam, demo_cam_human=False):
    # Use one set of demonstration data to record CAM;
    # the movie for demo data only needs to be made once.
    cam_side_img = self.generate_cam(sess, demo_memory_cam, global_t)

    path = '/frames/demo-cam_side_img'
    if demo_cam_human:
        path += '_human'

    make_movie(
        cam_side_img,
        folder + '{}{ep:010d}'.format(path, ep=global_t),
        duration=len(cam_side_img) * time_per_step,
        true_image=True,
        salience=False)
    del cam_side_img
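# Note on the filename template above: str.format mixes the positional
# '{}' (filled by `path`) with the keyword '{ep:010d}' (zero-padded
# `global_t`). For example:
#
#     '{}{ep:010d}'.format('/frames/demo-cam_side_img', ep=42)
#     # -> '/frames/demo-cam_side_img0000000042'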
def run(self, minutes_limit=5, episode=0, num_episodes=0, demo_type=0,
        model_net=None, replay_memory=None, total_memory=0):
    if self.create_movie:
        movie_images = []

    rewards = {'train': [], 'eval': []}

    full_episode = False
    if minutes_limit == 0:
        full_episode = True
    timeout = 60 * minutes_limit
    t = 0
    total_reward = 0.0

    # re-initialize game for evaluation
    self._reset(replay_memory, hard_reset=True)

    rew = self.game_state.reward
    terminal = False
    lives = self.game_state.lives
    # loss_life = self.game_state.loss_life
    # gain_life = self.game_state.gain_life and not loss_life

    if self.pause_onstart:
        root = Tk()
        root.withdraw()
        messagebox.showinfo(
            self.name,
            "Start episode {} of {}. total memory={}. "
            "Press OK to start playing".format(episode, num_episodes,
                                               total_memory))

    # regular game
    start_time = datetime.datetime.now()
    timeout_start = time.time()

    actions = deque()
    dtm = time.time()
    pulse = 1.0 / self.hertz
    while True:
        # pace the loop at self.hertz steps per second (e.g. 60 Hz)
        dtm += pulse
        delay = dtm - time.time()
        if delay > 0:
            time.sleep(delay)
        else:
            dtm = time.time()

        if not terminal:
            if demo_type == 1:  # RANDOM AGENT
                action = np.random.randint(self.game_state.n_actions)
            elif demo_type == 2:  # MODEL AGENT
                # only query the model on skip boundaries;
                # the previous action repeats in between
                if t % self._skip == 0:
                    self._update_state_input(self.game_state.s_t)
                    readout_t = model_net.evaluate(self.state_input)[0]
                    action = get_action_index(
                        readout_t, is_random=False,
                        n_actions=self.game_state.n_actions)
            else:  # HUMAN
                action = self.game_state.env.human_agent_action

        actions.append(action)
        self.game_state.step(action)
        rew += self.game_state.reward
        lives = self.game_state.lives
        # loss_life = loss_life or self.game_state.loss_life
        # gain_life = (gain_life or self.game_state.gain_life) \
        #     and not loss_life
        total_reward += self.game_state.reward
        t += 1

        if self.create_movie:
            movie_images.append(self.game_state.get_screen_rgb())

        # Ensure that D does not reach max memory, which mitigates
        # problems when combining different human demo files.
        if (replay_memory.size + 3) == replay_memory.max_steps:
            logger.warn("Memory max limit reached!")
            terminal = True
        elif not full_episode:
            terminal = time.time() > timeout_start + timeout

        # add memory every 4th frame even if demo uses skip=1
        if (self.game_state.get_episode_frame_number() % self._skip == 0
                or terminal or self.game_state.terminal):
            # max-pool the two most recent raw frames to remove flicker
            self.obs_buffer[0] = self.game_state.x_t
            self.obs_buffer[1] = self.game_state.x_t1
            max_obs = self.obs_buffer.max(axis=0)
            # cv2.imshow('max obs', max_obs)
            # cv2.imshow('current', self.game_state.x_t1)
            # cv2.waitKey(1)

            # store the transition in D
            replay_memory.add(
                max_obs,
                actions.popleft(),
                rew,
                terminal or self.game_state.terminal,
                lives,
                fullstate=self.game_state.full_state1)
            actions.clear()
            rew = 0

        if terminal or (self.game_state.episode_life
                        and get_wrapper_by_name(
                            self.game_state.env,
                            'EpisodicLifeEnv').was_real_done):
            root = Tk()
            root.withdraw()
            messagebox.showinfo(
                self.name,
                "Time's up!" if terminal else "Game ended!")
            break

        if self.game_state.terminal:
            self._reset(replay_memory, hard_reset=False)
            continue

        self.game_state.update()

    end_time = datetime.datetime.now()
    duration = end_time - start_time
    logger.info("Duration: {}".format(duration))
    logger.info("Total steps: {}".format(t))
    logger.info("Total reward: {}".format(total_reward))
    logger.info("Total replay memory saved: {}".format(replay_memory.size))

    replay_memory.save(name=self.name, folder=self.folder, resize=True)
    if self.create_movie:
        time_per_step = 0.0167
        make_movie(
            movie_images,
            str(self.folder / "demo"),
            duration=len(movie_images) * time_per_step,
            true_image=True,
            salience=False)

    return (total_reward, t, start_time, end_time, duration,
            replay_memory.size)
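# The transition stored in `run` max-pools the two most recent raw frames
# before writing them to the replay memory -- the standard Atari trick for
# removing sprite flicker across skipped frames. A minimal self-contained
# sketch of the same operation (hypothetical `frame_t`/`frame_t1` arrays
# standing in for x_t/x_t1):
#
#     import numpy as np
#
#     frame_t = np.random.randint(0, 256, (84, 84), dtype=np.uint8)
#     frame_t1 = np.random.randint(0, 256, (84, 84), dtype=np.uint8)
#     obs_buffer = np.stack([frame_t, frame_t1])  # shape (2, 84, 84)
#     max_obs = obs_buffer.max(axis=0)            # pixel-wise maximum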
def testing(self, sess, max_steps, global_t, folder, worker=None):
    """Evaluate A3C."""
    assert worker is not None
    assert not worker.is_refresh_thread
    assert not worker.is_sil_thread

    logger.info("Evaluate policy at global_t={}...".format(global_t))

    # copy weights from shared to local
    sess.run(worker.sync)

    episode_buffer = []
    worker.game_state.reset(hard_reset=True)
    episode_buffer.append(worker.game_state.get_screen_rgb())

    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0
    while max_steps > 0:
        state = cv2.resize(worker.game_state.s_t,
                           worker.local_net.in_shape[:-1],
                           interpolation=cv2.INTER_AREA)
        pi_, value_, logits_ = \
            worker.local_net.run_policy_and_value(sess, state)
        # to sample stochastically from the policy instead:
        # action = np.random.choice(range(worker.action_size), p=pi_)
        action = worker.pick_action(logits_)

        # take action
        worker.game_state.step(action)
        terminal = worker.game_state.terminal

        if n_episodes == 0 and global_t % 5000000 == 0:
            episode_buffer.append(worker.game_state.get_screen_rgb())

        episode_reward += worker.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        worker.game_state.update()

        if terminal:
            env = worker.game_state.env
            name = 'EpisodicLifeEnv'
            if get_wrapper_by_name(env, name).was_real_done:
                # make a video every 5M training steps,
                # using the first episode tested
                if n_episodes == 0 and global_t % 5000000 == 0:
                    time_per_step = 0.0167
                    images = np.array(episode_buffer)
                    file = 'frames/image{ep:010d}'.format(ep=global_t)
                    duration = len(images) * time_per_step
                    make_movie(images, str(folder / file),
                               duration=duration,
                               true_image=True, salience=False)
                    episode_buffer = []

                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "yellow")
                steps_str = colored("steps={}".format(episode_steps),
                                    "cyan")
                log_data = (global_t, worker.thread_idx, self.thread_idx,
                            n_episodes, score_str, steps_str, total_steps)
                logger.debug(
                    "test: global_t={} test_worker={} cur_worker={}"
                    " trial={} {} {}"
                    " total_steps={}".format(*log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            worker.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        # average score and steps over the completed evaluation episodes
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (global_t, worker.thread_idx, self.thread_idx,
                total_reward, total_steps, n_episodes)
    logger.info("test: global_t={} test_worker={} cur_worker={}"
                " final score={} final steps={}"
                " # trials={}".format(*log_data))

    worker.record_summary(score=total_reward, steps=total_steps,
                          episodes=n_episodes, global_t=global_t,
                          mode='A3C_Test')

    # reset variables used in training
    worker.episode_reward = 0
    worker.episode_steps = 0
    worker.game_state.reset(hard_reset=True)
    worker.last_rho = 0.

    if worker.use_sil:
        # ensure no states are left over from a non-terminating episode
        worker.episode.reset()

    return (total_reward, total_steps, n_episodes)
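# `get_wrapper_by_name` is used throughout this file to reach the
# EpisodicLifeEnv wrapper and check `was_real_done`. A minimal sketch of
# how such a helper is commonly written for Gym's wrapper chain (this is
# an assumption about the helper, not necessarily this repo's version):
#
#     import gym
#
#     def get_wrapper_by_name(env, classname):
#         currentenv = env
#         while True:
#             if classname in currentenv.__class__.__name__:
#                 return currentenv
#             if isinstance(currentenv, gym.Wrapper):
#                 currentenv = currentenv.env
#             else:
#                 raise ValueError(
#                     "Couldn't find wrapper named {}".format(classname))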
def testing(self, sess, max_steps, global_t, folder, demo_memory_cam=None):
    logger.info("Evaluate policy at global_t={}...".format(global_t))

    # copy weights from shared to local
    sess.run(self.sync)

    if demo_memory_cam is not None and global_t % 5000000 == 0:
        self.generate_cam_video(sess, 0.03, global_t, folder,
                                demo_memory_cam)

    episode_buffer = []
    self.game_state.reset(hard_reset=True)
    episode_buffer.append(self.game_state.get_screen_rgb())

    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0
    while max_steps > 0:
        pi_, value_, logits_ = self.local_network.run_policy_and_value(
            sess, self.game_state.s_t)
        # to sample stochastically from the policy instead:
        # action = np.random.choice(range(self.action_size), p=pi_)
        action = self.choose_action(logits_)

        if self.use_pretrained_model_as_advice:
            psi = self.psi if self.psi > 0.001 else 0.0
            if psi > np.random.rand():
                model_pi = self.pretrained_model.run_policy(
                    self.pretrained_model_sess, self.game_state.s_t)
                model_action, confidence = \
                    self.choose_action_with_high_confidence(
                        model_pi, exclude_noop=False)
                if (model_action > self.shaping_actions
                        and confidence >= self.advice_confidence):
                    action = model_action

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal

        if n_episodes == 0 and global_t % 5000000 == 0:
            episode_buffer.append(self.game_state.get_screen_rgb())

        episode_reward += self.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        self.game_state.update()

        if terminal:
            if get_wrapper_by_name(self.game_state.env,
                                   'EpisodicLifeEnv').was_real_done:
                if n_episodes == 0 and global_t % 5000000 == 0:
                    time_per_step = 0.0167
                    images = np.array(episode_buffer)
                    make_movie(
                        images,
                        folder
                        + '/frames/image{ep:010d}'.format(ep=global_t),
                        duration=len(images) * time_per_step,
                        true_image=True,
                        salience=False)
                    episode_buffer = []

                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(episode_steps),
                                    "blue")
                log_data = (global_t, self.thread_index, n_episodes,
                            score_str, steps_str, total_steps)
                logger.debug(
                    "test: global_t={} worker={} trial={} {} {}"
                    " total_steps={}".format(*log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            self.game_state.reset(hard_reset=False)
            if self.use_lstm:
                self.local_network.reset_state()

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        # average score and steps over the completed evaluation episodes
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (global_t, self.thread_index, total_reward, total_steps,
                n_episodes)
    logger.info(
        "test: global_t={} worker={} final score={} final steps={}"
        " # trials={}".format(*log_data))

    self.record_summary(score=total_reward, steps=total_steps,
                        episodes=n_episodes, global_t=global_t,
                        mode='Test')

    # reset variables used in training
    self.episode_reward = 0
    self.episode_steps = 0
    self.game_state.reset(hard_reset=True)
    self.last_rho = 0.

    if self.is_demo_thread:
        self.replay_mem_reset()

    if self.use_lstm:
        self.local_network.reset_state()

    return total_reward, total_steps, n_episodes
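# The advice branch above mixes a pretrained policy into evaluation: with
# probability psi, the pretrained model's most confident action overrides
# the agent's choice whenever that confidence clears `advice_confidence`.
# A minimal sketch of one plausible confidence-based selection (the real
# `choose_action_with_high_confidence` may differ):
#
#     import numpy as np
#
#     def choose_action_with_high_confidence(model_pi, exclude_noop=False):
#         start = 1 if exclude_noop else 0  # index 0 is NOOP in ALE
#         action = start + int(np.argmax(model_pi[start:]))
#         confidence = float(model_pi[action])
#         return action, confidence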
def testing_model(self, sess, max_steps, global_t, folder,
                  demo_memory_cam=None, demo_cam_human=False):
    logger.info("Testing model at global_t={}...".format(global_t))

    # copy weights from shared to local
    sess.run(self.sync)

    if demo_memory_cam is not None:
        self.generate_cam_video(sess, 0.03, global_t, folder,
                                demo_memory_cam, demo_cam_human)
        return

    self.game_state.reset(hard_reset=True)
    max_steps += 4
    test_memory = ReplayMemory(
        84, 84,
        np.random.RandomState(),
        max_steps=max_steps,
        phi_length=4,
        num_actions=self.game_state.env.action_space.n,
        wrap_memory=False,
        full_state_size=self.game_state.clone_full_state().shape[0])
    # pad with dummy transitions so the first state has a full history
    for _ in range(4):
        test_memory.add(
            self.game_state.x_t,
            0,
            self.game_state.reward,
            self.game_state.terminal,
            self.game_state.lives,
            fullstate=self.game_state.full_state)

    episode_buffer = []
    test_memory_cam = []

    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0
    terminal = False
    while True:
        test_memory_cam.append(self.game_state.s_t)
        episode_buffer.append(self.game_state.get_screen_rgb())
        pi_, value_, logits_ = self.local_network.run_policy_and_value(
            sess, self.game_state.s_t)
        # act greedily; to sample instead:
        # action = self.choose_action(logits_)
        action = np.argmax(pi_)

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal
        # stop one transition before the padded memory fills up
        memory_full = episode_steps == max_steps - 5
        terminal_ = terminal or memory_full

        # store the transition in the test replay memory
        test_memory.add(
            self.game_state.x_t1,
            action,
            self.game_state.reward,
            terminal_,
            self.game_state.lives,
            fullstate=self.game_state.full_state1)

        # update the old values
        episode_reward += self.game_state.reward
        episode_steps += 1

        # s_t = s_t1
        self.game_state.update()

        if terminal_:
            if (get_wrapper_by_name(self.game_state.env,
                                    'EpisodicLifeEnv').was_real_done
                    or memory_full):
                time_per_step = 0.03
                images = np.array(episode_buffer)
                make_movie(
                    images,
                    folder + '/frames/image{ep:010d}'.format(ep=global_t),
                    duration=len(images) * time_per_step,
                    true_image=True,
                    salience=False)
                break

            self.game_state.reset(hard_reset=False)
            if self.use_lstm:
                self.local_network.reset_state()

    total_reward = episode_reward
    total_steps = episode_steps
    log_data = (global_t, self.thread_index, total_reward, total_steps)
    logger.info(
        "test: global_t={} worker={} final score={} final steps={}"
        .format(*log_data))

    self.generate_cam_video(sess, 0.03, global_t, folder,
                            np.array(test_memory_cam))
    test_memory.save(name='test_cam', folder=folder, resize=True)

    if self.use_lstm:
        self.local_network.reset_state()

    return
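# `testing_model` pre-fills the fresh ReplayMemory with four dummy
# transitions so that the very first real state already has a full
# phi_length=4 frame history. A minimal sketch of how a stacked state is
# typically assembled from such a buffer (hypothetical `frames` list
# standing in for the stored observations):
#
#     import numpy as np
#
#     phi_length = 4
#     frames = [np.zeros((84, 84), dtype=np.uint8)] * phi_length
#     s_t = np.stack(frames[-phi_length:], axis=-1)  # shape (84, 84, 4)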
def test(self, render=False):
    logger.info("Evaluate policy at global_t={}...".format(self.global_t))
    episode_buffer = []
    self.game_state.reset(hard_reset=True)
    episode_buffer.append(self.game_state.get_screen_rgb())

    max_steps = self.eval_max_steps
    total_reward = 0
    total_steps = 0
    episode_reward = 0
    episode_steps = 0
    n_episodes = 0

    # Use one set of demonstration data to record CAM;
    # the movie for demo data only needs to be made once.
    # if self.global_t == 0:
    cam, state, action = self.calculate_cam(self.test_cam_si)
    cam_plus_img = []
    cam_side_img = []

    for i in range(len(cam)):
        # overlay cam on state
        overlay = np.uint8(cam[i]).copy()
        output = np.uint8(state[i]).copy()
        alpha = 0.3
        cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)

        # create a title space for the action label
        title_space = np.zeros((20, 84, 3), np.uint8)
        title_space[:] = (255, 255, 255)
        cv2.putText(title_space, "{}".format(ACTION_MEANING[action[i]]),
                    (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)

        # concatenate the title and the state
        vcat_output = cv2.vconcat((title_space, output))
        cam_plus_img.append(vcat_output)

        # side-by-side cam and state
        hcat_cam_state = cv2.hconcat(
            (np.uint8(cam[i]).copy(), np.uint8(state[i]).copy()))
        title_space = np.zeros((20, 84 * 2, 3), np.uint8)
        title_space[:] = (255, 255, 255)
        vcat_title_camstate = cv2.vconcat((title_space, hcat_cam_state))
        cv2.putText(vcat_title_camstate,
                    "{}".format(ACTION_MEANING[action[i]]),
                    (20, 14), cv2.FONT_HERSHEY_DUPLEX, .4, (0, 0, 0), 1)
        cam_side_img.append(vcat_title_camstate)

    time_per_step = 0.0167
    make_movie(
        cam_plus_img,
        self.folder
        + '/frames/demo-cam_plus_img{ep:010d}'.format(ep=self.global_t),
        duration=len(cam) * time_per_step,
        true_image=True,
        salience=False)
    make_movie(
        cam_side_img,
        self.folder
        + '/frames/demo-cam_side_img{ep:010d}'.format(ep=self.global_t),
        duration=len(state) * time_per_step,
        true_image=True,
        salience=False)
    del cam, state, action, cam_plus_img, cam_side_img

    while max_steps > 0:
        readout_t = self.net.evaluate(self.game_state.s_t)[0]
        action = get_action_index(
            readout_t,
            is_random=(random.random() <= 0.05),
            n_actions=self.game_state.env.action_space.n)

        # take action
        self.game_state.step(action)
        terminal = self.game_state.terminal

        if n_episodes == 0 and self.global_t % 2000000 == 0:
            episode_buffer.append(self.game_state.get_screen_rgb())

        episode_reward += self.game_state.reward
        episode_steps += 1
        max_steps -= 1

        # s_t = s_t1
        self.game_state.update()

        if terminal:
            if get_wrapper_by_name(self.game_state.env,
                                   'EpisodicLifeEnv').was_real_done:
                if n_episodes == 0 and self.global_t % 2000000 == 0:
                    time_per_step = 0.0167
                    images = np.array(episode_buffer)
                    make_movie(
                        images,
                        self.folder + '/frames/image{ep:010d}'.format(
                            ep=self.global_t),
                        duration=len(images) * time_per_step,
                        true_image=True,
                        salience=False)
                    episode_buffer = []

                n_episodes += 1
                score_str = colored("score={}".format(episode_reward),
                                    "magenta")
                steps_str = colored("steps={}".format(episode_steps),
                                    "blue")
                log_data = (self.global_t, n_episodes, score_str,
                            steps_str, total_steps)
                logger.debug(
                    "test: global_t={} trial={} {} {} total_steps={}"
                    .format(*log_data))
                total_reward += episode_reward
                total_steps += episode_steps
                episode_reward = 0
                episode_steps = 0

            self.game_state.reset(hard_reset=False)

    if n_episodes == 0:
        total_reward = episode_reward
        total_steps = episode_steps
    else:
        # average score and steps over the completed evaluation episodes
        total_reward = total_reward / n_episodes
        total_steps = total_steps // n_episodes

    log_data = (self.global_t, total_reward, total_steps, n_episodes)
    logger.debug(
        "test: global_t={} final score={} final steps={} # episodes={}"
        .format(*log_data))

    self.net.record_summary(
        score=total_reward, steps=total_steps, episodes=n_episodes,
        global_t=self.global_t, mode='Test')

    self.rewards['eval'][self.global_t] = (total_reward, total_steps,
                                           n_episodes)
    return total_reward, total_steps, n_episodes
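# `get_action_index` with `is_random=(random.random() <= 0.05)` is the
# usual 5%-epsilon-greedy evaluation rule. A minimal standalone sketch of
# that rule (the repo's helper takes a precomputed `is_random` flag
# instead of an epsilon parameter):
#
#     import random
#     import numpy as np
#
#     def epsilon_greedy(q_values, n_actions, epsilon=0.05):
#         if random.random() <= epsilon:
#             return random.randrange(n_actions)  # explore
#         return int(np.argmax(q_values))         # exploit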