import os
import sys
import time
import pickle
from copy import deepcopy

import cv2
import numpy as np
import matplotlib.pyplot as plt
from skimage.transform import resize
from skvideo.io import vwrite
from IPython import embed

sys.path.append('../agents')
from replay import ReplayMemory

# module-level RNG used by make_subset_buffer; the original seed is not
# recorded here
random_state = np.random.RandomState()


def make_subset_buffer(buffer_path, max_examples=100000, frame_height=40, frame_width=40):
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to 40x40 of the given image
    # try to get an even number of each type of reward
    small_path = buffer_path.replace('.npz', '_%06d.npz' % max_examples)
    if os.path.exists(small_path):
        print('loading small buffer path')
        print(small_path)
        load_buffer = ReplayMemory(load_file=small_path)
    else:
        load_buffer = ReplayMemory(load_file=buffer_path)
        print('loading prescribed buffer path')
        print(buffer_path)

    # TODO if frame size is wrong - we arent handling it
    if load_buffer.count > max_examples:
        print('creating small buffer')
        # actions for breakout:
        # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        sbuffer = ReplayMemory(max_examples,
                               frame_height=frame_height,
                               frame_width=frame_width,
                               agent_history_length=load_buffer.agent_history_length)
        # drop the first and last terminals because they may belong to
        # incomplete episodes
        ends = np.where(load_buffer.terminal_flags == 1)[0][1:-1]
        random_state.shuffle(ends)
        for tidx in ends:
            if sbuffer.count >= max_examples:
                print('stopping after %s examples' % sbuffer.count)
                break
            # start after the last terminal
            i = tidx + 1
            # add frames until the next terminal flag
            while not load_buffer.terminal_flags[i + 1]:
                frame = cv2.resize(load_buffer.frames[i][:, :, None],
                                   (frame_height, frame_width))
                sbuffer.add_experience(action=load_buffer.actions[i],
                                       frame=frame,
                                       reward=load_buffer.rewards[i],
                                       terminal=load_buffer.terminal_flags[i])
                i += 1
                if not i % 100:
                    print(sbuffer.count)
        sbuffer.save_buffer(small_path)
        load_buffer = sbuffer
    assert load_buffer.count > 10
    return load_buffer, small_path
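
# Usage sketch for make_subset_buffer. The .npz path below is hypothetical;
# any buffer previously written by ReplayMemory.save_buffer should work. The
# subset is written next to the source buffer as '<name>_050000.npz' and is
# reloaded directly on subsequent calls.
#
#   small_buffer, small_path = make_subset_buffer(
#       'results/breakout_replay.npz', max_examples=50000,
#       frame_height=40, frame_width=40)
#   assert small_buffer.count <= 50000
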
class StateManager():
    def __init__(self):
        self.reward_space = [-1, 0, 1]
        self.latent_representation_function = None

    def create_new_state_instance(self, config_handler, phase):
        self.ch = config_handler
        self.save_time = time.time() - 100000
        self.phase = phase
        self.step_number = 0
        self.end_step_number = -1
        self.episode_number = 0
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.random_state = np.random.RandomState(self.seed)
        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.episodic_reward = []
        self.episodic_reward_avg = []
        self.episodic_step_count = []
        self.episodic_step_ends = []
        self.episodic_loss = []
        self.episodic_times = []
        self.episodic_eps = []
        self.env = self.ch.create_environment(self.seed)
        self.memory_buffer = self.ch.load_memory_buffer(self.phase)
        # TODO should you load the count from the memory buffer - ?
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def setup_eps(self):
        if self.phase == 'train':
            self.eps_init = self.ch.cfg['DQN']['eps_init']
            self.eps_final = self.ch.cfg['DQN']['eps_final']
            self.eps_annealing_steps = self.ch.cfg['DQN']['eps_annealing_steps']
            self.last_annealing_step = self.eps_annealing_steps + \
                self.ch.cfg['DQN']['num_pure_random_steps_train']
            if self.eps_annealing_steps > 0:
                self.slope = -(self.eps_init - self.eps_final) / self.eps_annealing_steps
                self.intercept = self.eps_init - self.slope * \
                    self.ch.cfg['DQN']['num_pure_random_steps_train']

    def load_checkpoint(self, filepath, config_handler=''):
        # load previously saved state file
        fh = open(filepath, 'rb')
        fdict = pickle.load(fh)
        fh.close()
        if config_handler != '':
            # use given config handler instead of the pickled one
            del fdict['ch']
            self.ch = config_handler
        self.__dict__.update(fdict)
        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.random_state = np.random.RandomState()
        self.random_state.set_state(fdict['state_random_state'])
        # TODO NOTE this does not restart at same env state
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.env = self.ch.create_environment(self.seed)
        buffer_path = filepath.replace('.pkl', '.npz')
        self.memory_buffer = ReplayMemory(load_file=buffer_path)
        # TODO should you load the count from the memory buffer - ?
        # TODO what about episode number - it will be off now
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def save_checkpoint(self, checkpoint_basepath):
        # always use the training step number as the reference point
        self.save_time = time.time()
        self.plot_progress(checkpoint_basepath)
        # TODO save this class - except for random state i assume
        self.memory_buffer.save_buffer(checkpoint_basepath + '.npz')
        # TOO big - prob need to save specifics
        # preserve random state
        self.state_random_state = self.random_state.get_state()
        save_dict = {
            'episodic_reward': self.episodic_reward,
            'episodic_reward_avg': self.episodic_reward_avg,
            'episodic_step_count': self.episodic_step_count,
            'episodic_step_ends': self.episodic_step_ends,
            'episodic_loss': self.episodic_loss,
            'episodic_times': self.episodic_times,
            'state_random_state': self.state_random_state,
            'episode_number': self.episode_number,
            'step_number': self.step_number,
            'phase': self.phase,
            'save_time': self.save_time,
            'ch': self.ch,
            'episodic_eps': self.episodic_eps,
        }
        fh = open(checkpoint_basepath + '.pkl', 'wb')
        pickle.dump(save_dict, fh)
        fh.close()
        print('finished pickle in', time.time() - self.save_time)

    def end_episode(self):
        # catalog episode statistics
        self.end_time = time.time()
        self.end_step_number = deepcopy(self.step_number)
        # add to lists
        self.episodic_reward.append(np.sum(self.episode_rewards))
        self.episodic_step_count.append(self.end_step_number - self.start_step_number)
        self.episodic_step_ends.append(self.end_step_number)
        self.episodic_loss.append(np.mean(self.episode_losses))
        self.episodic_times.append(self.end_time - self.start_time)
        try:
            self.episodic_eps.append(self.eps)
        except AttributeError:
            # older checkpoints did not track eps
            self.episodic_eps = [1.0 for x in range(len(self.episodic_times))]
        # reward smoothed over the last num_prev_steps_avg episodes
        self.episodic_reward_avg.append(np.mean(
            self.episodic_reward[-self.ch.cfg['PLOT']['num_prev_steps_avg']:]))
        num_steps = self.episodic_step_count[-1]
        print("*** %s E%05d S%010d AH%s-R%s num random/total steps:%s/%s***" % (
            self.phase, self.episode_number, self.step_number, self.active_head,
            self.episodic_reward[-1], self.num_random_steps, num_steps))
        self.episode_active = False
        self.episode_number += 1

    def start_episode(self):
        self.start_time = time.time()
        self.random_state.shuffle(self.heads)
        self.active_head = self.heads[0]
        self.end_step_number = -1
        self.episode_losses = []
        self.episode_actions = []
        self.episode_rewards = []
        self.start_step_number = deepcopy(self.step_number)
        self.num_random_steps = 0
        # restart counters
        self.terminal = False
        self.life_lost = True
        self.episode_reward = 0
        state = self.env.reset()
        self.prev_action = 0
        self.prev_reward = 0
        for i in range(state.shape[0] + 1):
            # add enough memories to be able to sample from the memory buffer
            # not sure if this is correct
            self.memory_buffer.add_experience(
                action=0,
                frame=state[-1],  # use last frame in state bc it is only nonzero one
                reward=0,
                terminal=0,
                end=0,
            )
        # get correctly formatted last state
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        self.state = batch[0][0]
        if self.state.shape != (self.ch.num_prev_steps,
                                self.memory_buffer.agent_history_length,
                                self.memory_buffer.frame_height,
                                self.memory_buffer.frame_width):
            print("start shape wrong")
            embed()
        self.episode_active = True
        return self.state

    def plot_current_episode(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        plot_dict = {
            'mean loss': self.episode_losses,
            'actions': self.episode_actions,
            'rewards': self.episode_rewards,
        }
        suptitle = 'E%s S%s-%s R%s' % (self.episode_number, self.start_step_number,
                                       self.end_step_number, self.episodic_reward[-1])
        plot_path = plot_basepath + '_ep%06d.png' % self.episode_number
        #step_range = np.arange(self.start_step_number, self.end_step_number)
        #self.plot_data(plot_path, plot_dict, suptitle, xname='episode steps', xdata=step_range)
        self.plot_data(plot_path, plot_dict, suptitle, xname='episode steps')
        ep_steps = self.end_step_number - self.start_step_number
        self.plot_histogram(plot_basepath + '_ep_histrewards_%06d.png' % self.episode_number,
                            data=self.episode_rewards,
                            bins=self.reward_space,
                            title='rewards TR%s' % self.episode_reward)
        self.plot_histogram(plot_basepath + '_ep_histactions_%06d.png' % self.episode_number,
                            data=self.episode_actions,
                            bins=self.env.action_space,
                            title='actions acthead:%s nrand:%s/%s' % (
                                self.active_head, self.num_random_steps, ep_steps))

    def plot_last_episode(self):
        ep_steps = self.end_step_number - self.start_step_number
        (ep_states, ep_actions, ep_rewards, ep_next_states, ep_terminals,
         ep_masks, indexes) = self.memory_buffer.get_last_n_states(ep_steps)
        plot_basepath = self.get_plot_basepath() + '_episode_states_frames'
        self.plot_episode_movie(plot_basepath, ep_states, ep_actions, ep_rewards,
                                ep_next_states, ep_terminals, ep_masks, indexes)

    def plot_episode_movie(self, plot_basepath, states, actions, rewards,
                           next_states, terminals, masks, indexes):
        if not os.path.exists(plot_basepath):
            os.makedirs(plot_basepath)
        n_steps = states.shape[0]
        print('plotting episode of length %s' % n_steps)
        if self.latent_representation_function is None:
            n_cols = 2
        else:
            pred_next_states, zs, latents = self.latent_representation_function(
                states, actions, rewards, self.ch)
            n_cols = 4
            latent_image_path = os.path.join(plot_basepath, 'latent_step_%05d.png')
        ep_reward = sum(rewards)
        movie_path = plot_basepath + '_movie_R%04d.mp4' % ep_reward
        print('starting to make movie', movie_path)
        # alternative: write frame by frame then use ffmpeg to generate the movie
        #image_path = os.path.join(plot_basepath, 'step_%05d.png')
        #w_path = plot_basepath+'_write_movie_R%04d.sh'%ep_reward
        #a = open(w_path, 'w')
        #cmd = "ffmpeg -n -r 30 -i %s -c:v libx264 -pix_fmt yuv420p %s"%(os.path.abspath(image_path),os.path.abspath(movie_path))
        #a.write(cmd)
        #a.close()
        #w,h = states[0,3].shape
        #treward = 0
        #for step in range(min(n_steps, 100)):
        #    f, ax = plt.subplots(1, n_cols)
        #    if not step%20:
        #        print('plotting step', step)
        #    ax[0].imshow(states[step, 3], cmap=plt.cm.gray)
        #    #ax[0].set_title('OS-A%s' %(actions[step]))
        #    ax[1].imshow(next_states[step, 3], cmap=plt.cm.gray)
        #    treward+=rewards[step]
        #    if self.latent_representation_function != None:
        #        ax[2].imshow(pred_next_states[step], cmap=plt.cm.gray)
        #        z = np.hstack((zs[step,0], zs[step,1], zs[step,2]))
        #        ax[3].imshow(z)
        #    for aa in range(n_cols):
        #        ax[aa].set_xticks([])
        #        ax[aa].set_yticks([])
        #    f.suptitle('%sA%sR%sT%sD%s'%(step, actions[step], rewards[step], treward, int(terminals[step])))
        #    plt.savefig(image_path%step)
        #    plt.close()
        # generate movie directly
        max_frames = 5000
        n = min(n_steps, max_frames)
        for step in range(n):
            if self.latent_representation_function is not None:
                z = np.hstack((zs[step, 0], zs[step, 1], zs[step, 2]))
                zo = resize(z, (84, 84), cval=0, order=0)
                # TODO - is imwrite clipping zo since it is not a uint8?
                img = np.hstack((states[step, 3], next_states[step, 3],
                                 pred_next_states[step], zo))
            else:
                img = np.hstack((states[step, 3], next_states[step, 3]))
            if not step:
                movie = np.zeros((n, img.shape[0], img.shape[1]))
                if self.latent_representation_function is not None:
                    latent_movie = np.zeros((n, z.shape[0], z.shape[1]))
            movie[step] = img
            if self.latent_representation_function is not None:
                latent_movie[step] = z
        vwrite(movie_path, movie)

    def plot_histogram(self, plot_path, data, bins, title=''):
        n, bins, _ = plt.hist(data, bins=bins)
        plt.xticks(bins, bins)
        plt.yticks(n, n)
        plt.xlim(min(bins), max(bins) + 1)
        plt.title(title)
        plt.savefig(plot_path)
        plt.close()

    def plot_progress(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        det_plot_dict = {
            'episodic step count': self.episodic_step_count,
            'episodic time': self.episodic_times,
            'mean episodic loss': self.episodic_loss,
            'eps': self.episodic_eps,
        }
        suptitle = 'Details E%s S%s' % (self.episode_number, self.end_step_number)
        edet_plot_path = plot_basepath + '_details_episodes.png'
        sdet_plot_path = plot_basepath + '_details_steps.png'
        if self.end_step_number > 1:
            #exdata = np.arange(self.episode_number)
            #self.plot_data(edet_plot_path, det_plot_dict, suptitle, xname='episode', xdata=exdata)
            #self.plot_data(sdet_plot_path, det_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
            self.plot_data(edet_plot_path, det_plot_dict, suptitle, xname='episode')
            self.plot_data(sdet_plot_path, det_plot_dict, suptitle, xname='steps',
                           xdata=self.episodic_step_ends)
            rew_plot_dict = {
                'episodic reward': self.episodic_reward,
                'smooth episodic reward': self.episodic_reward_avg,
            }
            suptitle = 'Reward E%s S%s R%s' % (self.episode_number, self.end_step_number,
                                               self.episodic_reward[-1])
            erew_plot_path = plot_basepath + '_reward_episodes.png'
            srew_plot_path = plot_basepath + '_reward_steps.png'
            #self.plot_data(erew_plot_path, rew_plot_dict, suptitle, xname='episode', xdata=np.arange(self.episode_number))
            #self.plot_data(srew_plot_path, rew_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
            self.plot_data(erew_plot_path, rew_plot_dict, suptitle, xname='episode')
            self.plot_data(srew_plot_path, rew_plot_dict, suptitle, xname='steps',
                           xdata=self.episodic_step_ends)

    def plot_data(self, savepath, plot_dict, suptitle, xname, xdata=None):
        st = time.time()
        print('starting plot data')
        n = len(plot_dict.keys())
        f, ax = plt.subplots(n, 1, figsize=(6, 3 * n))
        try:
            for xx, name in enumerate(sorted(plot_dict.keys())):
                if xdata is not None:
                    ax[xx].plot(xdata, plot_dict[name])
                else:
                    ax[xx].plot(plot_dict[name])
                ax[xx].set_title('%s' % (name))
                ax[xx].set_ylabel(name)
                print(name, xname, time.time() - st)
                ax[xx].set_xlabel(xname)
            f.suptitle('%s %s' % (self.phase, suptitle))
            print('end sup', time.time() - st)
            f.savefig(savepath)
            print("saved: %s" % savepath)
            plt.close()
            print('finished')
        except Exception:
            print("plot")
            embed()

    def get_plot_basepath(self):
        return self.ch.get_checkpoint_basepath(self.step_number) + '_%s' % self.phase

    def handle_plotting(self, plot_basepath='', force_plot=False):
        # will plot at beginning of episode
        #if not self.episode_number % self.ch.cfg['PLOT']['plot_episode_every_%s_episodes'%self.phase]:
        # dont plot first episode
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        if self.episode_number:
            if force_plot:
                self.plot_current_episode(plot_basepath)
                self.plot_progress(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % self.ch.cfg[
                    'PLOT']['plot_episode_every_%s_episodes' % self.phase]:
                self.plot_current_episode(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % self.ch.cfg[
                    'PLOT']['plot_every_%s_episodes' % self.phase]:
                self.plot_progress(plot_basepath)

    def step(self, action):
        next_state, reward, self.life_lost, self.terminal = self.env.step(action)
        self.prev_action = action
        self.prev_reward = np.sign(reward)
        # the replay buffer will convert the observed state as needed
        self.memory_buffer.add_experience(
            action=action,
            frame=next_state[-1],
            reward=self.prev_reward,
            terminal=self.life_lost,
            end=self.terminal,
        )
        self.episode_actions.append(self.prev_action)
        self.episode_rewards.append(self.prev_reward)
        self.step_number += 1
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        self.state = batch[0][0]
        #self.state = self.memory_buffer.get_last_state()
        if self.state.shape[1] == 0:
            print('handler state chan 0')
            embed()

    def set_eps(self):
        # TODO function to find eps - for now use constant
        if self.step_number <= self.ch.cfg['DQN']['num_pure_random_steps_%s' % self.phase]:
            # stay fully random during the warmup period in either phase
            self.eps = 1.0
            return
        if self.phase == 'train':
            self.eps = self.eps_final
            if self.step_number < self.last_annealing_step:
                self.eps = self.slope * self.step_number + self.intercept
        else:
            self.eps = self.ch.cfg['EVAL']['eps_eval']

    def random_action(self):
        self.num_random_steps += 1
        # pass action_idx to env.action_space
        return self.random_state.choice(range(self.env.num_actions))

    def is_random_action(self):
        self.set_eps()
        return self.random_state.rand() < self.eps
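
# Usage sketch for StateManager. `ch` stands in for the project's config
# handler (its create_environment/load_memory_buffer/get_checkpoint_basepath/
# cfg interface is assumed from the calls above), and `policy` is a
# hypothetical greedy-action function; neither is defined in this file.
#
#   sm = StateManager()
#   sm.create_new_state_instance(ch, phase='train')
#   state = sm.start_episode()
#   while not sm.terminal:
#       if sm.is_random_action():
#           action = sm.random_action()
#       else:
#           action = policy(sm.state, sm.active_head)
#       sm.step(action)
#   sm.end_episode()
#   sm.handle_plotting()
#   sm.save_checkpoint(sm.ch.get_checkpoint_basepath(sm.step_number))
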
def make_random_subset_buffers(dataset_path, buffer_path, train_max_examples=100000,
                               kernel_size=(2, 2), trim_before=0, trim_after=0):
    sys.path.append('../agents')
    from replay import ReplayMemory
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to 40x40 of the given image
    # try to get an even number of each type of reward
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    buffer_name = os.path.split(buffer_path)[1]
    buffers = {}
    paths = {}
    for phase in ['valid', 'train']:
        if phase == 'valid':
            max_examples = int(0.15 * train_max_examples)
        else:
            max_examples = train_max_examples
        small_name = buffer_name.replace(
            '.npz', '_random_subset_%06d_%sx%stb%sta%s_%s.npz' % (
                max_examples, kernel_size[0], kernel_size[1],
                trim_before, trim_after, phase))
        small_path = os.path.join(dataset_path, small_name)
        paths[phase] = small_path
        if os.path.exists(small_path):
            print('loading small buffer path')
            print(small_path)
            sbuffer = ReplayMemory(load_file=small_path)
            sbuffer.init_unique()
            buffers[phase] = sbuffer
    # if we dont have both train and valid - make a completely new train/valid set
    if not len(buffers.keys()) == 2:
        print('creating new train/valid buffers')
        load_buffer = ReplayMemory(load_file=buffer_path)
        # plot a sample of original vs downsampled frames as a sanity check
        orig_states = []
        small_states = []
        for index in range(10, 400):
            if load_buffer.is_valid_index(index):
                s, _ = load_buffer._get_state(index)
                orig_states.append(s[-1])
                small_states.append(load_buffer.online_shrink_frame_size(
                    s[-1], trim_before, kernel_size, trim_after))
        bdir = small_path.replace('.npz', '')
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        image_path = os.path.join(bdir, 'step_%03d.png')
        movie_path = os.path.join(bdir, 'movie.mp4')
        for index in range(len(orig_states)):
            f, ax = plt.subplots(1, 2)
            ax[0].matshow(orig_states[index])
            ax[1].matshow(small_states[index])
            plt.savefig(image_path % index)
            plt.close()
        cmd = "ffmpeg -n -r 10 -i %s -c:v libx264 -pix_fmt yuv420p %s" % (
            os.path.abspath(image_path), os.path.abspath(movie_path))
        print(cmd)
        os.system(cmd)
        if max(list(kernel_size) + [trim_before, trim_after]) > 1:
            load_buffer.shrink_frame_size(kernel_size=kernel_size,
                                          reduction_function=np.max,
                                          trim_before=trim_before,
                                          trim_after=trim_after)
        #for r in range(states.shape[0]):
        #    imwrite('mp%s.png'%r, states[r,-1])
        load_buffer.reset_unique()
        # history_length + 1 frames are stored for every random example
        frame_multiplier = (load_buffer.agent_history_length + 1)
        #frame_multiplier = 2
        total_frames_needed = int((max_examples * 1.15) * frame_multiplier) + 1
        # not sure why we weren't allowing overlapping frames
        #total_frames_needed = int((max_examples*1.15))
        if load_buffer.count < total_frames_needed:
            raise ValueError(
                'load buffer is not large enough (%s) to collect number of examples (%s)'
                % (load_buffer.count, total_frames_needed))
        print('loading prescribed buffer path.... this may take a while')
        print(buffer_path)
        for phase in ['valid', 'train']:
            if phase == 'valid':
                max_examples = int(0.15 * train_max_examples)
            else:
                max_examples = train_max_examples
            print('creating small %s buffer with %s examples' % (phase, max_examples))
            # actions for breakout:
            # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
            frames_needed = max_examples * frame_multiplier
            sbuffer = ReplayMemory(frames_needed,
                                   frame_height=load_buffer.frame_height,
                                   frame_width=load_buffer.frame_width,
                                   agent_history_length=load_buffer.agent_history_length)
            num_examples = 0
            while num_examples < max_examples:
                batch = load_buffer.get_unique_minibatch(1)
                (states, actions, rewards, next_states, real_terminal_flags,
                 _, unique_indices, index_indices) = batch
                bs, num_hist, h, w = states.shape
                # action is the action that was used to get from state to next state
                # t-4, t-3, t-2, t-1, t
                # s-4, s-3, s-2, s-1
                # s-3, s-2, s-1, s
                past_indices = np.arange(unique_indices - num_hist, unique_indices + 1)
                for batch_idx in range(bs):
                    # get t-4 thru t=0 - size is bs,5,h,w
                    all_states = np.hstack((states[:, 0:1], next_states))
                    for ss in range(num_hist + 1):
                        # only use batch size 1 in minibatch
                        # frame is "next state" in replay buffer
                        frame = all_states[batch_idx, ss]
                        action = load_buffer.actions[past_indices[ss]]
                        reward = load_buffer.rewards[past_indices[ss]]
                        if ss == num_hist:
                            # this is the observed state and the only one we will
                            # use a true action/reward for
                            #action = actions[batch_idx]
                            #reward = rewards[batch_idx]
                            terminal_flag = True
                            end_flag = True
                            num_examples += 1
                            if not num_examples % 5000:
                                print('added %s examples to %s buffer' % (num_examples, phase))
                        else:
                            # use this to debug and assert that all actions/rewards
                            # in sampled minibatch of sbuffer are < 99
                            terminal_flag = False
                            end_flag = False
                        sbuffer.add_experience(action, frame, reward, terminal_flag, end_flag)
            sbuffer.rewards = sbuffer.rewards.astype(np.int32)
            sbuffer.init_unique()
            sbuffer.save_buffer(paths[phase])
            buffers[phase] = sbuffer
    return buffers, paths
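
# Usage sketch for make_random_subset_buffers. Paths are hypothetical. With the
# defaults above, each stored frame is max-pooled with a 2x2 kernel, the valid
# buffer gets 15% of train_max_examples, and both subsets are written under
# dataset_path with their size and pooling parameters encoded in the filename.
#
#   buffers, paths = make_random_subset_buffers(
#       dataset_path='../dataset',
#       buffer_path='results/breakout_replay.npz',
#       train_max_examples=60000, kernel_size=(2, 2))
#   train_buffer, valid_buffer = buffers['train'], buffers['valid']
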