def generate_transitions(policy, env, num_timesteps_total, max_steps_per_episode, save_path):
    """Roll out `policy` in `env` to collect transitions, then dump them to disk.

    Collects exactly `num_timesteps_total` (s, a, r, done, s') transitions,
    resetting the environment whenever an episode terminates (either because
    the env signals `done` or because `max_steps_per_episode` steps elapse),
    and saves the raw buffer arrays to `save_path` via joblib.

    :param policy: object with get_action(obs) -> action
    :param env: Gym-style environment (reset/step API)
    :param num_timesteps_total: total number of transitions to collect
    :param max_steps_per_episode: hard cap on episode length
    :param save_path: filesystem path passed to joblib.dump
    """
    buff = SimpleReplayBuffer(
        num_timesteps_total,
        env.observation_space.shape,
        gym_get_dim(env.action_space),
        discrete_action_dim=True,
    )
    cur_total = 0
    steps_left_in_episode = 0
    # `<` rather than `!=` so a miscount can never spin forever.
    while cur_total < num_timesteps_total:
        if steps_left_in_episode == 0:
            steps_left_in_episode = max_steps_per_episode
            obs = env.reset()
        act = policy.get_action(obs)
        next_obs, rew, done, _ = env.step(act)
        buff.add_sample(obs, act, rew, done, next_obs)
        obs = next_obs
        cur_total += 1
        steps_left_in_episode -= 1
        # Gym envs are undefined past a terminal step: force a reset on the
        # next iteration instead of stepping a finished episode.
        if done:
            steps_left_in_episode = 0
    save_dict = dict(
        observations=buff._observations,
        actions=buff._actions,
        rewards=buff._rewards,
        terminals=buff._terminals,
        next_observations=buff._next_obs,
    )
    joblib.dump(save_dict, save_path)

    # debug
    # NOTE(review): scipy.misc.imsave was removed in SciPy >= 1.2; this debug
    # dump only works on old SciPy (consider imageio.imwrite instead).
    from scipy.misc import imsave
    actions = buff._actions
    observations = buff._observations
    # Never read past what was actually collected.
    for i in range(min(1000, num_timesteps_total)):
        a = actions[i]
        obs = observations[i]
        print(a)
        # Buffer stores CHW images; imsave expects HWC.
        imsave('junk_vis/tiny/mem_grid_{}.png'.format(i), np.transpose(obs, (1, 2, 0)))
def test_num_steps_can_sample(self):
    """Every added sample is counted, including those after terminate_episode()."""
    buffer = SimpleReplayBuffer(10000, 1, 1)
    # Two samples in the first episode (second one terminal), one after.
    for terminal in (False, True):
        buffer.add_sample(1, 1, 1, terminal, 1)
    buffer.terminate_episode()
    buffer.add_sample(1, 1, 1, False, 1)
    self.assertEqual(buffer.num_steps_can_sample(), 3)
# Convert one demonstration path from the raw dict `d` into replay-buffer
# samples, then persist the buffer in rlkit's extra_data format.
path_obs = d['obs'][path_num]
path_acs = d['acs'][path_num]
path_infos = d['info'][path_num]
num_obs = len(path_obs)
# Pair each observation with its successor to form transitions.
for step in range(num_obs - 1):
    cur_ob = path_obs[step]
    nxt_ob = path_obs[step + 1]
    sample_obs = {
        'obs': cur_ob['observation'],
        'obs_task_params': cur_ob['desired_goal']
    }
    sample_next_obs = {
        'obs': nxt_ob['observation'],
        'obs_task_params': nxt_ob['desired_goal']
    }
    buffer.add_sample(
        sample_obs,
        path_acs[step],
        0.,  # the demos don't come with reward
        0,   # none of the robotic environments in gym have terminal 1 ever
        sample_next_obs,
        agent_info={},
        env_info=path_infos[step],
    )
buffer.terminate_episode()

# save it
file_name = os.path.join(rlkit_buffer_save_dir, 'extra_data.pkl')
joblib.dump({'replay_buffer': buffer}, file_name, compress=3)