import numpy as np

# rl_path is a project-local module providing the RLPath/RLPath2 containers and
# the Terminate enum used below; its exact import path is an assumption.
import rl_path


def _rollout_path(self, test):
    # Collect a single rollout, tracking elapsed steps by hand against the
    # environment's maximum episode length.
    print("collecting sample")
    path = rl_path.RLPath()

    s = self._env.reset()
    s = np.array(s)
    path.states.append(s)

    done = False
    self._env._elapsed_steps = 0
    while (not done) and self._env._elapsed_steps <= self._env._max_episode_steps:
        try:
            a, logp = self.sample_action(s, test)
        except Exception as e:
            # Retry from the current state if action sampling fails.
            print(e)
            continue

        s, r, done, info = self._step_env(a)
        s = np.array(s)

        path.states.append(s)
        path.actions.append(a)
        path.rewards.append(r)
        path.logps.append(logp)

        if self.visualize:
            self.render_env()

        self._env._elapsed_steps += 1

    path.terminate = self._check_env_termination()
    return path
def _rollout_path(self, test, init_state=None):
    path = rl_path.RLPath()

    if init_state is None:
        s = self._env.reset()
    else:
        s = self._env.reset(init_state)
    s = np.array(s)
    path.states.append(s)

    done = False
    while not done:
        a, logp = self.sample_action(s, test)
        s, r, done, info = self._step_env(a)
        s = np.array(s)

        path.states.append(s)
        path.actions.append(a)
        path.rewards.append(r)
        path.logps.append(logp)

        if self.visualize:
            self.render_env()

    path.terminate = self._check_env_termination()
    return path
def _load_path(self, railrl_path, obs_dict, obs_key="observation"):
    # Convert a railrl-style path dict into an RLPath. States keep one more
    # entry than actions/rewards so the final observation is preserved.
    path = rl_path.RLPath()
    H = min(len(railrl_path["observations"]), len(railrl_path["actions"]))
    path.task_rewards = []

    for i in range(H):
        ob = railrl_path["observations"][i]
        # Observations may be dicts keyed by obs_key, or plain arrays.
        if obs_dict:
            s = ob[obs_key]
        else:
            s = ob
        path.states.append(s)

    for i in range(H - 1):
        a = railrl_path["actions"][i]
        r = float(railrl_path["rewards"][i])

        path.actions.append(a)
        path.rewards.append(r)
        path.logps.append(0.0)
        path.task_rewards.append(0)

    path.terminate = self._check_env_termination()
    return path
def _load_demo_data(self, env):
    # Load an offline demonstration dataset (env.get_dataset()) into the
    # replay buffer, splitting it into paths at terminals or at the env's
    # maximum episode length.
    episode_max_len = env._max_episode_steps
    max_samples = None

    demo_data = env.get_dataset()
    N = demo_data['rewards'].shape[0]
    print('loading from buffer. %d items loaded' % N)

    demo_obs = demo_data["observations"][:N - 1]
    demo_next_obs = demo_data["observations"][1:]
    demo_actions = demo_data["actions"][:N - 1]
    demo_rewards = demo_data["rewards"][:N - 1]
    demo_term = demo_data["terminals"][:N - 1]

    path = rl_path.RLPath()
    n = demo_obs.shape[0]
    total_return = 0.0
    num_paths = 0

    for i in range(n):
        curr_s = demo_obs[i]
        curr_a = demo_actions[i]
        curr_r = demo_rewards[i]
        curr_term = demo_term[i]
        curr_logp = 0.0

        path.states.append(curr_s)
        path.actions.append(curr_a)
        path.logps.append(curr_logp)
        path.rewards.append(curr_r)

        path_len = path.pathlength()
        done = (curr_term == 1) or (path_len == (episode_max_len - 1))
        if done:
            # Close the current path with its final observation and store it.
            next_s = demo_next_obs[i]
            path.states.append(next_s)

            if path_len == (episode_max_len - 1):
                path.terminate = rl_path.Terminate.Null
            else:
                path.terminate = rl_path.Terminate.Fail

            self._replay_buffer.store(path)
            self._record_normalizers(path)

            curr_return = path.calc_return()
            total_return += curr_return
            num_paths += 1

            if i % 1000 == 0:
                print("Loaded {:d}/{:d} samples".format(i, n))

            path.clear()

        if (max_samples is not None) and (i >= max_samples):
            break

    self._update_normalizers()
    self._replay_buffer_initialized = True

    avg_return = total_return / num_paths
    print("Loaded {:d} samples, {:d} paths".format(i, num_paths))
    print("Avg demo return: {:.5f}".format(avg_return))
    return
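# A minimal usage sketch for the loader above. It assumes a d4rl-style offline
# environment, since get_dataset() with "observations"/"actions"/"rewards"/
# "terminals" keys matches that interface; the env name and the `agent` object
# (anything exposing _load_demo_data and a replay buffer) are illustrative
# assumptions, not part of this module.
def load_d4rl_demos(agent, env_name="hopper-medium-v2"):
    import gym
    import d4rl  # noqa: F401 -- importing d4rl registers the offline envs with gym

    env = gym.make(env_name)    # TimeLimit wrapper provides _max_episode_steps
    agent._load_demo_data(env)  # fills the agent's replay buffer with demo paths
    return env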
def _rollout_path(self, test):
    path = rl_path.RLPath()

    s = self._env.reset()
    s = np.array(s)
    path.states.append(s)

    done = False
    t = 0
    infos = []
    while not done:
        a, logp = self.sample_action(s, test)
        s, r, done, info = self._step_env(a)
        s = np.array(s)

        path.states.append(s)
        path.actions.append(a)
        path.rewards.append(float(r))
        path.logps.append(logp)
        infos.append(info)

        if self.visualize:
            self.render_env()

        t += 1
        if self._max_path_length and t >= self._max_path_length:
            # Cut the rollout off once the configured path length is reached.
            break

    path.terminate = self._check_env_termination()

    # Task rewards are not tracked here; store a zero placeholder per step
    # (the per-step info dicts are available if a real signal is needed).
    path.task_rewards = []
    for info in infos:
        path.task_rewards.append(0)

    return path
def rollout_path(agent, action_std):
    # Roll out one path with a fixed action noise scale; log-probs are not
    # tracked for these exploratory actions.
    path = rl_path.RLPath()

    s = agent._env.reset()
    s = np.array(s)
    path.states.append(s)

    done = False
    while not done:
        a = sample_action(agent, s, action_std)
        s, r, done, info = agent._step_env(a)
        s = np.array(s)

        path.states.append(s)
        path.actions.append(a)
        path.rewards.append(r)
        path.logps.append(0)

    path.terminate = agent._check_env_termination()
    # Wrap in RLPath2 so constraint terms can be computed downstream.
    return rl_path.RLPath2(path)
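# A minimal sketch of how the helper above might be driven; the function name,
# the number of paths, and the default noise scale are assumptions for
# illustration, not part of this module.
def collect_exploration_paths(agent, num_paths, action_std=0.1):
    # Gather several noisy rollouts from the same agent/environment.
    return [rollout_path(agent, action_std) for _ in range(num_paths)]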