Example #1
    def _rollout_path(self, test):
        print("collecting sample")
        path = rl_path.RLPath()

        s = self._env.reset()
        s = np.array(s)
        path.states.append(s)

        done = False
        # Track the episode length manually on the env wrapper.
        self._env._elapsed_steps = 0
        while (not done
               and self._env._elapsed_steps <= self._env._max_episode_steps):
            try:
                a, logp = self.sample_action(s, test)
            except Exception as e:
                print(e)
                continue

            s, r, done, info = self._step_env(a)
            s = np.array(s)

            path.states.append(s)
            path.actions.append(a)
            path.rewards.append(r)
            path.logps.append(logp)

            if (self.visualize):
                self.render_env()

            self._env._elapsed_steps += 1

        path.terminate = self._check_env_termination()

        return path
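
All of these examples accumulate transitions in an rl_path.RLPath object, which is not shown on this page. A minimal sketch of the container they appear to assume (list-valued fields plus the small helpers used in Example #4; the real rl_path module may differ):

import enum

import numpy as np


class Terminate(enum.Enum):
    # Termination labels referenced in Example #4 as rl_path.Terminate.*
    Null = 0
    Fail = 1


class RLPath:
    # Hypothetical container matching the attributes used in these examples.
    def __init__(self):
        self.clear()

    def clear(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.logps = []
        self.task_rewards = []
        self.terminate = Terminate.Null

    def pathlength(self):
        # Number of recorded transitions; states holds one extra entry.
        return len(self.actions)

    def calc_return(self):
        # Undiscounted sum of rewards along the path.
        return float(np.sum(self.rewards))
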
Example #2
    def _rollout_path(self, test, init_state=None):
        path = rl_path.RLPath()

        if init_state is None:
            s = self._env.reset()
        else:
            s = self._env.reset(init_state)

        s = np.array(s)
        path.states.append(s)

        done = False
        while not done:
            a, logp = self.sample_action(s, test)
            s, r, done, info = self._step_env(a)
            s = np.array(s)

            path.states.append(s)
            path.actions.append(a)
            path.rewards.append(r)
            path.logps.append(logp)

            if (self.visualize):
                self.render_env()

        path.terminate = self._check_env_termination()

        return path
Example #3
    def _load_path(self, railrl_path, obs_dict, obs_key="observation"):
        path = rl_path.RLPath()
        H = min(len(railrl_path["observations"]), len(railrl_path["actions"]))

        path.task_rewards = []

        for i in range(H):
            # self._env.update_
            ob = railrl_path["observations"][i]
            # self._env.update_obs(ob)
            # print(ob.keys())
            if obs_dict:
                s = ob[obs_key]
            else:
                s = ob
            # s = ob["state_observation"]
            path.states.append(s)

        for i in range(H - 1):
            a = railrl_path["actions"][i]
            r1 = float(railrl_path["rewards"][i])
            # r2 = self._env.compute_reward(a, railrl_path["observations"][i+1])
            path.actions.append(a)
            path.rewards.append(r1)
            # path.rewards.append(r2)
            path.logps.append(0.0)
            path.task_rewards.append(0)

        path.terminate = self._check_env_termination()

        return path
Example #4
    def _load_demo_data(self, env):
        episode_max_len = env._max_episode_steps
        max_samples = None
        demo_data = env.get_dataset()
        N = demo_data['rewards'].shape[0]
        print('loading from buffer. %d items loaded' % N)
        # Next observations are obtained by shifting the observation array by
        # one step, so the final transition is dropped.
        demo_obs = demo_data["observations"][:N - 1]
        demo_next_obs = demo_data["observations"][1:]
        #demo_next_obs = demo_data["next_observations"]
        demo_actions = demo_data["actions"][:N - 1]
        demo_rewards = demo_data["rewards"][:N - 1]
        demo_term = demo_data["terminals"][:N - 1]

        path = rl_path.RLPath()
        n = demo_obs.shape[0]
        total_return = 0.0
        num_paths = 0
        for i in range(n):
            curr_s = demo_obs[i]
            curr_a = demo_actions[i]
            curr_r = demo_rewards[i]
            curr_term = demo_term[i]
            #curr_g = np.array([])
            curr_logp = 0.0
            #curr_flags = self.EXP_ACTION_FLAG
            path.states.append(curr_s)
            #path.goals.append(curr_g)
            path.actions.append(curr_a)
            path.logps.append(curr_logp)
            path.rewards.append(curr_r)
            #path.flags.append(curr_flags)
            path_len = path.pathlength()
            # A path ends when the dataset marks a terminal state or the
            # maximum episode length is reached.
            done = (curr_term == 1) or (path_len == (episode_max_len - 1))
            if (done):
                next_s = demo_next_obs[i]
                #next_g = curr_g
                path.states.append(next_s)
                #path.goals.append(next_g)
                if path_len == (episode_max_len - 1):
                    path.terminate = rl_path.Terminate.Null
                else:
                    path.terminate = rl_path.Terminate.Fail
                self._replay_buffer.store(path)
                self._record_normalizers(path)
                curr_return = path.calc_return()
                total_return += curr_return
                num_paths += 1
                if i % 1000 == 0:
                    print("Loaded {:d}/{:d} samples".format(i, n))
                path.clear()
                if ((max_samples is not None) and (i >= max_samples)):
                    break
        self._update_normalizers()
        self._replay_buffer_initialized = True
        avg_return = total_return / num_paths
        print("Loaded {:d} samples, {:d} paths".format(i, num_paths))
        print("Avg demo return: {:.5f}".format(avg_return))
        return
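
Example #4 reads the demonstrations through env.get_dataset(), which matches the interface of d4rl-style offline-RL environments. If that is the source of the data, the dataset is a dict of index-aligned NumPy arrays; a short sketch of how such an environment could be created and inspected (the environment name is only an illustration):

import gym
import d4rl  # noqa: F401  -- importing registers the offline-RL envs with gym

env = gym.make("hopper-medium-v0")
demo_data = env.get_dataset()

# Arrays are aligned by index: observations[i] and actions[i] lead to
# rewards[i] and terminals[i].
print(demo_data["observations"].shape,
      demo_data["actions"].shape,
      demo_data["rewards"].shape,
      demo_data["terminals"].shape)
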
Example #5
    def _rollout_path(self, test):
        path = rl_path.RLPath()

        s = self._env.reset()
        s = np.array(s)
        path.states.append(s)

        done = False
        t = 0
        infos = []
        while not done:
            a, logp = self.sample_action(s, test)
            s, r, done, info = self._step_env(a)
            s = np.array(s)

            path.states.append(s)
            path.actions.append(a)
            path.rewards.append(float(r))
            path.logps.append(logp)
            infos.append(info)

            if (self.visualize):
                self.render_env()

            t += 1
            if self._max_path_length and t >= self._max_path_length:
                break

        path.terminate = self._check_env_termination()

        path.task_rewards = []
        for info in infos:
            # import ipdb; ipdb.set_trace()
            # path.task_rewards.append(info["task_reward"])
            # path.task_rewards.append(info["goal_achieved"])
            path.task_rewards.append(0)  # info["goal_achieved"])

        return path
Example #6
def rollout_path(agent, action_std):
    path = rl_path.RLPath()

    s = agent._env.reset()
    s = np.array(s)
    path.states.append(s)

    done = False
    while not done:
        a = sample_action(agent, s, action_std)
        s, r, done, info = agent._step_env(a)
        s = np.array(s)

        path.states.append(s)
        path.actions.append(a)
        path.rewards.append(r)
        path.logps.append(0)

    path.terminate = agent._check_env_termination()

    print('HERE')
    return rl_path.RLPath2(path)  # in order to compute constraints
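
The module-level sample_action(agent, s, action_std) helper used in Example #6 is not shown, and every step stores a log-probability of 0, which suggests it returns a deterministic action perturbed by fixed Gaussian noise. A possible sketch, assuming the agent exposes the same sample_action(s, test) method seen in the earlier examples:

import numpy as np


def sample_action(agent, s, action_std):
    # Query the agent's own (test-mode) action, then add exploration noise.
    a, _ = agent.sample_action(s, True)
    a = np.asarray(a, dtype=np.float64)
    noise = np.random.normal(0.0, action_std, size=a.shape)
    return a + noise
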