class TRPOTrainer:
    """Hierarchical TRPO trainer on a custom maze layout.

    A high-level controller picks neighbouring grid cells as sub-goals using
    an epsilon-greedy policy over a value-iteration planner
    (``ValueIteration4M1SyncNN8``); a low-level continuous TRPO policy is then
    rolled out toward each sub-goal for ``time_scale`` steps.  High-level
    transitions are accumulated in ``self.buffer`` and used to train the
    planner's learned transition model (unless ``config.optimistic_model``).
    """

    def __init__(self, env_mode, op_mode, state_dim, action_dim, config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        # Custom maze layout/objects are loaded from .npy files in the CWD.
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        terrain=self.config.terrain_var)
        self.grid.goal = (13., 13.)
        self.grid.reset_env_terrain()
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val, use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.buffer = Buffer()
        # Planner operates on a fixed 21x21 high-level grid.
        self.vi = ValueIteration4M1SyncNN8(grid_size_x=21, grid_size_y=21,
                                           gamma=self.optimizer.discount)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0      # high-level decisions taken; drives eps decay
        self.eps = 1.0          # epsilon for high-level exploration
        self.time_scale = 2     # low-level steps per high-level decision
        self.episode_steps = 0  # low-level steps in the current episode

    def bound_x(self, input_var):
        """Clamp an x grid index into ``[0, grid.x_size - 1]``."""
        return max(0, min(input_var, self.grid.x_size - 1))

    def bound_y(self, input_var):
        """Clamp a y grid index into ``[0, grid.y_size - 1]``."""
        return max(0, min(input_var, self.grid.y_size - 1))

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
        """Roll out the low-level policy toward a high-level target cell.

        NOTE(review): assumes ``start`` is a flat state whose first two
        entries are the (x, y) position — confirm against
        ``GridWorldContinuous``.

        Returns ``(roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand)``
        where ``d`` flags sub-goal attainment and ``s_bar_cand`` is the final
        high-level cell.  Raises NameError if ``horizon`` < 1 (``s_new`` would
        be unbound); callers always pass a positive horizon.
        """
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False
        s_list = []
        a_list = []
        r_list = []
        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)
            # GET THE TARGET VECTOR (relative offset to the sub-goal cell)
            target_vec = (goal[0] - s[0], goal[1] - s[1])
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, _, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            # Waypoint reached: reward 1 and mark transition terminal.
            d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
            if d:
                info = False
                r = 1.0
            # Ultimate goal reached: terminate the whole rollout.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            break_var = (break_var or (not info) or (step_i + 1 == horizon)
                         or (self.episode_steps == self.config.time_horizon))
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if break_var else 1], [info], [1.0]))
            s = s_new
            if break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        """Run episodes; train mode also trains the planner model.

        Train mode returns ``(batch, goal success rate, waypoint success rate,
        steps per episode, steps per rollout)``; test mode returns a single
        episode's success flag.
        """
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.config.policy_batch_size:
                # INITIALIZE THE ENVIRONMENT
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                # V MAP: plan over either the optimistic occupancy map or the
                # true terrain map.
                if self.config.optimistic_model:
                    self.vi.update_p_table_optimistic(
                        occupancy_map=self.grid.occupancy_map, walls=True)
                else:
                    self.vi.update_p_table(
                        occupancy_map=self.grid.terrain_map, walls=True)
                v, pi = self.vi.run_vi(grid=self.grid,
                                       goal=(s_goal[0], s_goal[1]))
                # START THE EPISODE
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False
                s_bar = self.grid.lls2hls(st)
                hl_s_list = [s_bar]
                hl_a_list = []
                hl_r_list = []  # NOTE(review): collected but not consumed here
                hl_d_list = []  # NOTE(review): collected but not consumed here
                while (horizon_left > 0) and not success:
                    # GET THE TARGET VECTOR (epsilon-greedy over the planner)
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / 10000)
                    s_bar = self.grid.lls2hls(st)
                    if torch.rand(1)[0] > self.eps:
                        a_bar = int(pi[s_bar[0], s_bar[1]])
                    else:
                        a_bar = randint(0, 7)
                    self.vi.set_target(s_bar, a_bar)
                    curr_goal = self.vi.get_target()
                    roll_out, _, _, _, wp_success, l_state, s_bar_p = \
                        self.roll_out_in_env(start=s_init, goal=curr_goal,
                                             horizon=self.time_scale,
                                             ultimate_goal=goal_pos,
                                             mode='train')
                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(a_bar)
                    st = l_state[:2]
                    s_init = l_state
                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()
                    total_wp_success += wp_success
                    jwp += 1
                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)
                    batch.append(roll_out)
                total_success += success
                j += 1
                # Train the planner's transition model on the observed
                # high-level trajectory (skipped for the optimistic model).
                if not self.config.optimistic_model:
                    x_temp, y_temp, w_temp = self.vi.generate_dataset_flat(
                        self.grid.terrain_map, hl_s_list, hl_a_list)
                    for bi in range(x_temp.shape[0]):
                        self.buffer.add(x_temp[bi], y_temp[bi], w_temp[bi])
                    self.vi.train_net(buffer=self.buffer, bs=128,
                                      opt_iterations=40, rw=True)
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            # Single greedy evaluation episode.
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            # V MAP
            if self.config.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=True)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                       walls=True)
            v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))
            horizon_left = self.config.time_horizon
            st = start_pos
            success = False
            s_bar = self.grid.lls2hls(st)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR (always greedy at test time)
                a_bar = int(pi[s_bar[0], s_bar[1]])
                self.vi.set_target(s_bar, a_bar)
                curr_goal = self.vi.get_target()
                roll_out, states, actions, rewards, wp_success, l_state, _ = \
                    self.roll_out_in_env(start=s_init, goal=curr_goal,
                                         horizon=self.time_scale,
                                         ultimate_goal=goal_pos, mode='test')
                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
            return success

    def train(self):
        """Alternate environment simulation and TRPO policy updates."""
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)
            print(self.buffer.size)
            print(self.eps)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success
class TRPOTrainer:
    """Flat (non-hierarchical) TRPO trainer on the standard grid world.

    A single continuous TRPO policy is rolled out directly toward the episode
    goal — no high-level controller, no sub-goals.
    """

    def __init__(self, env_mode, op_mode, state_dim, action_dim, hidden_sizes,
                 max_kl, damping, batch_size, inner_episodes, max_iter,
                 use_fim=False, use_gpu=False, terrain_var=False):
        self.grid = GridWorldContinuous(grid_mode='standard',
                                        terrain=terrain_var)
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=use_gpu,
                              max_kl=max_kl, damping=damping, use_fim=use_fim,
                              discount=0.99, imp_weight=False)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.batch_size = batch_size        # min env steps per training batch
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter            # per-episode rollout horizon
        self.state_dim = state_dim

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
        """Roll out the policy from ``start`` toward ``goal``.

        NOTE(review): assumes ``goal - s[:2]`` broadcasts, i.e. ``goal`` is an
        array-like (x, y) — confirm against the grid's sampling helpers.

        Returns ``(roll_out, s_list, a_list, r_list, d, s_new)``.
        """
        roll_out = Batch()
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []
        d = False
        for step_i in range(horizon):
            s_bar = self.grid.lls2hls(s)
            target_vec = goal - s[:2]
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, d_wp, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            break_var = False
            # Terminate when the ultimate goal cell is reached.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if ((not info) or (step_i + 1 == horizon)
                             or break_var) else 1],
                      [info], [1.0]))
            s = s_new
            if (not info) or break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new

    def simulate_env(self, mode):
        """Collect a training batch, or run one evaluation episode.

        Train mode returns ``(batch, success rate, wp success rate, steps per
        episode, steps per rollout)``; test mode returns the success flag.
        NOTE(review): ``total_wp_success`` is never incremented, so the train
        return's second ratio is always 0 — kept for interface compatibility.
        """
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.batch_size:
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                roll_out, states, actions, rewards, success, l_state = \
                    self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                         ultimate_goal=goal_pos,
                                         horizon=self.max_iter)
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
                jwp = 1.
                num_roll_outs += 1
                num_steps += roll_out.length()
                batch.append(roll_out)
                total_success += success
                j += 1
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            roll_out, states, actions, rewards, success, l_state = \
                self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                     ultimate_goal=goal_pos,
                                     horizon=self.max_iter)
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)
            total_success += success
            j += 1
            return success

    def train(self):
        """Alternate batch collection and TRPO policy updates."""
        for iter_loop in range(self.inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success
class HIROTrainer:
    """HIRO-style hierarchical trainer: TD3 manager + TRPO worker.

    The manager emits a 2-D sub-goal every ``manager_time_scale`` low-level
    steps; the worker is rewarded by ``Manager.reward`` for approaching it.
    Manager transitions (with the raw worker state/action sequences, needed
    later for off-policy correction) go to ``ReplayBufferHIRO``; optionally a
    hindsight-relabelled (HER) copy is stored as well.
    """

    def __init__(self, iter_num, iter_size, state_dim, action_dim, max_action,
                 batch_size, discount, tau, expl_noise, policy_noise,
                 noise_clip, policy_freq, max_iter, start_time_steps,
                 total_iter, terrain_var, her_var):
        self.iter_num = iter_num
        self.iter_size = iter_size
        kwargs = {
            "state_dim": state_dim,
            "action_dim": action_dim,
            "max_action": max_action,
            "discount": discount,
            "tau": tau,
            # TD3 smoothing noise is scaled by the action range.
            "policy_noise": policy_noise * max_action,
            "noise_clip": noise_clip * max_action,
            "policy_freq": policy_freq,
        }
        self.action_dim = action_dim
        self.grid = GridWorldContinuous(grid_mode='standard',
                                        terrain=terrain_var)
        self.grid.reset_env_terrain()
        self.manager_policy = TD3(**kwargs)
        self.replay_buffer = utils.ReplayBufferHIRO(state_dim, action_dim,
                                                    max_size=int(2e5))
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=[64, 64, 64],
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False, max_kl=5e-4,
                              damping=5e-3, use_fim=False, discount=0.99,
                              imp_weight=False)
        self.manager = Manager()
        self.batch_size = batch_size
        self.expl_noise = expl_noise
        self.expl_noise_start = expl_noise  # kept for linear decay in train()
        self.max_action = max_action
        self.total_steps = 0
        self.max_iter = max_iter
        self.start_time_steps = start_time_steps  # random warm-up steps
        self.manager_time_scale = 2  # low-level steps per manager decision
        self.current_iter = 0
        self.total_iter = total_iter
        self.her_var = her_var

    def roll_out_in_env(self, start, goal, horizon, mode='train'):
        """Run one episode; worker follows manager sub-goals.

        Returns ``(roll_out, steps taken, s_list, a_list, r_list, done)``.
        NOTE(review): assumes ``horizon`` >= 1 (``step_i`` would otherwise be
        unbound at the return).
        """
        roll_out = Batch()
        done = False
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []
        state_seq = []   # worker states since the last manager decision
        action_seq = []  # worker actions since the last manager decision
        s_bar = self.grid.lls2hls(s)
        s_manager = copy.deepcopy(
            np.array(list(s)
                     + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                     + list(goal)))
        r_manager = 0.
        # Initial sub-goal: random during warm-up, else noisy manager action
        # (deterministic at test time).
        if mode == 'train':
            if self.total_steps < self.start_time_steps:
                sub_goal = (4 * np.random.random((2,))) - 2
            else:
                sub_goal = (self.manager_policy.select_action(s_manager)
                            + np.random.normal(
                                0, self.max_action * self.expl_noise,
                                size=self.action_dim)).clip(
                                    -self.max_action, self.max_action)
        else:
            sub_goal = self.manager_policy.select_action(s_manager)
        self.manager.update(s=copy.deepcopy(s), sg=sub_goal)
        s_goal = self.grid.lls2hls(goal)
        for step_i in range(horizon):
            if mode == 'train':
                self.total_steps += 1
            s_bar = self.grid.lls2hls(s)
            s_save = copy.deepcopy(
                np.array(list(s)
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(self.manager.target(s))))
            s_list.append(s_save)
            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)
            state_seq.append(s_save)
            action_seq.append(a)
            s_new, _, _, _, _ = self.grid.step(action=10 * a)
            # Worker reward is intrinsic (distance-to-sub-goal via Manager).
            r = self.manager.reward(s_new)
            a_list.append(a)
            r_list.append(r)
            s_new_bar = self.grid.lls2hls(s_new)
            done = ((s_new_bar[0] == s_goal[0])
                    and (s_new_bar[1] == s_goal[1]))
            r_manager += -1 * float(not done)
            manager_update = (step_i + 1) % self.manager_time_scale == 0
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if ((step_i + 1 == horizon) or done
                             or manager_update) else 1],
                      [not done], [1.0]))
            s = s_new
            if manager_update or done:
                self.total_steps += 1
                s_new_manager = copy.deepcopy(
                    np.array(list(s)
                             + list(self.grid.state_cache[s_new_bar[0],
                                                          s_new_bar[1]])
                             + list(goal)))
                if mode == 'train':
                    self.replay_buffer.add(s_manager, self.manager.sg,
                                           s_new_manager, r_manager, done,
                                           np.array(state_seq),
                                           np.array(action_seq))
                    if self.her_var:
                        # Hindsight relabel: pretend the reached position was
                        # the goal.  NOTE(review): the [:12] slice presumes a
                        # 12-entry non-goal prefix in the manager state —
                        # confirm against state_dim.
                        s_manager_her = np.concatenate(
                            (s_manager[:12], s[:2]))
                        s_new_manager_her = np.concatenate(
                            (s_new_manager[:12], s[:2]))
                        r_manager_her = -len(state_seq) + 1
                        self.replay_buffer.add(s_manager_her, self.manager.sg,
                                               s_new_manager_her,
                                               r_manager_her, True,
                                               np.array(state_seq),
                                               np.array(action_seq))
                state_seq = []
                action_seq = []
                s_manager = s_new_manager
                r_manager = 0.
                # Pick the next sub-goal.
                if mode == 'train':
                    if self.total_steps < self.start_time_steps:
                        sub_goal = (4 * np.random.random((2,))) - 2
                    else:
                        sub_goal = (self.manager_policy.select_action(s_manager)
                                    + np.random.normal(
                                        0, self.max_action * self.expl_noise,
                                        size=self.action_dim)).clip(
                                            -self.max_action, self.max_action)
                else:
                    sub_goal = self.manager_policy.select_action(s_manager)
                self.manager.update(s=copy.deepcopy(s), sg=sub_goal)
            if done:
                break
        # Append the terminal worker state for completeness.
        s_save = copy.deepcopy(
            np.array(list(s)
                     + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                     + list(self.manager.target(s))))
        s_list.append(s_save)
        return roll_out, step_i + 1, s_list, a_list, r_list, done

    def simulate_env(self, mode):
        """Collect a worker batch and train the manager, or evaluate once."""
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        if mode == 'train':
            while num_steps < self.iter_size:
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                roll_out, steps, states, actions, rewards, success = \
                    self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                         horizon=self.max_iter, mode='train')
                # Train the TD3 manager once warm-up is over.
                if self.total_steps > self.start_time_steps:
                    for _ in range(40):
                        self.manager_policy.train(self.replay_buffer,
                                                  self.batch_size)
                batch.append(roll_out)
                num_roll_outs += 1
                num_steps += steps
                total_success += success
            return (batch, total_success / num_roll_outs,
                    num_steps / num_roll_outs)
        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            _, steps, states, actions, rewards, success = \
                self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                     horizon=self.max_iter, mode='test')
            return success

    def train(self):
        """Outer loop: decay exploration noise, collect, update both levels."""
        for iter_loop in range(self.iter_num):
            self.current_iter += 1
            # Linear decay of manager exploration noise to zero.
            self.expl_noise = max(
                (1 - (self.current_iter / self.total_iter))
                * self.expl_noise_start, 0)
            batch, g_success, steps_trajectory = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("OC")
            print(self.replay_buffer.size)
            start_t = time.time()
            # Relabel stored manager actions to match the current worker
            # policy (HIRO off-policy correction).
            self.replay_buffer.off_policy_correction(policy=self.policy)
            print(time.time() - start_t)
            print("STATS")
            print(g_success)
            print(steps_trajectory)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        success = self.simulate_env(mode='test')
        return success
class TRPOTrainer:
    """TRPO trainer with waypoint guidance on a custom maze.

    High-level sub-goals come from the grid's ``goal_management`` planner
    (a precomputed waypoint path, advanced via ``update_way_point``); the
    low-level continuous TRPO policy is rolled out waypoint to waypoint.
    """

    def __init__(self, env_mode, op_mode, state_dim, action_dim, config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        # Custom maze layout/objects are loaded from .npy files in the CWD;
        # waypoint=True enables the goal_management planner.
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        waypoint=True,
                                        terrain=self.config.terrain_var)
        self.grid.goal = (13., 13.)
        self.grid.reset_env_terrain()
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val, use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.episode_steps = 0

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
        """Roll out the policy toward the current waypoint.

        The target vector comes from ``goal_management`` rather than ``goal``
        directly, and the rollout also ends when ``update_way_point`` signals
        a waypoint switch.  Returns
        ``(roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand)``.
        """
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False
        s_list = []
        a_list = []
        r_list = []
        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)
            # GET THE TARGET VECTOR (toward the active waypoint)
            target_vec = self.grid.goal_management.get_target_vec(s[:2])
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, _, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            # Ultimate goal reached: terminate the rollout.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            # Let the planner advance the waypoint; it may also request a cut.
            break_var_rrt = self.grid.goal_management.update_way_point(
                self.grid, (s_new[0], s_new[1]), d)
            break_var = (break_var or break_var_rrt or (not info)
                         or (step_i + 1 == horizon))
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if break_var else 1], [info], [1.0]))
            s = s_new
            if break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        """Collect a training batch, or run one evaluation episode."""
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.config.policy_batch_size:
                # INITIALIZE THE ENVIRONMENT
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                path_segment_len_list = self.grid.goal_management.reset(
                    start_pos, goal_pos, self.grid)
                self.grid.old_distance = \
                    self.grid.goal_management.path_segment_len_list[0]
                # START THE EPISODE
                horizon_left = self.config.time_horizon
                success = False
                while (horizon_left > 0) and not success:
                    # GET THE TARGET VECTOR (current waypoint on the path)
                    curr_goal = self.grid.goal_management.way_points[
                        self.grid.goal_management.way_point_current]
                    roll_out, _, _, _, wp_success, l_state, s_bar_p = \
                        self.roll_out_in_env(start=s_init, goal=curr_goal,
                                             horizon=horizon_left,
                                             ultimate_goal=goal_pos,
                                             mode='train')
                    s_init = l_state
                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()
                    total_wp_success += wp_success
                    jwp += 1
                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    batch.append(roll_out)
                total_success += success
                j += 1
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            path_segment_len_list = self.grid.goal_management.reset(
                start_pos, goal_pos, self.grid)
            self.grid.old_distance = \
                self.grid.goal_management.path_segment_len_list[0]
            # START THE EPISODE
            horizon_left = self.config.time_horizon
            success = False
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                curr_goal = self.grid.goal_management.way_points[
                    self.grid.goal_management.way_point_current]
                roll_out, states, actions, rewards, wp_success, l_state, _ = \
                    self.roll_out_in_env(start=s_init, goal=curr_goal,
                                         horizon=horizon_left,
                                         ultimate_goal=goal_pos, mode='test')
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
            return success

    def train(self):
        """Alternate environment simulation and TRPO policy updates."""
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success
class TRPOTrainer:
    """Hierarchical TRPO trainer with an MVProp value-map high level.

    A convolutional value-propagation network (``MVPROPFAT``) produces a value
    map over the grid; the high level acts epsilon-greedily over the 9
    neighbouring cells (including "stay") and hands the chosen cell to the
    low-level continuous TRPO policy as a sub-goal.  High-level transitions
    (optionally with hindsight goal relabelling) train the MVProp network via
    ``MVPROPOptimizerTD0`` with a soft target-network update.
    """

    # Neighbour offsets indexed by high-level option id (0 = stay).
    _OPTION_OFFSETS = ((0, 0), (0, 1), (1, 1), (1, 0), (1, -1),
                       (0, -1), (-1, -1), (-1, 0), (-1, 1))

    def __init__(self, env_mode, op_mode, state_dim, action_dim, config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        # Custom maze layout/objects are loaded from .npy files in the CWD.
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        terrain=self.config.terrain_var)
        self.grid.reset_env_terrain()
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val, use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.mvprop = MVPROPFAT(k=config.mvprop_k).cuda()
        self.mvprop_target = copy.deepcopy(self.mvprop).cuda()
        self.memory = ReplayBufferVIN(2, 1, self.grid.x_size,
                                      config.mvprop_buffer_size)
        self.mvprop_optimizer = MVPROPOptimizerTD0(
            self.mvprop, self.mvprop_target, self.memory,
            config.discount_factor, config.mvprop_batch_size,
            config.mvprop_lr, config.mvprop_k)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0      # high-level decisions taken; drives eps decay
        self.eps = 1.0          # epsilon for high-level exploration
        self.time_scale = 2     # low-level steps per high-level decision
        self.episode_steps = 0  # low-level steps in the current episode

    def bound_x(self, input_var):
        """Clamp an x grid index into ``[0, grid.x_size - 1]``."""
        return max(0, min(input_var, self.grid.x_size - 1))

    def bound_y(self, input_var):
        """Clamp a y grid index into ``[0, grid.y_size - 1]``."""
        return max(0, min(input_var, self.grid.y_size - 1))

    def _greedy_option(self, v, s_bar):
        """Return the option id whose (clamped) neighbour cell maximizes the
        value map ``v`` (shape (1, 1, x_size, y_size)) around cell ``s_bar``."""
        options_x = [self.bound_x(s_bar[0] + dx)
                     for dx, _ in self._OPTION_OFFSETS]
        options_y = [self.bound_y(s_bar[1] + dy)
                     for _, dy in self._OPTION_OFFSETS]
        v_options = v[0, 0, options_x, options_y]
        return np.argmax(v_options)

    def _option_to_target(self, s_bar, option):
        """Map an option id to the corresponding neighbouring grid cell,
        clamped to the grid bounds."""
        dx, dy = self._OPTION_OFFSETS[option]
        return (self.bound_x(s_bar[0] + dx), self.bound_y(s_bar[1] + dy))

    def _goal_image(self, goal_cell):
        """Build the 2-channel MVProp input (terrain, goal one-hot) on GPU."""
        image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
        image[0, 0, :, :] = self.grid.terrain_map_un_padded
        image[0, 1, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size))
        image[0, 1, goal_cell[0], goal_cell[1]] = 0
        return torch.from_numpy(image).float().cuda()

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
        """Roll out the low-level policy toward a high-level target cell.

        Returns ``(roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand)``
        where ``d`` flags sub-goal attainment and ``s_bar_cand`` is the final
        high-level cell.  Callers always pass ``horizon`` >= 1.
        """
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False
        s_list = []
        a_list = []
        r_list = []
        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)
            # GET THE TARGET VECTOR (relative offset to the sub-goal cell)
            target_vec = (goal[0] - s[0], goal[1] - s[1])
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, d_wp, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            # Sub-goal reached: reward 1 and mark transition terminal.
            d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
            if d:
                info = False
                r = 1.0
            # Ultimate goal reached: terminate the whole rollout.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            break_var = (break_var or (not info) or (step_i + 1 == horizon)
                         or (self.episode_steps == self.config.time_horizon))
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if break_var else 1], [info], [1.0]))
            s = s_new
            if break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        """Run episodes; train mode also trains the MVProp value network.

        Train mode returns ``(batch, goal success rate, waypoint success rate,
        steps per episode, steps per rollout)``; test mode returns a single
        episode's success flag.
        """
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.config.policy_batch_size:
                # INITIALIZE THE ENVIRONMENT
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                # IMAGE INPUT: value map from the online MVProp network.
                image = self._goal_image(s_goal)
                with torch.no_grad():
                    v = self.mvprop_optimizer.mvprop(image)
                v = v.cpu().detach()
                # START THE EPISODE
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False
                s_bar = self.grid.lls2hls(st)
                hl_s_list = [s_bar]
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                while (horizon_left > 0) and not success:
                    # GET THE TARGET VECTOR (epsilon-greedy over the 9 options)
                    self.dqn_steps += 1
                    if self.config.mvprop_decay_type == 'exp':
                        self.eps = 0.01 + 0.99 * math.exp(
                            -1. * self.dqn_steps / self.config.mvprop_decay)
                    if torch.rand(1)[0] > self.eps:
                        option = self._greedy_option(v, s_bar)
                    else:
                        option = randint(0, 8)
                    target = self._option_to_target(s_bar, option)
                    roll_out, _, _, _, wp_success, l_state, s_bar_p = \
                        self.roll_out_in_env(start=s_init, goal=target,
                                             horizon=self.time_scale,
                                             ultimate_goal=goal_pos,
                                             mode='train')
                    s_init = l_state
                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(option)
                    st = l_state[:2]
                    s_bar = self.grid.lls2hls(st)
                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()
                    total_wp_success += wp_success
                    jwp += 1
                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)
                    batch.append(roll_out)
                total_success += success
                j += 1
                # ADD TRANSITIONS TO BUFFER.  The hindsight goal image depends
                # only on the episode's final cell, so build it once (it was
                # previously rebuilt and re-uploaded to GPU every iteration).
                image_her = None
                if self.config.mvprop_her and hl_a_list:
                    image_her = self._goal_image(hl_s_list[-1])
                for ep_idx in range(len(hl_a_list)):
                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                    hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                    hl_d_list[ep_idx], image)
                    if self.config.mvprop_her:
                        # GET THE HINDSIGHT GOAL TRANSITION: relabel with the
                        # episode's final cell as the goal.
                        if ((hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0])
                                and (hl_s_list[ep_idx + 1][1]
                                     == hl_s_list[-1][1])):
                            hgt_reward = 0
                            hgt_done = True
                        else:
                            hgt_reward = -1
                            hgt_done = False
                        self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                        hl_s_list[ep_idx + 1], hgt_reward,
                                        hgt_done, image_her)
                # OPTIMIZE NETWORK PARAMETERS
                for _ in range(40):
                    self.mvprop_optimizer.train(
                        self.config.time_horizon / self.time_scale)
                # TARGET NET UPDATE (Polyak averaging)
                if self.dqn_steps % \
                        self.config.mvprop_target_update_frequency == 0:
                    tau = 0.05
                    for param, target_param in zip(
                            self.mvprop_optimizer.mvprop.parameters(),
                            self.mvprop_optimizer.target_mvprop.parameters()):
                        target_param.data.copy_(tau * param.data
                                                + (1 - tau)
                                                * target_param.data)
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            # Single greedy evaluation episode; values come from the target
            # network here (vs the online network in training) — presumably
            # intentional, kept as-is.
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            image = self._goal_image(s_goal)
            with torch.no_grad():
                v = self.mvprop_optimizer.target_mvprop(image)
            v = v.cpu().detach()
            horizon_left = self.config.time_horizon
            st = start_pos
            success = False
            s_bar = self.grid.lls2hls(st)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR (always greedy at test time)
                option = self._greedy_option(v, s_bar)
                target = self._option_to_target(s_bar, option)
                roll_out, states, actions, rewards, wp_success, l_state, _ = \
                    self.roll_out_in_env(start=s_init, goal=target,
                                         horizon=self.time_scale,
                                         ultimate_goal=goal_pos, mode='test')
                s_init = l_state
                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
            return success

    def train(self):
        """Alternate environment simulation and TRPO policy updates."""
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)
            print(self.memory.size)
            print(self.eps)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success