# Common imports assumed by the example snippets below. Project-specific classes
# (GridWorldContinuous, ParkingEmpty, Ant44Env0, ContinuousMLP, TRPO, TD3, Batch,
# Buffer, Manager, ManagerAngle, MVPROP*, ReplayBuffer*, ValueIteration*, utils)
# come from the surrounding repository and are not imported here.
# `randint` is assumed to be random.randint (inclusive upper bound).
import copy
import math
import time
import xml.etree.ElementTree as ET
from random import randint

import numpy as np
import torch
import torch.nn.functional as F

    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 config,
                 save_path=None
                 ):
        self.config = config
        self.save_path = save_path
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        waypoint=True,
                                        terrain=self.config.terrain_var)
        self.grid.goal = (13., 13.)
        self.grid.reset_env_terrain()

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val,
                              use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)

        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.episode_steps = 0
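# Example #2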
    def __init__(self, iter_num, iter_size, state_dim, action_dim, max_action,
                 batch_size, discount, tau, expl_noise, policy_noise,
                 noise_clip, policy_freq, max_iter, start_time_steps,
                 total_iter, terrain_var, her_var):

        self.iter_num = iter_num
        self.iter_size = iter_size

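        # TD3 hyper-parameters for the high-level (manager) policy; the noise
        # magnitudes are scaled by the action range.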
        kwargs = {
            "state_dim": state_dim,
            "action_dim": action_dim,
            "max_action": max_action,
            "discount": discount,
            "tau": tau,
            "policy_noise": policy_noise * max_action,
            "noise_clip": noise_clip * max_action,
            "policy_freq": policy_freq
        }

        self.action_dim = action_dim

        self.grid = GridWorldContinuous(grid_mode='standard',
                                        terrain=terrain_var)
        self.grid.reset_env_terrain()

        self.manager_policy = TD3(**kwargs)
        self.replay_buffer = utils.ReplayBufferHIRO(state_dim,
                                                    action_dim,
                                                    max_size=int(2e5))

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=[64, 64, 64],
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=5e-4,
                              damping=5e-3,
                              use_fim=False,
                              discount=0.99,
                              imp_weight=False)

        self.manager = Manager()

        self.batch_size = batch_size
        self.expl_noise = expl_noise
        self.expl_noise_start = expl_noise
        self.max_action = max_action

        self.total_steps = 0
        self.max_iter = max_iter
        self.start_time_steps = start_time_steps

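        # Number of low-level steps between consecutive manager (sub-goal) decisions.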
        self.manager_time_scale = 2

        self.current_iter = 0
        self.total_iter = total_iter

        self.her_var = her_var
# Example #3
    def __init__(self,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 use_fim=False,
                 use_gpu=False):

        self.grid = ParkingEmpty()
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)

        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim
# Example #4
    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        self.grid = Ant44Env0(mj_ant_path=self.config.mj_ant_path)

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val,
                              use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.buffer = Buffer()
        self.vi = ValueIteration4M1SyncNN8(grid_size_x=24,
                                           grid_size_y=24,
                                           gamma=self.optimizer.discount)

        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 20
        self.episode_steps = 0
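# Example #5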
    def __init__(self,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 use_fim=False,
                 use_gpu=False
                 ):

        self.grid = ParkingEmpty()
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)
        self.mvprop = MVPROPFAT3D(k=100).cuda()
        self.mvprop_target = copy.deepcopy(self.mvprop).cuda()
        self.memory = ReplayBufferVIN3D(3, 1, self.grid.x_size, 35000)
        self.mvprop_optimizer = MVPROPOptimizer3D(self.mvprop,
                                                  self.mvprop_target,
                                                  self.memory,
                                                  0.99,
                                                  128,
                                                  3e-4,
                                                  100)

        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 2
        self.episode_steps = 0
# Example #6
    def __init__(self,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 optimistic_model,
                 use_fim=False,
                 use_gpu=False):

        self.grid = ParkingEmpty()
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)
        self.buffer = Buffer()
        gx = self.grid.x_size
        gy = self.grid.y_size
        self.vi = ValueIteration33SyncNN(grid_size_x=gx,
                                         grid_size_y=gy,
                                         gamma=self.optimizer.discount)

        self.op_mode = op_mode
        self.optimistic_model = optimistic_model
        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 2
        self.episode_steps = 0
# Example #7
    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        terrain=self.config.terrain_var)
        self.grid.reset_env_terrain()

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val,
                              use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.mvprop = MVPROPFAT(k=config.mvprop_k).cuda()
        self.mvprop_target = copy.deepcopy(self.mvprop).cuda()
        self.memory = ReplayBufferVIN(2, 1, self.grid.x_size,
                                      config.mvprop_buffer_size)
        self.mvprop_optimizer = MVPROPOptimizerTD0(
            self.mvprop, self.mvprop_target, self.memory,
            config.discount_factor, config.mvprop_batch_size, config.mvprop_lr,
            config.mvprop_k)

        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 2
        self.episode_steps = 0
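# Example #8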
    def __init__(
        self,
        env_mode,
        op_mode,
        state_dim,
        action_dim,
        hidden_sizes,
        max_kl,
        damping,
        batch_size,
        inner_episodes,
        max_iter,
        use_fim=False,
        use_gpu=False,
        mj_ant_path=None,
    ):

        self.mj_ant_path = mj_ant_path
        self.grid = Ant44Env0(mj_ant_path=self.mj_ant_path, waypoint=True)

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)

        self.env_mode = env_mode
        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim
# Example #9
    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        self.grid = Ant44Env0(mj_ant_path=self.config.mj_ant_path)
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val,
                              use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.mvprop = MVPROPFAT(k=config.mvprop_k).cuda()
        self.mvprop_target = copy.deepcopy(self.mvprop).cuda()
        self.memory = ReplayBufferVIN(2, 1, self.grid.x_size,
                                      config.mvprop_buffer_size)
        self.mvprop_optimizer = MVPROPOptimizerTD0(
            self.mvprop, self.mvprop_target, self.memory,
            config.discount_factor, config.mvprop_batch_size, config.mvprop_lr,
            config.mvprop_k)

        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 20
        self.episode_steps = 0
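# Example #10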
    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 use_fim=False,
                 use_gpu=False,
                 terrain_var=False
                 ):

        self.grid = GridWorldContinuous(grid_mode='standard', terrain=terrain_var)
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)

        self.env_mode = env_mode
        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim
# Example #11
class HIROTrainer:
    def __init__(self,
                 iter_num,
                 iter_size,
                 state_dim,
                 action_dim,
                 max_action,
                 batch_size,
                 discount,
                 tau,
                 expl_noise,
                 policy_noise,
                 noise_clip,
                 policy_freq,
                 max_iter,
                 start_time_steps,
                 total_iter
                 ):

        self.iter_num = iter_num
        self.iter_size = iter_size

        kwargs = {
            "state_dim": state_dim,
            "action_dim": action_dim,
            "max_action": max_action,
            "discount": discount,
            "tau": tau,
            "policy_noise": policy_noise * max_action,
            "noise_clip": noise_clip * max_action,
            "policy_freq": policy_freq
        }

        self.action_dim = action_dim

        self.grid = ParkingEmpty()

        self.manager_policy = TD3(**kwargs)
        self.replay_buffer = utils.ReplayBufferHIROParkingAngleNew(state_dim, action_dim, max_size=int(2e5))

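        # The worker policy acts on a 6-dimensional constructed state (manager
        # target plus selected observation features, see roll_out_in_env), not
        # on the raw state_dim.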
        self.policy = ContinuousMLP(6,
                                    action_dim,
                                    hidden_sizes=[64, 64, 64],
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=5e-4,
                              damping=5e-3,
                              use_fim=False,
                              discount=0.99,
                              imp_weight=False)

        self.manager = ManagerAngle()

        self.batch_size = batch_size
        self.expl_noise = expl_noise
        self.expl_noise_start = expl_noise
        self.max_action = max_action

        self.total_steps = 0
        self.max_iter = max_iter
        self.start_time_steps = start_time_steps

        self.manager_time_scale = 2

        self.current_iter = 0
        self.total_iter = total_iter

    def roll_out_in_env(self, horizon, mode='train'):
        roll_out = Batch()
        oob = False
        s = self.grid.reset()
        s_list = []
        a_list = []
        r_list = []

        state_seq = []
        action_seq = []

        ultimate_start_bar = self.grid.lls2hls(s['observation'])
        goal_bar = self.grid.lls2hls(s['desired_goal'])

        s_manager = copy.deepcopy(np.concatenate((s['observation'], s['desired_goal'])))
        r_manager = 0.
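        # Sub-goal selection: uniformly random during the warm-up phase, then the
        # manager policy's action plus clipped exploration noise (training), or
        # the deterministic manager action (evaluation).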
        if mode == 'train':
            if self.total_steps < self.start_time_steps:
                sub_goal = np.concatenate((0.08 * np.random.random((2, )) - 0.04, 2 * math.pi * np.random.random((1, )) - math.pi))
            else:
                sub_goal = (self.manager_policy.select_action(s_manager) +
                            np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)).\
                    clip(-self.max_action, self.max_action)
        else:
            sub_goal = self.manager_policy.select_action(s_manager)

        self.manager.update(s=copy.deepcopy(s['observation']), sg=sub_goal)

        for step_i in range(horizon):

            if mode == 'train':
                self.total_steps += 1

            manager_target = self.manager.target(s['observation'])
            s_save = copy.deepcopy(
                np.array(list(manager_target[:2]) + list(s['observation'][2:4]) + list(manager_target[2:4])))
            s_list.append(s_save)

            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            state_seq.append(copy.deepcopy(np.array(list(manager_target[:2]) + list(s['observation'][2:4]) +
                                                    self.manager.target_ang(s['observation']))))

            action_seq.append(a)

            s_new, r, d, info = self.grid.env.step(a)
            info = info["is_success"]
            r = self.manager.reward(s_new['observation'])
            a_list.append(a)
            r_list.append(r)
            r_manager += float(info)

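            # The manager issues a new sub-goal every manager_time_scale low-level steps.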
            manager_update = (step_i + 1) % self.manager_time_scale == 0

            ib = self.grid.check_in_bounds(s_new['observation'])
            if not ib:
                oob = True

            roll_out.append(
                Batch([a.astype(np.float32)],
                      [s_save.astype(np.float32)],
                      [r],
                      [s_new['observation'].astype(np.float32)],
                      [0 if ((step_i + 1 == horizon) or info or oob or manager_update) else 1],
                      [not info],
                      [1.0]))

            s = s_new

            if manager_update or info:
                self.total_steps += 1
                s_new_manager = copy.deepcopy(np.concatenate((s['observation'], s['desired_goal'])))

                if mode == 'train':
                    self.replay_buffer.add(s_manager, self.manager.a, s_new_manager, r_manager, info,
                                           np.array(state_seq), np.array(action_seq))

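                    # Hindsight transition: relabel the achieved observation as the
                    # desired goal and store it with reward 1 and done=True.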
                    s_manager_her = np.concatenate((s_manager[:6], s['observation']))
                    s_new_manager_her = np.concatenate((s_new_manager[:6], s['observation']))
                    self.replay_buffer.add(s_manager_her, self.manager.a, s_new_manager_her, 1.0, True,
                                           np.array(state_seq), np.array(action_seq))

                    state_seq = []
                    action_seq = []

                s_manager = s_new_manager
                r_manager = 0.

                if mode == 'train':
                    if self.total_steps < self.start_time_steps:
                        sub_goal = np.concatenate(
                            (0.08 * np.random.random((2,)) - 0.04, 2 * math.pi * np.random.random((1,)) - math.pi))
                    else:
                        sub_goal = (self.manager_policy.select_action(s_manager) +
                                    np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)). \
                            clip(-self.max_action, self.max_action)
                else:
                    sub_goal = self.manager_policy.select_action(s_manager)

                self.manager.update(s=copy.deepcopy(s['observation']), sg=sub_goal)

            if info or oob:
                break
        manager_target = self.manager.target(s['observation'])
        s_save = copy.deepcopy(np.array(list(manager_target[:2]) + list(s['observation'][2:4]) +
                                        list(manager_target[2:4])))
        s_list.append(s_save)
        return roll_out, step_i + 1, s_list, a_list, r_list, info, ultimate_start_bar, goal_bar

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0

        if mode == 'train':
            while num_steps < self.iter_size:

                roll_out, steps, states, actions, rewards, success, start_pos, goal_pos = self.roll_out_in_env(
                    horizon=self.max_iter,
                    mode='train')

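                # After the warm-up phase, update the manager's TD3 policy with
                # 40 gradient steps per collected rollout.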
                if self.total_steps > self.start_time_steps:
                    for _ in range(40):
                        self.manager_policy.train(self.replay_buffer, self.batch_size)

                batch.append(roll_out)
                num_roll_outs += 1
                num_steps += steps

                total_success += success

            return batch, total_success / num_roll_outs, num_steps / num_roll_outs

        else:
            _, steps, states, actions, rewards, success, start_pos, goal_pos = self.roll_out_in_env(
                horizon=self.max_iter,
                mode='test')

        return success

    def train(self):
        for iter_loop in range(self.iter_num):
            self.current_iter += 1
            self.expl_noise = max((1 - (self.current_iter / (self.total_iter))) * self.expl_noise_start, 0)
            batch, g_success, steps_trajectory = self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("OC")
            print(self.replay_buffer.size)
            start_t = time.time()
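            # Off-policy correction of the stored manager sub-goals with respect to
            # the current low-level policy (HIRO-style relabelling).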
            self.replay_buffer.off_policy_correction(policy=self.policy)
            print(time.time() - start_t)
            print("STATS")
            print(g_success)
            print(steps_trajectory)

    def test(self):
        success = self.simulate_env(mode='test')
        return success
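# Example #12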
class TRPOTrainer:
    def __init__(
        self,
        env_mode,
        op_mode,
        state_dim,
        action_dim,
        hidden_sizes,
        max_kl,
        damping,
        batch_size,
        inner_episodes,
        max_iter,
        use_fim=False,
        use_gpu=False,
        mj_ant_path=None,
    ):

        self.mj_ant_path = mj_ant_path
        self.grid = Ant44Env0(mj_ant_path=self.mj_ant_path, waypoint=True)

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)

        self.env_mode = env_mode
        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
        roll_out = Batch()
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        s = start

        s_list = []
        a_list = []
        r_list = []
        d = False

        for step_i in range(horizon):
            s_bar = self.grid.lls2hls(s)
            target_vec = self.grid.goal_management.get_target_vec(s[:2])

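            # Low-level state: non-position observation features, the cached
            # per-cell features, and the vector towards the current waypoint.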
            s_save = copy.deepcopy(
                np.array(
                    list(s[2:]) +
                    list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                    list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, info = self.grid.step(a, goal)
            info = info['no_goal_reached']

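            # Binary reward: 1 when the agent gets within 0.5 of the current
            # waypoint; the sub-goal itself counts as reached within 0.5 of `goal`.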
            dist_wp = np.sqrt((s_new[0] - self.grid.goal_management.way_points[
                self.grid.goal_management.way_point_current][0])**2 + (
                    s_new[1] - self.grid.goal_management.way_points[
                        self.grid.goal_management.way_point_current][1])**2)
            d_wp = dist_wp < 0.5
            new_distance = dist_wp
            d = np.sqrt((s_new[0] - goal[0])**2 +
                        (s_new[1] - goal[1])**2) < 0.5
            info = not d
            r = d_wp  # + self.grid.old_distance - 0.99 * new_distance
            self.grid.old_distance = new_distance

            s_bar_cand = self.grid.lls2hls(s_new)

            break_var = self.grid.goal_management.update_way_point(
                self.grid, (s_new[0], s_new[1]), d)
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new.astype(np.float32)], [
                          0 if ((not info) or
                                (step_i + 1 == horizon) or break_var) else 1
                      ], [info], [1.0]))
            s = s_new
            if (not info) or break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
                structure = copy.deepcopy(a)
                objects = copy.deepcopy(b)
                obstacle_list = copy.deepcopy(c)
                occupancy_map = copy.deepcopy(d)
                default_starts = copy.deepcopy(e)
                state_cache = copy.deepcopy(f)
                occupancy_map_padded = copy.deepcopy(g)
                occupancy_map_un_padded = copy.deepcopy(h)
                occupancy_map_original = copy.deepcopy(p)

                self.grid = Ant44Env0(
                    mj_ant_path=self.mj_ant_path,
                    re_init=True,
                    structure=structure,
                    objects=objects,
                    obstacle_list=obstacle_list,
                    occupancy_map=occupancy_map,
                    default_starts=default_starts,
                    state_cache=state_cache,
                    occupancy_map_padded=occupancy_map_padded,
                    occupancy_map_un_padded=occupancy_map_un_padded,
                    occupancy_map_original=occupancy_map_original,
                    waypoint=True)

                start_pos = self.grid.sample_random_pos(number=1)[0]
                goal_pos = self.grid.sample_random_pos(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos)

                horizon_left = self.max_iter
                success = False

                path_segment_len_list = self.grid.goal_management.reset(
                    start_pos, goal_pos, self.grid)
                self.grid.old_distance = self.grid.goal_management.path_segment_len_list[
                    0]

                while (horizon_left > 0) and not success:

                    curr_goal = \
                        self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                    roll_out, states, actions, rewards, wp_success, l_state = self.roll_out_in_env(
                        start=s_init,
                        goal=curr_goal,
                        ultimate_goal=goal_pos,
                        horizon=horizon_left)

                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))

                    batch.append(roll_out)

                total_success += success
                j += 1

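                # Restore ant.xml from its backup copy before the next episode.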
                ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
                tree = ET.parse(ant_path)
                tree.write(self.grid.mj_ant_path + 'ant.xml')

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
            structure = copy.deepcopy(a)
            objects = copy.deepcopy(b)
            obstacle_list = copy.deepcopy(c)
            occupancy_map = copy.deepcopy(d)
            default_starts = copy.deepcopy(e)
            state_cache = copy.deepcopy(f)
            occupancy_map_padded = copy.deepcopy(g)
            occupancy_map_un_padded = copy.deepcopy(h)
            occupancy_map_original = copy.deepcopy(p)

            self.grid = Ant44Env0(
                mj_ant_path=self.mj_ant_path,
                re_init=True,
                structure=structure,
                objects=objects,
                obstacle_list=obstacle_list,
                occupancy_map=occupancy_map,
                default_starts=default_starts,
                state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original,
                waypoint=True)

            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos)

            horizon_left = self.max_iter
            success = False

            path_segment_len_list = self.grid.goal_management.reset(
                start_pos, goal_pos, self.grid)
            self.grid.old_distance = self.grid.goal_management.path_segment_len_list[
                0]

            while (horizon_left > 0) and not success:

                curr_goal = \
                    self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                roll_out, states, actions, rewards, wp_success, l_state = self.roll_out_in_env(
                    start=s_init,
                    goal=curr_goal,
                    ultimate_goal=goal_pos,
                    horizon=horizon_left)

                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))

                batch.append(roll_out)

            total_success += success
            j += 1

            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')

            return success

    def train(self):
        for iter_loop in range(self.inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        g_success = self.simulate_env(mode='test')
        return g_success
# Example #13
class TRPOTrainer:
    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        terrain=self.config.terrain_var)
        self.grid.goal = (13., 13.)
        self.grid.reset_env_terrain()

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val,
                              use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.buffer = Buffer()
        self.vi = ValueIteration4M1SyncNN8(grid_size_x=21,
                                           grid_size_y=21,
                                           gamma=self.optimizer.discount)

        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 2
        self.episode_steps = 0

    def bound_x(self, input_var):
        return max(0, min(input_var, self.grid.x_size - 1))

    def bound_y(self, input_var):
        return max(0, min(input_var, self.grid.y_size - 1))

    def roll_out_in_env(self,
                        start,
                        goal,
                        ultimate_goal,
                        horizon,
                        mode='test'):
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False

        s_list = []
        a_list = []
        r_list = []

        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)

            # GET THE TARGET VECTOR
            target_vec = (goal[0] - s[0], goal[1] - s[1])
            s_save = copy.deepcopy(
                np.array(
                    list(s[2:]) +
                    list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                    list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, _, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)

            d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
            if d:
                info = False
                r = 1.0

            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True

            break_var = break_var or (not info) or (step_i + 1 == horizon) or (
                self.episode_steps == self.config.time_horizon)

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new.astype(np.float32)], [0 if break_var else 1],
                      [info], [1.0]))
            s = s_new
            if break_var:
                break

        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.config.policy_batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                """ V MAP """
                if self.config.optimistic_model:
                    self.vi.update_p_table_optimistic(
                        occupancy_map=self.grid.occupancy_map, walls=True)
                else:
                    self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                           walls=True)

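                # Run value iteration on the high-level grid to obtain the value
                # map v and the greedy policy pi toward the goal cell.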
                v, pi = self.vi.run_vi(grid=self.grid,
                                       goal=(s_goal[0], s_goal[1]))
                """ START THE EPISODE """
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False

                s_bar = self.grid.lls2hls(st)
                hl_s_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                hl_s_list.append(s_bar)

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / 10000)

                    s_bar = self.grid.lls2hls(st)

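                    # eps-greedy over the value-iteration policy: follow pi or pick
                    # a random high-level action while eps decays.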
                    if torch.rand(1)[0] > self.eps:
                        a_bar = int(pi[s_bar[0], s_bar[1]])
                    else:
                        a_bar = randint(0, 7)

                    self.vi.set_target(s_bar, a_bar)
                    curr_goal = self.vi.get_target()

                    roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                        start=s_init,
                        goal=curr_goal,
                        horizon=self.time_scale,
                        ultimate_goal=goal_pos,
                        mode='train')

                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(a_bar)

                    st = l_state[:2]
                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                total_success += success
                j += 1

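                # Without the optimistic model, fit the VI transition network on the
                # high-level (state, action) pairs visited in this episode.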
                if not self.config.optimistic_model:
                    x_temp, y_temp, w_temp = self.vi.generate_dataset_flat(
                        self.grid.terrain_map, hl_s_list, hl_a_list)

                    for bi in range(x_temp.shape[0]):
                        self.buffer.add(x_temp[bi], y_temp[bi], w_temp[bi])

                    self.vi.train_net(buffer=self.buffer,
                                      bs=128,
                                      opt_iterations=40,
                                      rw=True)

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            """ V MAP """
            if self.config.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=True)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                       walls=True)

            v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))

            horizon_left = self.config.time_horizon
            st = start_pos
            success = False

            s_bar = self.grid.lls2hls(st)

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                a_bar = int(pi[s_bar[0], s_bar[1]])
                self.vi.set_target(s_bar, a_bar)
                curr_goal = self.vi.get_target()

                roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                    start=s_init,
                    goal=curr_goal,
                    horizon=self.time_scale,
                    ultimate_goal=goal_pos,
                    mode='test')

                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))

            return success

    def train(self):
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)
            print(self.buffer.size)
            print(self.eps)

    def test(self):
        g_success = self.simulate_env(mode='test')
        return g_success
# Example #14
class TRPOTrainer:
    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        self.grid = Ant44Env0(mj_ant_path=self.config.mj_ant_path)
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val,
                              use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.mvprop = MVPROPFAT(k=config.mvprop_k).cuda()
        self.mvprop_target = copy.deepcopy(self.mvprop).cuda()
        self.memory = ReplayBufferVIN(2, 1, self.grid.x_size,
                                      config.mvprop_buffer_size)
        self.mvprop_optimizer = MVPROPOptimizerTD0(
            self.mvprop, self.mvprop_target, self.memory,
            config.discount_factor, config.mvprop_batch_size, config.mvprop_lr,
            config.mvprop_k)

        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 20
        self.episode_steps = 0

    def bound_x(self, input_var):
        return max(0, min(input_var, self.grid.x_size - 1))

    def bound_y(self, input_var):
        return max(0, min(input_var, self.grid.y_size - 1))

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        start_bar = self.grid.lls2hls(start)
        roll_out = Batch()
        break_var = False
        s = start
        d = False

        s_list = []
        a_list = []
        r_list = []

        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)

            # GET THE TARGET VECTOR
            target_vec = (goal[0] - s[0], goal[1] - s[1])
            s_save = copy.deepcopy(
                np.array(
                    list(s[2:]) +
                    list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                    list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, info = self.grid.step(a, goal)
            info = info['no_goal_reached']
            # info = True

            s_bar_cand = self.grid.lls2hls(s_new)

            d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
            if d:
                info = False
                r = 1.0

            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True

            break_var = break_var or (not info) or (step_i + 1 == horizon) or (
                self.episode_steps == self.config.time_horizon)

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new.astype(np.float32)], [0 if break_var else 1],
                      [info], [1.0]))
            s = s_new
            if break_var:
                break

        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.config.policy_batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
                structure = copy.deepcopy(a)
                objects = copy.deepcopy(b)
                obstacle_list = copy.deepcopy(c)
                occupancy_map = copy.deepcopy(d)
                default_starts = copy.deepcopy(e)
                state_cache = copy.deepcopy(f)
                occupancy_map_padded = copy.deepcopy(g)
                occupancy_map_un_padded = copy.deepcopy(h)
                occupancy_map_original = copy.deepcopy(p)

                self.grid = Ant44Env0(
                    mj_ant_path=self.config.mj_ant_path,
                    re_init=True,
                    structure=structure,
                    objects=objects,
                    obstacle_list=obstacle_list,
                    occupancy_map=occupancy_map,
                    default_starts=default_starts,
                    state_cache=state_cache,
                    occupancy_map_padded=occupancy_map_padded,
                    occupancy_map_un_padded=occupancy_map_un_padded,
                    occupancy_map_original=occupancy_map_original)

                start_pos = self.grid.sample_random_pos(number=1)[0]
                goal_pos = self.grid.sample_random_pos(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                self.episode_steps = 0
                s_init = self.grid.reset(start_pos)
                """ IMAGE INPUT """
                image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
                image[0, 0, :, :] = self.grid.occupancy_map_un_padded
                image[0, 1, :, :] = -1 * np.ones(
                    (self.grid.x_size, self.grid.y_size))
                image[0, 1, s_goal[0], s_goal[1]] = 0
                image = torch.from_numpy(image).float().cuda()

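                # Evaluate the online MVProp network on the two-channel input
                # (occupancy map + goal marker) to obtain a value map over grid cells.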
                with torch.no_grad():
                    v = self.mvprop_optimizer.mvprop(image)
                    v = v.cpu().detach()
                """ START THE EPISODE """
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False

                s_bar = self.grid.lls2hls(st)
                hl_s_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                hl_s_list.append(s_bar)

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    if self.config.mvprop_decay_type == 'exp':
                        self.eps = 0.01 + 0.99 * math.exp(
                            -1. * self.dqn_steps / self.config.mvprop_decay)

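                    # eps-greedy over the MVProp value map: pick the neighbouring
                    # cell (or the current one) with the highest value, otherwise
                    # a random option.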
                    if torch.rand(1)[0] > self.eps:
                        with torch.no_grad():
                            options_x = [
                                s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1,
                                s_bar[0] + 1, s_bar[0], s_bar[0] - 1,
                                s_bar[0] - 1, s_bar[0] - 1
                            ]
                            options_y = [
                                s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1],
                                s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1,
                                s_bar[1], s_bar[1] + 1
                            ]
                            options_x = [self.bound_x(e) for e in options_x]
                            options_y = [self.bound_y(e) for e in options_y]
                            v_options = v[0, 0, options_x, options_y]
                            option = np.argmax(v_options)
                    else:
                        option = randint(0, 8)

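                    # Map the chosen option onto the current cell or one of its
                    # eight neighbours, clamped to the grid.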
                    offsets = [(0, 0), (0, 1), (1, 1), (1, 0), (1, -1),
                               (0, -1), (-1, -1), (-1, 0), (-1, 1)]
                    dx, dy = offsets[int(option)]
                    target = (self.bound_x(s_bar[0] + dx),
                              self.bound_y(s_bar[1] + dy))

                    roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                        start=s_init,
                        goal=target,
                        horizon=self.time_scale,
                        ultimate_goal=goal_pos)

                    s_init = l_state

                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(option)

                    st = l_state[:2]
                    s_bar = self.grid.lls2hls(st)

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                total_success += success
                j += 1

                ### ADD TRANSITIONS TO BUFFER
                for ep_idx in range(len(hl_a_list)):

                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                    hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                    hl_d_list[ep_idx], image)

                    if self.config.mvprop_her:
                        ### GET THE HINDSIGHT GOAL TRANSITION
                        image_her = np.zeros(
                            (1, 2, self.grid.x_size, self.grid.y_size))
                        image_her[0,
                                  0, :, :] = self.grid.occupancy_map_un_padded
                        image_her[0, 1, :, :] = -1 * np.ones(
                            (self.grid.x_size, self.grid.y_size))
                        image_her[0, 1, hl_s_list[-1][0], hl_s_list[-1][1]] = 0
                        image_her = torch.from_numpy(image_her).float().cuda()

                        if (hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0]) and (
                                hl_s_list[ep_idx + 1][1] == hl_s_list[-1][1]):
                            hgt_reward = 0
                            hgt_done = True
                        else:
                            hgt_reward = -1
                            hgt_done = False

                        self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                        hl_s_list[ep_idx + 1], hgt_reward,
                                        hgt_done, image_her)

                ### OPTIMIZE NETWORK PARAMETERS
                for _ in range(40):
                    self.mvprop_optimizer.train(self.config.time_horizon /
                                                self.time_scale)

                # TARGET NET UPDATE
                if self.dqn_steps % self.config.mvprop_target_update_frequency == 0:
                    tau = 0.05
                    for param, target_param in zip(
                            self.mvprop_optimizer.mvprop.parameters(),
                            self.mvprop_optimizer.target_mvprop.parameters()):
                        target_param.data.copy_(tau * param.data +
                                                (1 - tau) * target_param.data)

                ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
                tree = ET.parse(ant_path)
                tree.write(self.grid.mj_ant_path + 'ant.xml')

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
            structure = copy.deepcopy(a)
            objects = copy.deepcopy(b)
            obstacle_list = copy.deepcopy(c)
            occupancy_map = copy.deepcopy(d)
            default_starts = copy.deepcopy(e)
            state_cache = copy.deepcopy(f)
            occupancy_map_padded = copy.deepcopy(g)
            occupancy_map_un_padded = copy.deepcopy(h)
            occupancy_map_original = copy.deepcopy(p)

            self.grid = Ant44Env0(
                mj_ant_path=self.config.mj_ant_path,
                re_init=True,
                structure=structure,
                objects=objects,
                obstacle_list=obstacle_list,
                occupancy_map=occupancy_map,
                default_starts=default_starts,
                state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original)

            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            self.episode_steps = 0
            s_init = self.grid.reset(start_pos)

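            # Two-channel MVProp input: channel 0 holds the un-padded occupancy
            # map, channel 1 marks the goal cell with 0 and every other cell
            # with -1.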
            image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
            image[0, 0, :, :] = self.grid.occupancy_map_un_padded
            image[0, 1, :, :] = -1 * np.ones(
                (self.grid.x_size, self.grid.y_size))
            image[0, 1, s_goal[0], s_goal[1]] = 0
            image = torch.from_numpy(image).float().cuda()

            with torch.no_grad():
                v = self.mvprop_optimizer.target_mvprop(image)
                v = v.cpu().detach()

            horizon_left = self.config.time_horizon
            st = start_pos
            success = False

            s_bar = self.grid.lls2hls(st)

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                with torch.no_grad():
                    options_x = [
                        s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1,
                        s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1,
                        s_bar[0] - 1
                    ]
                    options_y = [
                        s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1],
                        s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1],
                        s_bar[1] + 1
                    ]
                    options_x = [self.bound_x(e) for e in options_x]
                    options_y = [self.bound_y(e) for e in options_y]
                    v_options = v[0, 0, options_x, options_y]
                    option = np.argmax(v_options)

                if option == 0:
                    target = (s_bar[0], s_bar[1])
                elif option == 1:
                    target = (s_bar[0], s_bar[1] + 1)
                elif option == 2:
                    target = (s_bar[0] + 1, s_bar[1] + 1)
                elif option == 3:
                    target = (s_bar[0] + 1, s_bar[1])
                elif option == 4:
                    target = (s_bar[0] + 1, s_bar[1] - 1)
                elif option == 5:
                    target = (s_bar[0], s_bar[1] - 1)
                elif option == 6:
                    target = (s_bar[0] - 1, s_bar[1] - 1)
                elif option == 7:
                    target = (s_bar[0] - 1, s_bar[1])
                elif option == 8:
                    target = (s_bar[0] - 1, s_bar[1] + 1)
                target = (max(0, min(target[0], self.grid.x_size - 1)),
                          max(0, min(target[1], self.grid.y_size - 1)))

                roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                    start=s_init,
                    goal=target,
                    horizon=self.time_scale,
                    ultimate_goal=goal_pos)

                s_init = l_state

                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))

            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')

            return success

    def train(self):
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)
            print(self.memory.size)
            print(self.eps)

    def test(self):
        g_success = self.simulate_env(mode='test')
        return g_success
class HIROTrainer:
    def __init__(self, iter_num, iter_size, state_dim, action_dim, max_action,
                 batch_size, discount, tau, expl_noise, policy_noise,
                 noise_clip, policy_freq, max_iter, start_time_steps,
                 total_iter, terrain_var, her_var):

        self.iter_num = iter_num
        self.iter_size = iter_size

        kwargs = {
            "state_dim": state_dim,
            "action_dim": action_dim,
            "max_action": max_action,
            "discount": discount,
            "tau": tau,
            "policy_noise": policy_noise * max_action,
            "noise_clip": noise_clip * max_action,
            "policy_freq": policy_freq
        }

        self.action_dim = action_dim

        self.grid = GridWorldContinuous(grid_mode='standard',
                                        terrain=terrain_var)
        self.grid.reset_env_terrain()

        self.manager_policy = TD3(**kwargs)
        self.replay_buffer = utils.ReplayBufferHIRO(state_dim,
                                                    action_dim,
                                                    max_size=int(2e5))

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=[64, 64, 64],
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=5e-4,
                              damping=5e-3,
                              use_fim=False,
                              discount=0.99,
                              imp_weight=False)

        self.manager = Manager()

        self.batch_size = batch_size
        self.expl_noise = expl_noise
        self.expl_noise_start = expl_noise
        self.max_action = max_action

        self.total_steps = 0
        self.max_iter = max_iter
        self.start_time_steps = start_time_steps

        self.manager_time_scale = 2

        self.current_iter = 0
        self.total_iter = total_iter

        self.her_var = her_var

    def roll_out_in_env(self, start, goal, horizon, mode='train'):
        roll_out = Batch()
        done = False
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []

        state_seq = []
        action_seq = []

        s_bar = self.grid.lls2hls(s)
        s_manager = copy.deepcopy(
            np.array(
                list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(goal)))
        r_manager = 0.
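        # Sub-goal selection: uniform in [-2, 2)^2 during the warm-up phase
        # (total_steps < start_time_steps); afterwards the TD3 manager proposes
        # the sub-goal, with Gaussian exploration noise added in train mode.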
        if mode == 'train':
            if self.total_steps < self.start_time_steps:
                sub_goal = (4 * np.random.random((2, ))) - 2
            else:
                sub_goal = (self.manager_policy.select_action(s_manager) +
                            np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)).\
                    clip(-self.max_action, self.max_action)
        else:
            sub_goal = self.manager_policy.select_action(s_manager)

        self.manager.update(s=copy.deepcopy(s), sg=sub_goal)

        s_goal = self.grid.lls2hls(goal)

        for step_i in range(horizon):

            if mode == 'train':
                self.total_steps += 1

            s_bar = self.grid.lls2hls(s)
            s_save = copy.deepcopy(
                np.array(
                    list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                    list(self.manager.target(s))))
            s_list.append(s_save)

            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            state_seq.append(s_save)
            action_seq.append(a)

            s_new, _, _, _, _ = self.grid.step(action=10 * a)
            r = self.manager.reward(s_new)
            a_list.append(a)
            r_list.append(r)

            s_new_bar = self.grid.lls2hls(s_new)
            done = ((s_new_bar[0] == s_goal[0])
                    and (s_new_bar[1] == s_goal[1]))
            r_manager += -1 * float(not done)

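            # The manager acts on a coarser time scale: a new sub-goal is
            # requested every manager_time_scale low-level steps or when the
            # episode terminates.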
            manager_update = (step_i + 1) % self.manager_time_scale == 0

            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [
                    r
                ], [s_new.astype(np.float32)], [
                    0 if
                    ((step_i + 1 == horizon) or done or manager_update) else 1
                ], [not done], [1.0]))

            s = s_new

            if manager_update or done:
                self.total_steps += 1
                s_new_manager = copy.deepcopy(
                    np.array(
                        list(s) + list(self.grid.state_cache[s_new_bar[0],
                                                             s_new_bar[1]]) +
                        list(goal)))
                if mode == 'train':
                    self.replay_buffer.add(s_manager, self.manager.sg,
                                           s_new_manager, r_manager, done,
                                           np.array(state_seq),
                                           np.array(action_seq))
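                    # Hindsight relabeling of the manager transition: substitute
                    # the position actually reached (s[:2]) for the original
                    # goal, with reward -len(state_seq) + 1 and done=True.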
                    if self.her_var:
                        s_manager_her = np.concatenate((s_manager[:12], s[:2]))
                        s_new_manager_her = np.concatenate(
                            (s_new_manager[:12], s[:2]))
                        r_manager_her = -len(state_seq) + 1
                        self.replay_buffer.add(s_manager_her, self.manager.sg,
                                               s_new_manager_her,
                                               r_manager_her, True,
                                               np.array(state_seq),
                                               np.array(action_seq))

                    state_seq = []
                    action_seq = []

                s_manager = s_new_manager
                r_manager = 0.

                if mode == 'train':
                    if self.total_steps < self.start_time_steps:
                        sub_goal = (4 * np.random.random((2, ))) - 2
                    else:
                        sub_goal = (self.manager_policy.select_action(s_manager) +
                                    np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)). \
                            clip(-self.max_action, self.max_action)
                else:
                    sub_goal = self.manager_policy.select_action(s_manager)

                self.manager.update(s=copy.deepcopy(s), sg=sub_goal)

            if done:
                break
        s_save = copy.deepcopy(
            np.array(
                list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(self.manager.target(s))))
        s_list.append(s_save)
        return roll_out, step_i + 1, s_list, a_list, r_list, done

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0

        if mode == 'train':
            while num_steps < self.iter_size:
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]

                roll_out, steps, states, actions, rewards, success = self.roll_out_in_env(
                    start=start_pos,
                    goal=goal_pos,
                    horizon=self.max_iter,
                    mode='train')

                if self.total_steps > self.start_time_steps:
                    for _ in range(40):
                        self.manager_policy.train(self.replay_buffer,
                                                  self.batch_size)

                batch.append(roll_out)
                num_roll_outs += 1
                num_steps += steps

                total_success += success

            return batch, total_success / num_roll_outs, num_steps / num_roll_outs

        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]

            _, steps, states, actions, rewards, success = self.roll_out_in_env(
                start=start_pos,
                goal=goal_pos,
                horizon=self.max_iter,
                mode='test')

        return success

    def train(self):
        for iter_loop in range(self.iter_num):
            self.current_iter += 1
            self.expl_noise = max((1 - (self.current_iter / self.total_iter)) *
                                  self.expl_noise_start, 0)
            batch, g_success, steps_trajectory = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("OC")
            print(self.replay_buffer.size)
            start_t = time.time()
            self.replay_buffer.off_policy_correction(policy=self.policy)
            print(time.time() - start_t)
            print("STATS")
            print(g_success)
            print(steps_trajectory)

    def test(self):
        success = self.simulate_env(mode='test')
        return success
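
# --- Usage sketch (illustrative only) ---------------------------------------
# Not part of the source: a minimal driver for HIROTrainer. Every value below
# is a placeholder assumption; state_dim and action_dim in particular must
# match the GridWorldContinuous observation and action spaces, and the
# GridWorldContinuous / TD3 / utils dependencies must be importable.
if __name__ == '__main__':
    hiro = HIROTrainer(iter_num=50, iter_size=4000, state_dim=14, action_dim=2,
                       max_action=2.0, batch_size=128, discount=0.99, tau=0.005,
                       expl_noise=0.2, policy_noise=0.2, noise_clip=0.5,
                       policy_freq=2, max_iter=400, start_time_steps=10000,
                       total_iter=50, terrain_var=False, her_var=True)
    hiro.train()          # collects roll-outs, trains the TD3 manager and the TRPO worker
    print(hiro.test())    # single evaluation episode; returns the success flag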
Example #16
class TRPOTrainer:
    def __init__(self,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 use_fim=False,
                 use_gpu=False):

        self.grid = ParkingEmpty()
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)

        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim

    def roll_out_in_env(self, start, horizon):
        roll_out = Batch()
        s = start
        s_list = []
        a_list = []
        r_list = []

        success_var = False
        oob = False
        break_var = False

        for step_i in range(horizon):

            target_vec = s['desired_goal'] - s['observation']
            s_save = copy.deepcopy(
                np.array(
                    list(target_vec[:2]) + list(s['observation'][2:4]) +
                    list(target_vec[2:4])))

            s_list.append(copy.deepcopy(s['observation']))
            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            s_new, r, d, info = self.grid.env.step(a)
            success_var = info["is_success"]
            info = not info["is_success"]

            r = 0.0
            if success_var:
                r = 1.0
                info = False
                break_var = True

            break_var = break_var or (not info) or (step_i + 1 == horizon)

            ib = self.grid.check_in_bounds(s_new['observation'])
            if not ib:
                oob = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new['observation'].astype(np.float32)],
                      [0 if (break_var or oob) else 1], [info], [1.0]))

            s = s_new
            if break_var or oob:
                break

        s_list.append(copy.deepcopy(s['observation']))

        return roll_out, s_list, a_list, r_list, success_var

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                s_init = self.grid.reset()

                roll_out, _, _, _, success = self.roll_out_in_env(
                    start=s_init, horizon=self.max_iter)

                jwp = 1.
                num_roll_outs += 1
                num_steps += roll_out.length()
                batch.append(roll_out)

                total_success += success
                j += 1

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()

            roll_out, state_list, action_list, reward_list, success = self.roll_out_in_env(
                start=s_init, horizon=self.max_iter)

            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)

            total_success += success
            j += 1

        return success

    def train(self):
        for iter_loop in range(self.inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        g_success = self.simulate_env(mode='test')
        return g_success
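
# --- Usage sketch (illustrative only) ---------------------------------------
# Not part of the source: a minimal driver for the ParkingEmpty TRPOTrainer
# above. All values are placeholder assumptions; state_dim must match the
# 6-dimensional policy input built in roll_out_in_env (target_vec[:2] +
# observation[2:4] + target_vec[2:4]) and action_dim the ParkingEmpty action.
if __name__ == '__main__':
    trainer = TRPOTrainer(op_mode='train', state_dim=6, action_dim=2,
                          hidden_sizes=[64, 64], max_kl=5e-4, damping=5e-3,
                          batch_size=4000, inner_episodes=100, max_iter=100)
    trainer.train()        # TRPO updates on batches of parking roll-outs
    print(trainer.test())  # single evaluation episode; returns the success flag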
class TRPOTrainer:

    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 use_fim=False,
                 use_gpu=False,
                 terrain_var=False
                 ):

        self.grid = GridWorldContinuous(grid_mode='standard', terrain=terrain_var)
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)

        self.env_mode = env_mode
        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
        roll_out = Batch()
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []
        d = False

        for step_i in range(horizon):
            s_bar = self.grid.lls2hls(s)
            target_vec = goal - s[:2]
            s_save = copy.deepcopy(np.array(list(s[2:]) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                                            list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, d_wp, info = self.grid.step(action=10*a)

            s_bar_cand = self.grid.lls2hls(s_new)

            break_var = False
            success_var = ((s_bar_cand[0] == s_u_goal[0]) and (s_bar_cand[1] == s_u_goal[1]))

            if success_var:
                info = False
                break_var = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)],
                      [s_save.astype(np.float32)],
                      [r],
                      [s_new.astype(np.float32)],
                      [0 if ((not info) or (step_i + 1 == horizon) or break_var) else 1],
                      [info],
                      [1.0]))
            s = s_new
            if (not info) or break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:

                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)

                roll_out, states, actions, rewards, success, l_state = self.roll_out_in_env(start=start_pos,
                                                                                            goal=goal_pos,
                                                                                            ultimate_goal=goal_pos,
                                                                                            horizon=self.max_iter)

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

                jwp = 1.
                num_roll_outs += 1
                num_steps += roll_out.length()
                batch.append(roll_out)

                total_success += success
                j += 1

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)

            roll_out, states, actions, rewards, success, l_state = self.roll_out_in_env(start=start_pos,
                                                                                        goal=goal_pos,
                                                                                        ultimate_goal=goal_pos,
                                                                                        horizon=self.max_iter)

            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)

            total_success += success
            j += 1

            return success

    def train(self):
        for iter_loop in range(self.inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        g_success = self.simulate_env(mode='test')
        return g_success
class TRPOTrainer:

    def __init__(self,
                 env_mode,
                 op_mode,
                 state_dim,
                 action_dim,
                 config,
                 save_path=None
                 ):
        self.config = config
        self.save_path = save_path
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        waypoint=True,
                                        terrain=self.config.terrain_var)
        self.grid.goal = (13., 13.)
        self.grid.reset_env_terrain()

        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val,
                              use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)

        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.episode_steps = 0

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False

        s_list = []
        a_list = []
        r_list = []

        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)

            # GET THE TARGET VECTOR
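            # The waypoint manager (goal_management) supplies a target vector
            # towards the current waypoint; update_way_point below advances it
            # and its return value (break_var_rrt) ends the current segment.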
            target_vec = self.grid.goal_management.get_target_vec(s[:2])
            s_save = copy.deepcopy(np.array(list(s[2:]) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                                            list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, _, info = self.grid.step(action=10*a)
            s_bar_cand = self.grid.lls2hls(s_new)

            success_var = ((s_bar_cand[0] == s_u_goal[0]) and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True

            break_var_rrt = self.grid.goal_management.update_way_point(self.grid, (s_new[0], s_new[1]), d)
            break_var = break_var or break_var_rrt or (not info) or (step_i + 1 == horizon)

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)],
                      [s_save.astype(np.float32)],
                      [r],
                      [s_new.astype(np.float32)],
                      [0 if break_var else 1],
                      [info],
                      [1.0]))
            s = s_new
            if break_var:
                break

        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.config.policy_batch_size:

                """ INITIALIZE THE ENVIRONMENT """
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0

                path_segment_len_list = self.grid.goal_management.reset(start_pos, goal_pos, self.grid)
                self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]

                """ START THE EPISODE """
                horizon_left = self.config.time_horizon
                success = False

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    curr_goal = \
                        self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                    roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                        start=s_init,
                        goal=curr_goal,
                        horizon=horizon_left,
                        ultimate_goal=goal_pos,
                        mode='train'
                    )

                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

                    batch.append(roll_out)

                total_success += success
                j += 1

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0

            path_segment_len_list = self.grid.goal_management.reset(start_pos, goal_pos, self.grid)
            self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]

            """ START THE EPISODE """
            horizon_left = self.config.time_horizon
            success = False

            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                curr_goal = \
                    self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                    start=s_init,
                    goal=curr_goal,
                    horizon=horizon_left,
                    ultimate_goal=goal_pos,
                    mode='test'
                )

                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

            return success

    def train(self):
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        g_success = self.simulate_env(mode='test')
        return g_success
Example #19
class TRPOTrainer:
    def __init__(self,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 optimistic_model,
                 use_fim=False,
                 use_gpu=False):

        self.grid = ParkingEmpty()
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)
        self.buffer = Buffer()
        gx = self.grid.x_size
        gy = self.grid.y_size
        self.vi = ValueIteration33SyncNN(grid_size_x=gx,
                                         grid_size_y=gy,
                                         gamma=self.optimizer.discount)

        self.op_mode = op_mode
        self.optimistic_model = optimistic_model
        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 2
        self.episode_steps = 0

    def roll_out_in_env(self, start, horizon):
        roll_out = Batch()
        s = start
        s_list = []
        a_list = []
        r_list = []

        d = False
        oob = False
        break_var = False
        s_u_goal = self.grid.lls2hls(s['observation'])
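        # Sub-goal cell: shift the current observation by the VI target vector
        # (the two zeros leave observation[2:4] unchanged) and discretise the
        # result with lls2hls.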
        target_vec = self.vi.get_target_vec(s['observation'])
        goal = self.grid.lls2hls(
            s['observation'] +
            np.array(list(target_vec[:2]) + [0, 0] + list(target_vec[2:4])))

        for step_i in range(horizon):
            self.episode_steps += 1

            target_vec = self.vi.get_target_vec(s['observation'])
            s_save = copy.deepcopy(
                np.array(
                    list(target_vec[:2]) + list(s['observation'][2:4]) +
                    list(target_vec[2:4])))

            s_list.append(copy.deepcopy(s['observation']))
            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            s_new, r, d, info = self.grid.env.step(a)
            s_new_bar = self.grid.lls2hls(s_new['observation'])
            success_var = info["is_success"]
            info = not info["is_success"]

            d = (s_new_bar[0] == goal[0]) and (s_new_bar[1]
                                               == goal[1]) and (s_new_bar[2]
                                                                == goal[2])
            r = 0.0
            if d:
                r = 1.0
                info = False

            if success_var:
                info = False
                break_var = True

            break_var = break_var or (not info) or (step_i + 1 == horizon) or (
                self.episode_steps == self.max_iter)

            ib = self.grid.check_in_bounds(s_new['observation'])
            if not ib:
                oob = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new['observation'].astype(np.float32)],
                      [0 if (break_var or oob) else 1], [info], [1.0]))

            s = s_new
            if break_var or oob:
                break

        s_list.append(copy.deepcopy(s['observation']))

        return roll_out, s_list, a_list, r_list, d, success_var, s_new, s_new_bar, oob

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                s_init = self.grid.reset()
                s_start = self.grid.lls2hls(s_init['observation'])
                s_goal = self.grid.lls2hls(s_init['desired_goal'])
                self.episode_steps = 0
                """ V MAP """
                if self.optimistic_model:
                    self.vi.update_p_table_optimistic(
                        occupancy_map=self.grid.occupancy_map, walls=False)
                else:
                    self.vi.update_p_table(
                        occupancy_map=self.grid.occupancy_map, walls=True)

                v, pi = self.vi.run_vi(grid=self.grid,
                                       goal=(s_goal[0], s_goal[1], s_goal[2]))
                """ START THE EPISODE """
                horizon_left = self.max_iter
                success = False

                hl_s_list = []
                hl_s_new_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
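                    # Epsilon-greedy over high-level actions: eps decays from
                    # 1.0 towards 0.01 with a time constant of 10000 DQN steps.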
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / 10000)

                    s_bar = self.grid.lls2hls(s_init['observation'])
                    hl_s_list.append(s_bar)

                    if torch.rand(1)[0] > self.eps:
                        a_bar = pi[s_bar[0], s_bar[1], s_bar[2]]
                    else:
                        a_bar = (randint(0, 7), randint(0, 7))

                    self.vi.set_target(s_bar, a_bar)

                    roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                        horizon=self.time_scale, start=s_init)

                    hl_s_new_list.append(s_bar_p)
                    hl_a_list.append(a_bar)

                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                    if oob:
                        break

                total_success += success
                j += 1

                if not self.optimistic_model:
                    x_temp, y_temp = self.vi.generate_dataset_flat(
                        hl_s_list, hl_a_list, hl_s_new_list)

                    for bi in range(x_temp.shape[0]):
                        self.buffer.add(x_temp[bi], y_temp[bi])

                    self.vi.train_net(buffer=self.buffer,
                                      bs=128,
                                      opt_iterations=40)

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()
            s_start = self.grid.lls2hls(s_init['observation'])
            s_goal = self.grid.lls2hls(s_init['desired_goal'])
            self.episode_steps = 0
            """ V MAP """
            if self.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=False)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.occupancy_map,
                                       walls=True)

            v, pi = self.vi.run_vi(grid=self.grid,
                                   goal=(s_goal[0], s_goal[1], s_goal[2]))
            """ START THE EPISODE """
            horizon_left = self.max_iter
            success = False

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                s_bar = self.grid.lls2hls(s_init['observation'])
                a_bar = pi[s_bar[0], s_bar[1], s_bar[2]]

                self.vi.set_target(s_bar, a_bar)

                roll_out, states, actions, rewards, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                    horizon=self.time_scale, start=s_init)

                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                if oob:
                    break

            return success

    def train(self):
        for iter_loop in range(self.inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        g_success = self.simulate_env(mode='test')
        return g_success
class TRPOTrainer:

    def __init__(self,
                 op_mode,
                 state_dim,
                 action_dim,
                 hidden_sizes,
                 max_kl,
                 damping,
                 batch_size,
                 inner_episodes,
                 max_iter,
                 use_fim=False,
                 use_gpu=False
                 ):

        self.grid = ParkingEmpty()
        self.policy = ContinuousMLP(state_dim,
                                    action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu,
                                    is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy,
                              use_gpu=use_gpu,
                              max_kl=max_kl,
                              damping=damping,
                              use_fim=use_fim,
                              discount=0.99,
                              imp_weight=False)
        self.mvprop = MVPROPFAT3D(k=100).cuda()
        self.mvprop_target = copy.deepcopy(self.mvprop).cuda()
        self.memory = ReplayBufferVIN3D(3, 1, self.grid.x_size, 35000)
        self.mvprop_optimizer = MVPROPOptimizer3D(self.mvprop,
                                                  self.mvprop_target,
                                                  self.memory,
                                                  0.99,
                                                  128,
                                                  3e-4,
                                                  100)

        self.op_mode = op_mode

        self.batch_size = batch_size
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter
        self.state_dim = state_dim
        self.dqn_steps = 0
        self.eps = 1.0
        self.time_scale = 2
        self.episode_steps = 0

    def bound_x(self, input_var):
        return max(0, min(input_var, self.grid.x_size - 1))

    def bound_y(self, input_var):
        return max(0, min(input_var, self.grid.y_size - 1))

    def roll_out_in_env(self, start, target, horizon):
        roll_out = Batch()
        s = start
        s_list = []
        a_list = []
        r_list = []

        d = False
        oob = False
        break_var = False
        target_vec = target_2_target_vec(target, s['observation'])
        goal = self.grid.lls2hls(s['observation'] + np.array(list(target_vec[:2]) + [0, 0] + list(target_vec[2:4])))

        for step_i in range(horizon):
            self.episode_steps += 1

            target_vec = target_2_target_vec(target, s['observation'])
            s_save = copy.deepcopy(np.array(list(target_vec[:2]) + list(s['observation'][2:4]) + list(target_vec[2:4])))

            s_list.append(copy.deepcopy(s['observation']))
            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            s_new, r, d, info = self.grid.env.step(a)
            s_new_bar = self.grid.lls2hls(s_new['observation'])
            success_var = info["is_success"]
            info = not info["is_success"]

            d = (s_new_bar[0] == goal[0]) and (s_new_bar[1] == goal[1]) and (s_new_bar[2] == goal[2])
            r = 0.0
            if d:
                r = 1.0
                info = False

            if success_var:
                info = False
                break_var = True

            break_var = break_var or (not info) or (step_i + 1 == horizon) or (self.episode_steps == self.max_iter)

            ib = self.grid.check_in_bounds(s_new['observation'])
            if not ib:
                oob = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)],
                      [s_save.astype(np.float32)],
                      [r],
                      [s_new['observation'].astype(np.float32)],
                      [0 if (break_var or oob) else 1],
                      [info],
                      [1.0]))

            s = s_new
            if break_var or oob:
                break

        s_list.append(copy.deepcopy(s['observation']))

        return roll_out, s_list, a_list, r_list, d, success_var, s_new, s_new_bar, oob

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:

                """ INITIALIZE THE ENVIRONMENT """
                s_init = self.grid.reset()
                s_start = self.grid.lls2hls(s_init['observation'])
                s_goal = self.grid.lls2hls(s_init['desired_goal'])
                self.episode_steps = 0

                """ IMAGE INPUT """
                image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
                image[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
                image[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
                image[0, 1, s_goal[0], s_goal[1], s_goal[2]] = 0
                image = torch.from_numpy(image).float().cuda()

                with torch.no_grad():
                    v = self.mvprop_optimizer.mvprop(image)
                    v = v.cpu().detach()

                """ START THE EPISODE """
                horizon_left = self.max_iter
                success = False

                s_bar = self.grid.lls2hls(s_init['observation'])
                hl_s_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                hl_s_list.append(s_bar)

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(-1. * self.dqn_steps / 10000)
                    s_bar = self.grid.lls2hls(s_init['observation'])
                    hl_s_list.append(s_bar)

                    if torch.rand(1)[0] > self.eps:
                        with torch.no_grad():
                            # 72 candidate high-level targets: the 9 planar moves
                            # (stay plus the 8-connected neighbours of s_bar) for
                            # each of the 8 orientation bins.
                            dx = [0, 0, 1, 1, 1, 0, -1, -1, -1]
                            dy = [0, 1, 1, 0, -1, -1, -1, 0, 1]
                            options_x = [s_bar[0] + d for _ in range(8) for d in dx]
                            options_y = [s_bar[1] + d for _ in range(8) for d in dy]
                            options_z = [z for z in range(8) for _ in range(9)]
                            options_x = [self.bound_x(e) for e in options_x]
                            options_y = [self.bound_y(e) for e in options_y]
                            v_options = v[0, 0, options_x, options_y, options_z]
                            option = np.argmax(v_options)
                    else:
                        option = randint(0, 71)

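                    # Decode the flat option index in [0, 71]: option // 9 gives
                    # the target orientation bin, option % 9 the planar move
                    # (stay or one of the 8 neighbours).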
                    option_o = np.floor(option / 9)
                    option_p = option % 9

                    if option_p == 0:
                        target_p = (s_bar[0], s_bar[1])
                    elif option_p == 1:
                        target_p = (s_bar[0], s_bar[1] + 1)
                    elif option_p == 2:
                        target_p = (s_bar[0] + 1, s_bar[1] + 1)
                    elif option_p == 3:
                        target_p = (s_bar[0] + 1, s_bar[1])
                    elif option_p == 4:
                        target_p = (s_bar[0] + 1, s_bar[1] - 1)
                    elif option_p == 5:
                        target_p = (s_bar[0], s_bar[1] - 1)
                    elif option_p == 6:
                        target_p = (s_bar[0] - 1, s_bar[1] - 1)
                    elif option_p == 7:
                        target_p = (s_bar[0] - 1, s_bar[1])
                    elif option_p == 8:
                        target_p = (s_bar[0] - 1, s_bar[1] + 1)
                    target_p = (max(0, min(target_p[0], self.grid.x_size - 1)),
                                max(0, min(target_p[1], self.grid.y_size - 1)))
                    target = (target_p[0], target_p[1], int(option_o))

                    roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                        horizon=self.time_scale,
                        start=s_init,
                        target=target)

                    s_init = l_state

                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(option)

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state['observation'])
                    success_tile = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]) and (st_bar[2] == s_goal[2]))
                    if success_tile:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                    if oob:
                        break

                total_success += success
                j += 1

                ### ADD TRANSITIONS TO BUFFER
                for ep_idx in range(len(hl_a_list)):

                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx], hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                    hl_d_list[ep_idx], image)

                    if True:  # hindsight goal transitions are always added in this variant
                        ### GET THE HINDSIGHT GOAL TRANSITION
                        image_her = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
                        image_her[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
                        image_her[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
                        image_her[0, 1, hl_s_list[-1][0], hl_s_list[-1][1], hl_s_list[-1][2]] = 0
                        image_her = torch.from_numpy(image_her).float().cuda()

                        if (hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0]) and \
                                (hl_s_list[ep_idx + 1][1] == hl_s_list[-1][1]) and \
                                (hl_s_list[ep_idx + 1][2] == hl_s_list[-1][2]):
                            hgt_reward = 0
                            hgt_done = True
                        else:
                            hgt_reward = -1
                            hgt_done = False

                        self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx], hl_s_list[ep_idx + 1], hgt_reward,
                                        hgt_done, image_her)

                ### OPTIMIZE NETWORK PARAMETERS
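                # Take 40 gradient steps on the MVProp value network after each collected episode.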
                for _ in range(40):
                    self.mvprop_optimizer.train(self.max_iter / self.time_scale)

                # TARGET NET UPDATE (soft / Polyak update; the modulus of 1 means the
                # target MVProp network is updated after every optimization cycle)
                if self.dqn_steps % 1 == 0:
                    tau = 0.05
                    for param, target_param in zip(self.mvprop_optimizer.mvprop.parameters(),
                                                   self.mvprop_optimizer.target_mvprop.parameters()):
                        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:

            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()
            s_start = self.grid.lls2hls(s_init['observation'])
            s_goal = self.grid.lls2hls(s_init['desired_goal'])
            self.episode_steps = 0

            """ IMAGE INPUT """
            image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
            image[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
            image[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
            image[0, 1, s_goal[0], s_goal[1], s_goal[2]] = 0
            image = torch.from_numpy(image).float().cuda()
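            # Channel 0 of the planner input is all ones; channel 1 is -1 everywhere
            # except 0 at the goal cell.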

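            # A single forward pass of the target MVProp network yields the value map
            # used greedily for option selection throughout this evaluation episode.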
            with torch.no_grad():
                v = self.mvprop_optimizer.target_mvprop(image)
                v = v.cpu().detach()

            """ START THE EPISODE """
            horizon_left = self.max_iter
            success = False

            s_bar = self.grid.lls2hls(s_init['observation'])
            hl_s_list = []
            hl_a_list = []
            hl_r_list = []
            hl_d_list = []
            hl_s_list.append(s_bar)

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                s_bar = self.grid.lls2hls(s_init['observation'])
                hl_s_list.append(s_bar)

                with torch.no_grad():
                    # Enumerate all 72 candidate options: 8 values of the third (z)
                    # state component x 9 neighbouring tiles, using the same offset
                    # ordering as the option_p -> target mapping below.
                    dxs = [0, 0, 1, 1, 1, 0, -1, -1, -1]
                    dys = [0, 1, 1, 0, -1, -1, -1, 0, 1]
                    options_x = [s_bar[0] + dx for _ in range(8) for dx in dxs]
                    options_y = [s_bar[1] + dy for _ in range(8) for dy in dys]
                    options_z = [z for z in range(8) for _ in range(9)]
                    options_x = [self.bound_x(e) for e in options_x]
                    options_y = [self.bound_y(e) for e in options_y]
                    v_options = v[0, 0, options_x, options_y, options_z]
                    option = np.argmax(v_options)

                # Same option decomposition and offset lookup as in the training branch.
                option_o, option_p = divmod(int(option), 9)

                offsets = [(0, 0), (0, 1), (1, 1), (1, 0), (1, -1),
                           (0, -1), (-1, -1), (-1, 0), (-1, 1)]
                dx, dy = offsets[option_p]
                target_p = (s_bar[0] + dx, s_bar[1] + dy)
                target_p = (max(0, min(target_p[0], self.grid.x_size - 1)),
                            max(0, min(target_p[1], self.grid.y_size - 1)))
                target = (target_p[0], target_p[1], int(option_o))

                roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                    horizon=self.time_scale,
                    start=s_init,
                    target=target)

                s_init = l_state

                hl_s_list.append(s_bar_p)
                hl_a_list.append(option)

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state['observation'])
                success_tile = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]) and (st_bar[2] == s_goal[2]))
                if success_tile:
                    hl_r_list.append(0)
                    hl_d_list.append(True)
                else:
                    hl_r_list.append(-1)
                    hl_d_list.append(False)

                if oob:
                    break

            j = 1.

            return success

    def train(self):
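        # Collect one batch of option roll-outs per inner episode, update the
        # low-level policy via self.optimizer.process_batch, and log goal /
        # waypoint success rates and step counts.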
        for iter_loop in range(self.inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S: {}  WP_S: {}  G_ST: {}  WP_ST: {}".format(
                g_success, wp_success, steps_trajectory, steps_wp))

    def test(self):
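        # Run a single evaluation episode (greedy option selection) and return
        # whether the final goal was reached.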
        g_success = self.simulate_env(mode='test')
        return g_success