class TRPOTrainer:
    """Hierarchical TRPO trainer on a custom maze layout.

    A high-level controller picks neighbouring grid cells as sub-goals using
    an epsilon-greedy policy over a value-iteration planner
    (``ValueIteration4M1SyncNN8``); a low-level continuous TRPO policy is then
    rolled out toward each sub-goal for ``time_scale`` steps.  High-level
    transitions are accumulated in ``self.buffer`` and used to train the
    planner's learned transition model (unless ``config.optimistic_model``).
    """

    def __init__(self, env_mode, op_mode, state_dim, action_dim, config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        # Custom maze layout/objects are loaded from .npy files in the CWD.
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        terrain=self.config.terrain_var)
        self.grid.goal = (13., 13.)
        self.grid.reset_env_terrain()
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val, use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.buffer = Buffer()
        # Planner operates on a fixed 21x21 high-level grid.
        self.vi = ValueIteration4M1SyncNN8(grid_size_x=21, grid_size_y=21,
                                           gamma=self.optimizer.discount)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0      # high-level decisions taken; drives eps decay
        self.eps = 1.0          # epsilon for high-level exploration
        self.time_scale = 2     # low-level steps per high-level decision
        self.episode_steps = 0  # low-level steps in the current episode

    def bound_x(self, input_var):
        """Clamp an x grid index into ``[0, grid.x_size - 1]``."""
        return max(0, min(input_var, self.grid.x_size - 1))

    def bound_y(self, input_var):
        """Clamp a y grid index into ``[0, grid.y_size - 1]``."""
        return max(0, min(input_var, self.grid.y_size - 1))

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
        """Roll out the low-level policy toward a high-level target cell.

        NOTE(review): assumes ``start`` is a flat state whose first two
        entries are the (x, y) position — confirm against
        ``GridWorldContinuous``.

        Returns ``(roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand)``
        where ``d`` flags sub-goal attainment and ``s_bar_cand`` is the final
        high-level cell.  Raises NameError if ``horizon`` < 1 (``s_new`` would
        be unbound); callers always pass a positive horizon.
        """
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False
        s_list = []
        a_list = []
        r_list = []
        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)
            # GET THE TARGET VECTOR (relative offset to the sub-goal cell)
            target_vec = (goal[0] - s[0], goal[1] - s[1])
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, _, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            # Waypoint reached: reward 1 and mark transition terminal.
            d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
            if d:
                info = False
                r = 1.0
            # Ultimate goal reached: terminate the whole rollout.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            break_var = (break_var or (not info) or (step_i + 1 == horizon)
                         or (self.episode_steps == self.config.time_horizon))
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if break_var else 1], [info], [1.0]))
            s = s_new
            if break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        """Run episodes; train mode also trains the planner model.

        Train mode returns ``(batch, goal success rate, waypoint success rate,
        steps per episode, steps per rollout)``; test mode returns a single
        episode's success flag.
        """
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.config.policy_batch_size:
                # INITIALIZE THE ENVIRONMENT
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                # V MAP: plan over either the optimistic occupancy map or the
                # true terrain map.
                if self.config.optimistic_model:
                    self.vi.update_p_table_optimistic(
                        occupancy_map=self.grid.occupancy_map, walls=True)
                else:
                    self.vi.update_p_table(
                        occupancy_map=self.grid.terrain_map, walls=True)
                v, pi = self.vi.run_vi(grid=self.grid,
                                       goal=(s_goal[0], s_goal[1]))
                # START THE EPISODE
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False
                s_bar = self.grid.lls2hls(st)
                hl_s_list = [s_bar]
                hl_a_list = []
                hl_r_list = []  # NOTE(review): collected but not consumed here
                hl_d_list = []  # NOTE(review): collected but not consumed here
                while (horizon_left > 0) and not success:
                    # GET THE TARGET VECTOR (epsilon-greedy over the planner)
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / 10000)
                    s_bar = self.grid.lls2hls(st)
                    if torch.rand(1)[0] > self.eps:
                        a_bar = int(pi[s_bar[0], s_bar[1]])
                    else:
                        a_bar = randint(0, 7)
                    self.vi.set_target(s_bar, a_bar)
                    curr_goal = self.vi.get_target()
                    roll_out, _, _, _, wp_success, l_state, s_bar_p = \
                        self.roll_out_in_env(start=s_init, goal=curr_goal,
                                             horizon=self.time_scale,
                                             ultimate_goal=goal_pos,
                                             mode='train')
                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(a_bar)
                    st = l_state[:2]
                    s_init = l_state
                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()
                    total_wp_success += wp_success
                    jwp += 1
                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)
                    batch.append(roll_out)
                total_success += success
                j += 1
                # Train the planner's transition model on the observed
                # high-level trajectory (skipped for the optimistic model).
                if not self.config.optimistic_model:
                    x_temp, y_temp, w_temp = self.vi.generate_dataset_flat(
                        self.grid.terrain_map, hl_s_list, hl_a_list)
                    for bi in range(x_temp.shape[0]):
                        self.buffer.add(x_temp[bi], y_temp[bi], w_temp[bi])
                    self.vi.train_net(buffer=self.buffer, bs=128,
                                      opt_iterations=40, rw=True)
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            # Single greedy evaluation episode.
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            # V MAP
            if self.config.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=True)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                       walls=True)
            v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))
            horizon_left = self.config.time_horizon
            st = start_pos
            success = False
            s_bar = self.grid.lls2hls(st)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR (always greedy at test time)
                a_bar = int(pi[s_bar[0], s_bar[1]])
                self.vi.set_target(s_bar, a_bar)
                curr_goal = self.vi.get_target()
                roll_out, states, actions, rewards, wp_success, l_state, _ = \
                    self.roll_out_in_env(start=s_init, goal=curr_goal,
                                         horizon=self.time_scale,
                                         ultimate_goal=goal_pos, mode='test')
                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
            return success

    def train(self):
        """Alternate environment simulation and TRPO policy updates."""
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)
            print(self.buffer.size)
            print(self.eps)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success
class TRPOTrainer:
    """Flat (non-hierarchical) TRPO trainer on the standard grid world.

    A single continuous TRPO policy is rolled out directly toward the episode
    goal — no high-level controller, no sub-goals.
    """

    def __init__(self, env_mode, op_mode, state_dim, action_dim, hidden_sizes,
                 max_kl, damping, batch_size, inner_episodes, max_iter,
                 use_fim=False, use_gpu=False, terrain_var=False):
        self.grid = GridWorldContinuous(grid_mode='standard',
                                        terrain=terrain_var)
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=use_gpu,
                              max_kl=max_kl, damping=damping, use_fim=use_fim,
                              discount=0.99, imp_weight=False)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.batch_size = batch_size        # min env steps per training batch
        self.inner_episodes = inner_episodes
        self.max_iter = max_iter            # per-episode rollout horizon
        self.state_dim = state_dim

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
        """Roll out the policy from ``start`` toward ``goal``.

        NOTE(review): assumes ``goal - s[:2]`` broadcasts, i.e. ``goal`` is an
        array-like (x, y) — confirm against the grid's sampling helpers.

        Returns ``(roll_out, s_list, a_list, r_list, d, s_new)``.
        """
        roll_out = Batch()
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []
        d = False
        for step_i in range(horizon):
            s_bar = self.grid.lls2hls(s)
            target_vec = goal - s[:2]
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, d_wp, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            break_var = False
            # Terminate when the ultimate goal cell is reached.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if ((not info) or (step_i + 1 == horizon)
                             or break_var) else 1],
                      [info], [1.0]))
            s = s_new
            if (not info) or break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new

    def simulate_env(self, mode):
        """Collect a training batch, or run one evaluation episode.

        Train mode returns ``(batch, success rate, wp success rate, steps per
        episode, steps per rollout)``; test mode returns the success flag.
        NOTE(review): ``total_wp_success`` is never incremented, so the train
        return's second ratio is always 0 — kept for interface compatibility.
        """
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.batch_size:
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                roll_out, states, actions, rewards, success, l_state = \
                    self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                         ultimate_goal=goal_pos,
                                         horizon=self.max_iter)
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
                jwp = 1.
                num_roll_outs += 1
                num_steps += roll_out.length()
                batch.append(roll_out)
                total_success += success
                j += 1
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            roll_out, states, actions, rewards, success, l_state = \
                self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                     ultimate_goal=goal_pos,
                                     horizon=self.max_iter)
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)
            total_success += success
            j += 1
            return success

    def train(self):
        """Alternate batch collection and TRPO policy updates."""
        for iter_loop in range(self.inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success
class HIROTrainer:
    """HIRO-style hierarchical trainer: TD3 manager + TRPO worker.

    The manager emits a 2-D sub-goal every ``manager_time_scale`` low-level
    steps; the worker is rewarded by ``Manager.reward`` for approaching it.
    Manager transitions (with the raw worker state/action sequences, needed
    later for off-policy correction) go to ``ReplayBufferHIRO``; optionally a
    hindsight-relabelled (HER) copy is stored as well.
    """

    def __init__(self, iter_num, iter_size, state_dim, action_dim, max_action,
                 batch_size, discount, tau, expl_noise, policy_noise,
                 noise_clip, policy_freq, max_iter, start_time_steps,
                 total_iter, terrain_var, her_var):
        self.iter_num = iter_num
        self.iter_size = iter_size
        kwargs = {
            "state_dim": state_dim,
            "action_dim": action_dim,
            "max_action": max_action,
            "discount": discount,
            "tau": tau,
            # TD3 smoothing noise is scaled by the action range.
            "policy_noise": policy_noise * max_action,
            "noise_clip": noise_clip * max_action,
            "policy_freq": policy_freq,
        }
        self.action_dim = action_dim
        self.grid = GridWorldContinuous(grid_mode='standard',
                                        terrain=terrain_var)
        self.grid.reset_env_terrain()
        self.manager_policy = TD3(**kwargs)
        self.replay_buffer = utils.ReplayBufferHIRO(state_dim, action_dim,
                                                    max_size=int(2e5))
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=[64, 64, 64],
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False, max_kl=5e-4,
                              damping=5e-3, use_fim=False, discount=0.99,
                              imp_weight=False)
        self.manager = Manager()
        self.batch_size = batch_size
        self.expl_noise = expl_noise
        self.expl_noise_start = expl_noise  # kept for linear decay in train()
        self.max_action = max_action
        self.total_steps = 0
        self.max_iter = max_iter
        self.start_time_steps = start_time_steps  # random warm-up steps
        self.manager_time_scale = 2  # low-level steps per manager decision
        self.current_iter = 0
        self.total_iter = total_iter
        self.her_var = her_var

    def roll_out_in_env(self, start, goal, horizon, mode='train'):
        """Run one episode; worker follows manager sub-goals.

        Returns ``(roll_out, steps taken, s_list, a_list, r_list, done)``.
        NOTE(review): assumes ``horizon`` >= 1 (``step_i`` would otherwise be
        unbound at the return).
        """
        roll_out = Batch()
        done = False
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []
        state_seq = []   # worker states since the last manager decision
        action_seq = []  # worker actions since the last manager decision
        s_bar = self.grid.lls2hls(s)
        s_manager = copy.deepcopy(
            np.array(list(s)
                     + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                     + list(goal)))
        r_manager = 0.
        # Initial sub-goal: random during warm-up, else noisy manager action
        # (deterministic at test time).
        if mode == 'train':
            if self.total_steps < self.start_time_steps:
                sub_goal = (4 * np.random.random((2,))) - 2
            else:
                sub_goal = (self.manager_policy.select_action(s_manager)
                            + np.random.normal(
                                0, self.max_action * self.expl_noise,
                                size=self.action_dim)).clip(
                                    -self.max_action, self.max_action)
        else:
            sub_goal = self.manager_policy.select_action(s_manager)
        self.manager.update(s=copy.deepcopy(s), sg=sub_goal)
        s_goal = self.grid.lls2hls(goal)
        for step_i in range(horizon):
            if mode == 'train':
                self.total_steps += 1
            s_bar = self.grid.lls2hls(s)
            s_save = copy.deepcopy(
                np.array(list(s)
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(self.manager.target(s))))
            s_list.append(s_save)
            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)
            state_seq.append(s_save)
            action_seq.append(a)
            s_new, _, _, _, _ = self.grid.step(action=10 * a)
            # Worker reward is intrinsic (distance-to-sub-goal via Manager).
            r = self.manager.reward(s_new)
            a_list.append(a)
            r_list.append(r)
            s_new_bar = self.grid.lls2hls(s_new)
            done = ((s_new_bar[0] == s_goal[0])
                    and (s_new_bar[1] == s_goal[1]))
            r_manager += -1 * float(not done)
            manager_update = (step_i + 1) % self.manager_time_scale == 0
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if ((step_i + 1 == horizon) or done
                             or manager_update) else 1],
                      [not done], [1.0]))
            s = s_new
            if manager_update or done:
                self.total_steps += 1
                s_new_manager = copy.deepcopy(
                    np.array(list(s)
                             + list(self.grid.state_cache[s_new_bar[0],
                                                          s_new_bar[1]])
                             + list(goal)))
                if mode == 'train':
                    self.replay_buffer.add(s_manager, self.manager.sg,
                                           s_new_manager, r_manager, done,
                                           np.array(state_seq),
                                           np.array(action_seq))
                    if self.her_var:
                        # Hindsight relabel: pretend the reached position was
                        # the goal.  NOTE(review): the [:12] slice presumes a
                        # 12-entry non-goal prefix in the manager state —
                        # confirm against state_dim.
                        s_manager_her = np.concatenate(
                            (s_manager[:12], s[:2]))
                        s_new_manager_her = np.concatenate(
                            (s_new_manager[:12], s[:2]))
                        r_manager_her = -len(state_seq) + 1
                        self.replay_buffer.add(s_manager_her, self.manager.sg,
                                               s_new_manager_her,
                                               r_manager_her, True,
                                               np.array(state_seq),
                                               np.array(action_seq))
                state_seq = []
                action_seq = []
                s_manager = s_new_manager
                r_manager = 0.
                # Pick the next sub-goal.
                if mode == 'train':
                    if self.total_steps < self.start_time_steps:
                        sub_goal = (4 * np.random.random((2,))) - 2
                    else:
                        sub_goal = (self.manager_policy.select_action(s_manager)
                                    + np.random.normal(
                                        0, self.max_action * self.expl_noise,
                                        size=self.action_dim)).clip(
                                            -self.max_action, self.max_action)
                else:
                    sub_goal = self.manager_policy.select_action(s_manager)
                self.manager.update(s=copy.deepcopy(s), sg=sub_goal)
            if done:
                break
        # Append the terminal worker state for completeness.
        s_save = copy.deepcopy(
            np.array(list(s)
                     + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                     + list(self.manager.target(s))))
        s_list.append(s_save)
        return roll_out, step_i + 1, s_list, a_list, r_list, done

    def simulate_env(self, mode):
        """Collect a worker batch and train the manager, or evaluate once."""
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        if mode == 'train':
            while num_steps < self.iter_size:
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                roll_out, steps, states, actions, rewards, success = \
                    self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                         horizon=self.max_iter, mode='train')
                # Train the TD3 manager once warm-up is over.
                if self.total_steps > self.start_time_steps:
                    for _ in range(40):
                        self.manager_policy.train(self.replay_buffer,
                                                  self.batch_size)
                batch.append(roll_out)
                num_roll_outs += 1
                num_steps += steps
                total_success += success
            return (batch, total_success / num_roll_outs,
                    num_steps / num_roll_outs)
        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            _, steps, states, actions, rewards, success = \
                self.roll_out_in_env(start=start_pos, goal=goal_pos,
                                     horizon=self.max_iter, mode='test')
            return success

    def train(self):
        """Outer loop: decay exploration noise, collect, update both levels."""
        for iter_loop in range(self.iter_num):
            self.current_iter += 1
            # Linear decay of manager exploration noise to zero.
            self.expl_noise = max(
                (1 - (self.current_iter / self.total_iter))
                * self.expl_noise_start, 0)
            batch, g_success, steps_trajectory = self.simulate_env(
                mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("OC")
            print(self.replay_buffer.size)
            start_t = time.time()
            # Relabel stored manager actions to match the current worker
            # policy (HIRO off-policy correction).
            self.replay_buffer.off_policy_correction(policy=self.policy)
            print(time.time() - start_t)
            print("STATS")
            print(g_success)
            print(steps_trajectory)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        success = self.simulate_env(mode='test')
        return success
class TRPOTrainer:
    """TRPO trainer with waypoint guidance on a custom maze.

    High-level sub-goals come from the grid's ``goal_management`` planner
    (a precomputed waypoint path, advanced via ``update_way_point``); the
    low-level continuous TRPO policy is rolled out waypoint to waypoint.
    """

    def __init__(self, env_mode, op_mode, state_dim, action_dim, config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        # Custom maze layout/objects are loaded from .npy files in the CWD;
        # waypoint=True enables the goal_management planner.
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        waypoint=True,
                                        terrain=self.config.terrain_var)
        self.grid.goal = (13., 13.)
        self.grid.reset_env_terrain()
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val, use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.episode_steps = 0

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
        """Roll out the policy toward the current waypoint.

        The target vector comes from ``goal_management`` rather than ``goal``
        directly, and the rollout also ends when ``update_way_point`` signals
        a waypoint switch.  Returns
        ``(roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand)``.
        """
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False
        s_list = []
        a_list = []
        r_list = []
        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)
            # GET THE TARGET VECTOR (toward the active waypoint)
            target_vec = self.grid.goal_management.get_target_vec(s[:2])
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, _, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            # Ultimate goal reached: terminate the rollout.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            # Let the planner advance the waypoint; it may also request a cut.
            break_var_rrt = self.grid.goal_management.update_way_point(
                self.grid, (s_new[0], s_new[1]), d)
            break_var = (break_var or break_var_rrt or (not info)
                         or (step_i + 1 == horizon))
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if break_var else 1], [info], [1.0]))
            s = s_new
            if break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        """Collect a training batch, or run one evaluation episode."""
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.config.policy_batch_size:
                # INITIALIZE THE ENVIRONMENT
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                path_segment_len_list = self.grid.goal_management.reset(
                    start_pos, goal_pos, self.grid)
                self.grid.old_distance = \
                    self.grid.goal_management.path_segment_len_list[0]
                # START THE EPISODE
                horizon_left = self.config.time_horizon
                success = False
                while (horizon_left > 0) and not success:
                    # GET THE TARGET VECTOR (current waypoint on the path)
                    curr_goal = self.grid.goal_management.way_points[
                        self.grid.goal_management.way_point_current]
                    roll_out, _, _, _, wp_success, l_state, s_bar_p = \
                        self.roll_out_in_env(start=s_init, goal=curr_goal,
                                             horizon=horizon_left,
                                             ultimate_goal=goal_pos,
                                             mode='train')
                    s_init = l_state
                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()
                    total_wp_success += wp_success
                    jwp += 1
                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    batch.append(roll_out)
                total_success += success
                j += 1
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            path_segment_len_list = self.grid.goal_management.reset(
                start_pos, goal_pos, self.grid)
            self.grid.old_distance = \
                self.grid.goal_management.path_segment_len_list[0]
            # START THE EPISODE
            horizon_left = self.config.time_horizon
            success = False
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                curr_goal = self.grid.goal_management.way_points[
                    self.grid.goal_management.way_point_current]
                roll_out, states, actions, rewards, wp_success, l_state, _ = \
                    self.roll_out_in_env(start=s_init, goal=curr_goal,
                                         horizon=horizon_left,
                                         ultimate_goal=goal_pos, mode='test')
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
            return success

    def train(self):
        """Alternate environment simulation and TRPO policy updates."""
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success
class TRPOTrainer:
    """Hierarchical TRPO trainer with an MVProp value-map high level.

    A convolutional value-propagation network (``MVPROPFAT``) produces a value
    map over the grid; the high level acts epsilon-greedily over the 9
    neighbouring cells (including "stay") and hands the chosen cell to the
    low-level continuous TRPO policy as a sub-goal.  High-level transitions
    (optionally with hindsight goal relabelling) train the MVProp network via
    ``MVPROPOptimizerTD0`` with a soft target-network update.
    """

    # Neighbour offsets indexed by high-level option id (0 = stay).
    _OPTION_OFFSETS = ((0, 0), (0, 1), (1, 1), (1, 0), (1, -1),
                       (0, -1), (-1, -1), (-1, 0), (-1, 1))

    def __init__(self, env_mode, op_mode, state_dim, action_dim, config,
                 save_path=None):
        self.config = config
        self.save_path = save_path
        # Custom maze layout/objects are loaded from .npy files in the CWD.
        self.grid = GridWorldContinuous(grid_mode='custom',
                                        layout=np.load('maze1_layout.npy'),
                                        objects=np.load('maze1_objects.npy'),
                                        terrain=self.config.terrain_var)
        self.grid.reset_env_terrain()
        self.policy = ContinuousMLP(state_dim, action_dim,
                                    hidden_sizes=config.policy_hidden_sizes,
                                    activation=F.relu, is_disc_action=False)
        self.optimizer = TRPO(policy=self.policy, use_gpu=False,
                              max_kl=config.policy_max_kl,
                              damping=config.policy_damp_val, use_fim=False,
                              discount=config.discount_factor,
                              imp_weight=False)
        self.mvprop = MVPROPFAT(k=config.mvprop_k).cuda()
        self.mvprop_target = copy.deepcopy(self.mvprop).cuda()
        self.memory = ReplayBufferVIN(2, 1, self.grid.x_size,
                                      config.mvprop_buffer_size)
        self.mvprop_optimizer = MVPROPOptimizerTD0(
            self.mvprop, self.mvprop_target, self.memory,
            config.discount_factor, config.mvprop_batch_size,
            config.mvprop_lr, config.mvprop_k)
        self.env_mode = env_mode
        self.op_mode = op_mode
        self.state_dim = state_dim
        self.dqn_steps = 0      # high-level decisions taken; drives eps decay
        self.eps = 1.0          # epsilon for high-level exploration
        self.time_scale = 2     # low-level steps per high-level decision
        self.episode_steps = 0  # low-level steps in the current episode

    def bound_x(self, input_var):
        """Clamp an x grid index into ``[0, grid.x_size - 1]``."""
        return max(0, min(input_var, self.grid.x_size - 1))

    def bound_y(self, input_var):
        """Clamp a y grid index into ``[0, grid.y_size - 1]``."""
        return max(0, min(input_var, self.grid.y_size - 1))

    def _greedy_option(self, v, s_bar):
        """Return the option id whose (clamped) neighbour cell maximizes the
        value map ``v`` (shape (1, 1, x_size, y_size)) around cell ``s_bar``."""
        options_x = [self.bound_x(s_bar[0] + dx)
                     for dx, _ in self._OPTION_OFFSETS]
        options_y = [self.bound_y(s_bar[1] + dy)
                     for _, dy in self._OPTION_OFFSETS]
        v_options = v[0, 0, options_x, options_y]
        return np.argmax(v_options)

    def _option_to_target(self, s_bar, option):
        """Map an option id to the corresponding neighbouring grid cell,
        clamped to the grid bounds."""
        dx, dy = self._OPTION_OFFSETS[option]
        return (self.bound_x(s_bar[0] + dx), self.bound_y(s_bar[1] + dy))

    def _goal_image(self, goal_cell):
        """Build the 2-channel MVProp input (terrain, goal one-hot) on GPU."""
        image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
        image[0, 0, :, :] = self.grid.terrain_map_un_padded
        image[0, 1, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size))
        image[0, 1, goal_cell[0], goal_cell[1]] = 0
        return torch.from_numpy(image).float().cuda()

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
        """Roll out the low-level policy toward a high-level target cell.

        Returns ``(roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand)``
        where ``d`` flags sub-goal attainment and ``s_bar_cand`` is the final
        high-level cell.  Callers always pass ``horizon`` >= 1.
        """
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False
        s_list = []
        a_list = []
        r_list = []
        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)
            # GET THE TARGET VECTOR (relative offset to the sub-goal cell)
            target_vec = (goal[0] - s[0], goal[1] - s[1])
            s_save = copy.deepcopy(
                np.array(list(s[2:])
                         + list(self.grid.state_cache[s_bar[0], s_bar[1]])
                         + list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)
            s_new, r, d, d_wp, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)
            # Sub-goal reached: reward 1 and mark transition terminal.
            d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
            if d:
                info = False
                r = 1.0
            # Ultimate goal reached: terminate the whole rollout.
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True
            break_var = (break_var or (not info) or (step_i + 1 == horizon)
                         or (self.episode_steps == self.config.time_horizon))
            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)],
                      [r], [s_new.astype(np.float32)],
                      [0 if break_var else 1], [info], [1.0]))
            s = s_new
            if break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

    def simulate_env(self, mode):
        """Run episodes; train mode also trains the MVProp value network.

        Train mode returns ``(batch, goal success rate, waypoint success rate,
        steps per episode, steps per rollout)``; test mode returns a single
        episode's success flag.
        """
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.
        if mode == 'train':
            while num_steps < self.config.policy_batch_size:
                # INITIALIZE THE ENVIRONMENT
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                # IMAGE INPUT: value map from the online MVProp network.
                image = self._goal_image(s_goal)
                with torch.no_grad():
                    v = self.mvprop_optimizer.mvprop(image)
                v = v.cpu().detach()
                # START THE EPISODE
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False
                s_bar = self.grid.lls2hls(st)
                hl_s_list = [s_bar]
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                while (horizon_left > 0) and not success:
                    # GET THE TARGET VECTOR (epsilon-greedy over the 9 options)
                    self.dqn_steps += 1
                    if self.config.mvprop_decay_type == 'exp':
                        self.eps = 0.01 + 0.99 * math.exp(
                            -1. * self.dqn_steps / self.config.mvprop_decay)
                    if torch.rand(1)[0] > self.eps:
                        option = self._greedy_option(v, s_bar)
                    else:
                        option = randint(0, 8)
                    target = self._option_to_target(s_bar, option)
                    roll_out, _, _, _, wp_success, l_state, s_bar_p = \
                        self.roll_out_in_env(start=s_init, goal=target,
                                             horizon=self.time_scale,
                                             ultimate_goal=goal_pos,
                                             mode='train')
                    s_init = l_state
                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(option)
                    st = l_state[:2]
                    s_bar = self.grid.lls2hls(st)
                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()
                    total_wp_success += wp_success
                    jwp += 1
                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)
                    batch.append(roll_out)
                total_success += success
                j += 1
                # ADD TRANSITIONS TO BUFFER.  The hindsight goal image depends
                # only on the episode's final cell, so build it once (it was
                # previously rebuilt and re-uploaded to GPU every iteration).
                image_her = None
                if self.config.mvprop_her and hl_a_list:
                    image_her = self._goal_image(hl_s_list[-1])
                for ep_idx in range(len(hl_a_list)):
                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                    hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                    hl_d_list[ep_idx], image)
                    if self.config.mvprop_her:
                        # GET THE HINDSIGHT GOAL TRANSITION: relabel with the
                        # episode's final cell as the goal.
                        if ((hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0])
                                and (hl_s_list[ep_idx + 1][1]
                                     == hl_s_list[-1][1])):
                            hgt_reward = 0
                            hgt_done = True
                        else:
                            hgt_reward = -1
                            hgt_done = False
                        self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                        hl_s_list[ep_idx + 1], hgt_reward,
                                        hgt_done, image_her)
                # OPTIMIZE NETWORK PARAMETERS
                for _ in range(40):
                    self.mvprop_optimizer.train(
                        self.config.time_horizon / self.time_scale)
                # TARGET NET UPDATE (Polyak averaging)
                if self.dqn_steps % \
                        self.config.mvprop_target_update_frequency == 0:
                    tau = 0.05
                    for param, target_param in zip(
                            self.mvprop_optimizer.mvprop.parameters(),
                            self.mvprop_optimizer.target_mvprop.parameters()):
                        target_param.data.copy_(tau * param.data
                                                + (1 - tau)
                                                * target_param.data)
            return (batch, total_success / j, total_wp_success / jwp,
                    num_steps / j, num_steps / num_roll_outs)
        else:
            # Single greedy evaluation episode; values come from the target
            # network here (vs the online network in training) — presumably
            # intentional, kept as-is.
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            image = self._goal_image(s_goal)
            with torch.no_grad():
                v = self.mvprop_optimizer.target_mvprop(image)
            v = v.cpu().detach()
            horizon_left = self.config.time_horizon
            st = start_pos
            success = False
            s_bar = self.grid.lls2hls(st)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR (always greedy at test time)
                option = self._greedy_option(v, s_bar)
                target = self._option_to_target(s_bar, option)
                roll_out, states, actions, rewards, wp_success, l_state, _ = \
                    self.roll_out_in_env(start=s_init, goal=target,
                                         horizon=self.time_scale,
                                         ultimate_goal=goal_pos, mode='test')
                s_init = l_state
                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))
            return success

    def train(self):
        """Alternate environment simulation and TRPO policy updates."""
        for iter_loop in range(self.config.num_inner_episodes):
            batch, g_success, wp_success, steps_trajectory, steps_wp = \
                self.simulate_env(mode='train')
            self.optimizer.process_batch(self.policy, batch, [])
            print("G_S, WP_S, G_ST, WP_ST")
            print(g_success)
            print(wp_success)
            print(steps_trajectory)
            print(steps_wp)
            print(self.memory.size)
            print(self.eps)

    def test(self):
        """Run one evaluation episode and return its success flag."""
        g_success = self.simulate_env(mode='test')
        return g_success