def gen_action_list(self, s):
    # want to uniformly explore actions
    a_default = self.default_policy.sample_policy(s)

    a_list = [a_default]
    for i in range((self.n_actions_per_state - 1) / 2):
        a_list.append(wrap_to_pi(a_default - (i+1) * self.da_sample))
        a_list.append(wrap_to_pi(a_default + (i+1) * self.da_sample))

    return a_list
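# wrap_to_pi is used throughout this section but not defined in it; the sketch
# below is an assumed implementation (standard wrapping of an angle, or an
# array of angles, into [-pi, pi)), not necessarily the project's own helper.
import numpy as np

def wrap_to_pi(ang):
    return (ang + np.pi) % (2.0 * np.pi) - np.pi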
def compute_plan(self, t_max=0.8, flag_with_prediction=False, flag_wait_for_t_max=False):
    if self.flag_initialized:
        s = self.s.copy()
        if flag_with_prediction:
            s[:2] += np.array([np.cos(s[2]), np.sin(s[2])]) * self.v * t_max
            s[2] += 0.5 * wrap_to_pi(self.alp_d_mean - s[2])

        print "belief is: ", self.alp_d_mean, self.alp_d_cov, "\r"
        print "predicted state is: ", s, "actual state is: ", self.s, "\r"
        print "goal is: ", self.mcts_policy.s_g, "\r"

        res, res_no_comm = self.mcts_policy.generate_policy_parallel(s, (self.alp_d_mean, self.alp_d_cov**0.5),
                                                                     t_max, flag_wait_for_t_max)
        a_opt, v_comm = res
        v_no_comm = res_no_comm

        # fig, axes = plt.subplots(1, 2, figsize=(10, 5))
        # self.mcts_policy.visualize_search_tree(self.mcts_policy.policy_tree_root, axes[0])
        # self.mcts_policy.visualize_search_tree(self.mcts_policy.root_no_comm, axes[1])
        # plt.show()

        print "Value of communication is: ", v_comm, ", no communication is: ", v_no_comm, "\r"

        if v_no_comm > v_comm:
            return None
        else:
            return a_opt
    else:
        self.mcts_policy.generate_policy(self.s, t_max)
        a_opt, v_comm = self.mcts_policy.get_policy()

        self.flag_initialized = True
        return a_opt
def single_action_sample_example(n_samples, modality):
    # create a model object
    model = MovementModel()
    model.load_model("/home/yuhang/Documents/proactive_guidance/training_data/user0")
    model.set_default_param()

    # sample with a few actions
    T = 6.0
    n_dir = 8
    alphas = wrap_to_pi(np.pi * 2.0 / n_dir * np.arange(0, n_dir))

    all_traj = []
    for i in range(n_dir):
        traj_dir = []
        for rep in range(n_samples):
            traj_dir.append(model.sample_traj_single_action((modality, alphas[i]), 0.5, T))
        all_traj.append(traj_dir)

    fig, axes = plt.subplots(2, 4, figsize=(15, 7))
    for i in range(2):
        for j in range(4):
            for t, traj in all_traj[i*4+j]:
                axes[i][j].plot(t, np.asarray(traj)[:, 2])

    fig, axes = plt.subplots(2, 4, figsize=(15, 7))
    for i in range(2):
        for j in range(4):
            for t, traj in all_traj[i*4+j]:
                d = np.linalg.norm(np.asarray(traj)[:, :2], axis=1)
                axes[i][j].plot(t, d)

    plt.show()
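# A hypothetical driver for the demo above; it assumes the hard-coded
# training-data path inside single_action_sample_example exists locally and
# that "haptic" is a valid modality string (the same string is checked in
# compute_policy further below).
if __name__ == "__main__":
    single_action_sample_example(10, "haptic")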
def get_value_ang(self, f, x, y, alp):
    # interpolate the angle-valued grid f at the continuous state (x, y, alp):
    # bilinear in x-y, linear in the periodic orientation axis, with every
    # difference passed through wrap_to_pi so the blend follows the shorter arc
    xf = (x - self.x_offset) / self.dx
    yf = (y - self.y_offset) / self.dy
    af = (alp - self.alp_offset) / self.dalp

    xg = int(xf)
    yg = int(yf)
    ag = int(af)
    ag1 = (ag + 1) % self.nAlp

    xf -= xg
    yf -= yg
    af -= ag

    if xg + 1 >= self.nX or yg + 1 >= self.nY:
        # query falls outside the interpolation grid; treat it as an obstacle
        print "xxx"
        return self.r_obs

    f1 = f[xg, yg, ag] + xf * wrap_to_pi(f[xg+1, yg, ag] - f[xg, yg, ag])
    f2 = f[xg, yg+1, ag] + xf * wrap_to_pi(f[xg+1, yg+1, ag] - f[xg, yg+1, ag])
    g1 = f1 + yf * wrap_to_pi(f2 - f1)

    f1 = f[xg, yg, ag1] + xf * wrap_to_pi(f[xg+1, yg, ag1] - f[xg, yg, ag1])
    f2 = f[xg, yg+1, ag1] + xf * wrap_to_pi(f[xg+1, yg+1, ag1] - f[xg, yg+1, ag1])
    g2 = f1 + yf * wrap_to_pi(f2 - f1)

    return g1 + af * wrap_to_pi(g2 - g1)
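# get_value_ang interpolates angles, so each linear blend goes through
# wrap_to_pi to stay on the shorter arc. A standalone 1-D sketch of the same
# idea (illustrative only, not part of the class):
def lerp_angle(a0, a1, w):
    # blend from angle a0 toward angle a1 by fraction w, following the shorter
    # arc and returning a value wrapped to [-pi, pi)
    return wrap_to_pi(a0 + w * wrap_to_pi(a1 - a0))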
def update_alp(self, s):
    if self.s_last is None:
        return

    # compute measurement: the direction actually walked since the last update
    x_diff = s[:2] - self.s_last[:2]
    alp_m = np.arctan2(x_diff[1], x_diff[0])
    # alp_m += 0.5 * wrap_to_pi(s[2] - alp_m)

    print "alp_m is: ", alp_m, "state is: ", s[2], "\r"
    print "s_last is: ", self.s_last, "\r"

    # measurement noise shrinks with the distance walked since the last update
    d_inc = np.linalg.norm(x_diff)
    cov_m = max([0.05**2, self.cov_m_base - self.cov_m_k * d_inc])

    # a noisy measurement is pulled toward the current facing direction
    alp_m += (1.0 - self.cov_m_min / cov_m) * wrap_to_pi(s[2] - alp_m)

    # scalar Kalman-filter update of the desired-heading belief
    K = self.alp_d_cov / (self.alp_d_cov + cov_m)
    self.alp_d_mean += K * wrap_to_pi(alp_m - self.alp_d_mean)
    self.alp_d_mean = wrap_to_pi(self.alp_d_mean)
    self.alp_d_cov = (1.0 - K) * (self.alp_d_cov + self.alp_d_process_cov)
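# The heading-belief update above is a scalar Kalman filter on a wrapped angle:
# K = P / (P + R), mean += K * innovation, P = (1 - K) * (P + Q). A small
# standalone numeric illustration with hypothetical values (prior mean 0 rad
# with variance P, measurement alp_m with variance R, process noise Q):
def kalman_heading_update_example():
    P, R, Q = 0.5, 0.1, 0.01
    mean, alp_m = 0.0, np.pi / 4.0
    K = P / (P + R)                                    # gain ~ 0.83
    mean = wrap_to_pi(mean + K * wrap_to_pi(alp_m - mean))
    P = (1.0 - K) * (P + Q)                            # posterior variance ~ 0.085
    return mean, P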
def init_value_function(self, s_g):
    # set obstacle values
    for xg in range(self.nX):
        for yg in range(self.nY):
            for alp_g in range(self.nAlp):
                self.V[xg, yg, alp_g] = self.r_obs * self.obs[xg, yg]

    # set goal value
    xgg, ygg, tmp = self.xy_to_grid(s_g[0], s_g[1], 0.0)
    self.V[xgg, ygg] += self.r_goal

    # perform a BFS to initialize values of all states and obtain an update sequence
    visited = np.zeros((self.nX, self.nY))
    nodes = deque()

    visited[xgg, ygg] = 1.0
    nodes.append((xgg, ygg))

    dx = [0, 1, 1, 1, 0, -1, -1, -1]
    dy = [1, 1, 0, -1, -1, -1, 0, 1]
    while nodes:
        xg, yg = nodes.popleft()
        for i in range(8):
            xg_next = xg + dx[i]
            yg_next = yg + dy[i]
            if xg_next < 0 or yg_next < 0 or xg_next >= self.nX or yg_next >= self.nY:
                continue
            if self.obs[xg_next, yg_next] > 0:
                continue

            if not visited[xg_next, yg_next]:
                visited[xg_next, yg_next] = True
                nodes.append((xg_next, yg_next))
                self.update_q.append((xg_next, yg_next))

                # update value and initial policy
                for alp_g in range(self.nAlp):
                    # value is the same for all orientations
                    self.V[xg_next, yg_next, alp_g] = self.gamma * self.V[xg, yg, 0]

                    # apply naive policy
                    x, y, alp = self.grid_to_xy(xg_next, yg_next, alp_g)
                    self.policy[xg_next, yg_next, alp_g] = wrap_to_pi(np.arctan2(-dy[i], -dx[i]) - alp)
def update_state(self, s_new, t_new):
    if self.s is None:
        self.s = s_new.copy()
        self.t = t_new
    else:
        x_inc = s_new[:2] - self.s[:2]
        v_new = np.linalg.norm(x_inc / (t_new - self.t))
        if v_new > 1.0:
            v_new = 1.0

        # print "v_new is: ", v_new, "\r"

        alp_inc = wrap_to_pi(s_new[2] - self.s[2])
        om_new = alp_inc / (t_new - self.t)

        # simple smoothing of velocities
        self.v = self.v_smooth_factor * v_new + (1.0 - self.v_smooth_factor) * self.v
        self.om = self.om_smooth_factor * om_new + (1.0 - self.om_smooth_factor) * self.om

        self.s = s_new.copy()
        self.t = t_new
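# The velocity estimate in update_state is a first-order exponential filter:
# v <- a * v_new + (1 - a) * v with smoothing factor a. A short illustrative
# trace with a hypothetical factor of 0.3 and a unit step input:
def exp_smoothing_example():
    a, v = 0.3, 0.0
    trace = []
    for v_new in [1.0, 1.0, 1.0]:
        v = a * v_new + (1.0 - a) * v
        trace.append(v)
    return trace   # [0.3, 0.51, 0.657], converging geometrically toward 1.0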
def get_value_func(self, s):
    return self.get_value(self.V, s[0], s[1], wrap_to_pi(s[2]))
def sample_policy(self, s):
    return self.get_value_ang(self.policy, s[0], s[1], wrap_to_pi(s[2]))
def sample_state(self, a, dt, T):
    t, traj = self.sample_traj_single_action(a, dt, T)
    traj[-1, 2] = wrap_to_pi(traj[-1, 2])
    return traj[-1]
def get_n_comm(self, s):
    return self.get_value(self.n_comm, s[0], s[1], wrap_to_pi(s[2]))
def compute_policy(self, s_g, modality, max_iter=50):
    if self.flag_policy_computed and self.modality == modality:
        goal_diff = np.linalg.norm(self.s_g[:2] - s_g[:2])
        if goal_diff < 0.5:
            print "No need to recompute policy"
            return

    self.s_g = s_g
    self.modality = modality

    if modality == "haptic":
        self.dt = 2.0
    else:
        self.dt = 3.0

    self.init_value_function(s_g)

    xgg, ygg, tmp = self.xy_to_grid(s_g[0], s_g[1], 0.0)

    counter_policy_not_updated = 0
    for i in range(max_iter):
        print "At iteration ", i, "..."
        flag_policy_update = False

        # V_curr = self.V
        # self.V = np.zeros((self.nX, self.nY, self.nAlp))

        if i <= self.a_dec_iter:
            a_range = self.a_range_init - (self.a_range_init - self.a_range_final) / self.a_dec_iter * i
        else:
            a_range = self.a_range_final
        da = a_range / self.nA
        print da

        # iterate over all states
        for xg, yg in self.update_q:
            for alp_g in range(self.nAlp):
                # don't perform update on goal state
                # if self.obs[xg, yg] > 0:
                #     continue
                if xg == xgg and yg == ygg:
                    continue

                x, y, alp = self.grid_to_xy(xg, yg, alp_g)

                # iterate over all actions
                Q_max = -1000.0
                a_opt = 0.0
                n_comm_opt = 0.0

                # only sample actions that make sense
                a_list = wrap_to_pi(da * np.arange(0, self.nA) - a_range / 2.0 + self.policy[xg, yg, alp_g])

                for ai, a in enumerate(a_list):
                    Vnext = 0.0
                    n_comm_next = 0.0
                    for k in range(self.n_samples):
                        # sample a new state
                        self.tmodel.set_state(x, y, alp)
                        s_next = self.tmodel.sample_state((modality, a), 0.5, self.dt)

                        if s_next[0] < self.x_range[0] or s_next[0] >= self.x_range[1] or \
                                s_next[1] < self.y_range[0] or s_next[1] >= self.y_range[1]:
                            Vnext += self.r_obs
                            n_comm_next += 20
                        else:
                            Vnext += self.get_value(self.V, s_next[0], s_next[1], s_next[2])
                            n_comm_next += self.get_value(self.n_comm, s_next[0], s_next[1], s_next[2])

                    # if Vnext != 0:
                    #     print "here"
                    self.Q[xg, yg, alp_g, ai] = self.gamma * Vnext / self.n_samples
                    n_comm_next /= self.n_samples

                    if self.Q[xg, yg, alp_g, ai] > Q_max:
                        Q_max = self.Q[xg, yg, alp_g, ai]
                        n_comm_opt = n_comm_next
                        a_opt = a

                # update value function and policy
                # only update value for non-obstacle states
                if not self.obs[xg, yg]:
                    self.V[xg, yg, alp_g] = Q_max
                    self.n_comm[xg, yg, alp_g] = n_comm_opt + 1

                if self.policy[xg, yg, alp_g] != a_opt:
                    self.policy[xg, yg, alp_g] = a_opt
                    flag_policy_update = True

        if not flag_policy_update:
            counter_policy_not_updated += 1
            print "Policy not updated in ", counter_policy_not_updated, "iterations"
            if counter_policy_not_updated >= self.n_update_th:
                print "Policy converged!"
                break
        else:
            counter_policy_not_updated = 0

    # self.visualize_policy()
    # plt.show()
    self.flag_policy_computed = True
def sample_policy(self, s):
    # directly point toward the goal
    alpha_d = np.arctan2(self.s_g[1] - s[1], self.s_g[0] - s[0])
    alpha_d = wrap_to_pi(alpha_d - s[2])

    return alpha_d
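# compute_policy above is sample-based value iteration: for each state it
# samples n_samples next states per action through the learned movement model
# and backs up the discounted average of their values. A self-contained toy
# version of the same backup on a 1-D chain with a noisy left/right action
# model (all names and numbers here are illustrative, not part of the project):
def toy_value_iteration(n_states=10, gamma=0.9, n_samples=20, n_iter=50):
    rng = np.random.RandomState(0)
    V = np.zeros(n_states)
    V[-1] = 10.0                              # goal value at the right end
    policy = np.zeros(n_states, dtype=int)
    for _ in range(n_iter):
        for s in range(n_states - 1):         # the goal state is not updated
            q = np.zeros(2)
            for a, step in enumerate([-1, 1]):
                # sampled backup: the commanded step succeeds 80% of the time
                for _ in range(n_samples):
                    s_next = s + (step if rng.rand() < 0.8 else -step)
                    s_next = min(max(s_next, 0), n_states - 1)
                    q[a] += gamma * V[s_next]
                q[a] /= n_samples
            V[s] = q.max()
            policy[s] = q.argmax()
    return V, policy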