Example #1
    def gen_action_list(self, s):
        # want to uniformly explore actions
        a_default = self.default_policy.sample_policy(s)
        a_list = [a_default]

        for i in range((self.n_actions_per_state - 1) / 2):
            a_list.append(wrap_to_pi(a_default - (i+1) * self.da_sample))
            a_list.append(wrap_to_pi(a_default + (i+1) * self.da_sample))

        return a_list
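These snippets use wrap_to_pi throughout but never show it. For reference, a minimal sketch of such a helper, assuming the usual convention of folding angles into [-pi, pi), could look like the following (an illustrative stand-in, not the project's own implementation):

import numpy as np

def wrap_to_pi(angle):
    # Illustrative sketch: fold an angle (scalar or numpy array) into [-pi, pi).
    return (np.asarray(angle) + np.pi) % (2.0 * np.pi) - np.pi

With that convention, the offsets added on both sides of a_default are folded back into [-pi, pi), so gen_action_list explores headings symmetrically around the default policy without ever leaving the canonical angle range.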
Example #2
    def gen_action_list(self, s):
        # want to uniformly explore actions
        a_default = self.default_policy.sample_policy(s)
        a_list = [a_default]

        for i in range((self.n_actions_per_state - 1) / 2):
            a_list.append(wrap_to_pi(a_default - (i + 1) * self.da_sample))
            a_list.append(wrap_to_pi(a_default + (i + 1) * self.da_sample))

        return a_list
Example #3
    def compute_plan(self, t_max=0.8, flag_with_prediction=False, flag_wait_for_t_max=False):
        if self.flag_initialized:
            s = self.s.copy()
            if flag_with_prediction:
                s[:2] += np.array([np.cos(s[2]), np.sin(s[2])]) * self.v * t_max
                s[2] += 0.5 * wrap_to_pi(self.alp_d_mean - s[2])

            print "belief is: ", self.alp_d_mean, self.alp_d_cov, "\r"
            print "predicted state is: ", s, "actual state is: ", self.s, "\r"
            print "goal is: ", self.mcts_policy.s_g, "\r"
            res, res_no_comm = self.mcts_policy.generate_policy_parallel(s,
                                                                         (self.alp_d_mean, self.alp_d_cov**0.5),
                                                                         t_max,
                                                                         flag_wait_for_t_max)
            a_opt, v_comm = res
            v_no_comm = res_no_comm

            # fig, axes = plt.subplots(1, 2, figsize=(10, 5))
            # self.mcts_policy.visualize_search_tree(self.mcts_policy.policy_tree_root, axes[0])
            # self.mcts_policy.visualize_search_tree(self.mcts_policy.root_no_comm, axes[1])
            # plt.show()

            print "Value of communication is: ", v_comm, ", no communication is: ", v_no_comm, "\r"

            if v_no_comm > v_comm:
                return None
            else:
                return a_opt
        else:
            self.mcts_policy.generate_policy(self.s, t_max)
            a_opt, v_comm = self.mcts_policy.get_policy()

            self.flag_initialized = True

            return a_opt
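The prediction branch above rolls the position forward by t_max at the smoothed speed and nudges the heading halfway toward the believed desired direction before planning. A self-contained numeric sketch of that two-line update, with all values chosen arbitrarily for illustration:

import numpy as np

def wrap_to_pi(a):   # as sketched after Example #1
    return (np.asarray(a) + np.pi) % (2.0 * np.pi) - np.pi

s = np.array([1.0, 2.0, 0.5 * np.pi])   # x, y, heading (illustrative values)
v, t_max = 0.8, 0.8                      # smoothed speed and planning budget
alp_d_mean = -0.75 * np.pi               # believed desired heading

s[:2] += np.array([np.cos(s[2]), np.sin(s[2])]) * v * t_max
s[2] += 0.5 * wrap_to_pi(alp_d_mean - s[2])
print(s)   # the heading moves halfway toward alp_d_mean along the short way around the circle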
Example #4
def single_action_sample_example(n_samples, modality):
    # create a model object
    model = MovementModel()
    model.load_model("/home/yuhang/Documents/proactive_guidance/training_data/user0")
    model.set_default_param()

    # sample with a few actions
    T = 6.0
    n_dir = 8
    alphas = wrap_to_pi(np.pi * 2.0 / n_dir * np.arange(0, n_dir))

    all_traj = []
    for i in range(n_dir):
        traj_dir = []
        for rep in range(n_samples):
            traj_dir.append(model.sample_traj_single_action((modality, alphas[i]), 0.5, T))

        all_traj.append(traj_dir)

    fig, axes = plt.subplots(2, 4, figsize=(15, 7))
    for i in range(2):
        for j in range(4):
            for t, traj in all_traj[i*4+j]:
                axes[i][j].plot(t, np.asarray(traj)[:, 2])

    fig, axes = plt.subplots(2, 4, figsize=(15, 7))
    for i in range(2):
        for j in range(4):
            for t, traj in all_traj[i*4+j]:
                d = np.linalg.norm(np.asarray(traj)[:, :2], axis=1)
                axes[i][j].plot(t, d)

    plt.show()
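The alphas above are n_dir = 8 command directions spaced evenly around the circle and folded into the canonical angle range; a quick standalone check of what that expression produces:

import numpy as np

def wrap_to_pi(a):   # as sketched after Example #1
    return (np.asarray(a) + np.pi) % (2.0 * np.pi) - np.pi

n_dir = 8
alphas = wrap_to_pi(np.pi * 2.0 / n_dir * np.arange(0, n_dir))
print(alphas / np.pi)   # [0, 0.25, 0.5, 0.75, -1, -0.75, -0.5, -0.25], in units of pi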
Example #5
    def get_value_ang(self, f, x, y, alp):
        xf = (x - self.x_offset) / self.dx
        yf = (y - self.y_offset) / self.dy
        af = (alp - self.alp_offset) / self.dalp
        xg = int(xf)
        yg = int(yf)
        ag = int(af)
        ag1 = (ag + 1) % self.nAlp

        xf -= xg
        yf -= yg
        af -= ag

        if xg + 1 >= self.nX or yg + 1 >= self.nY:
            print "xxx"
            return self.r_obs

        f1 = f[xg, yg, ag] + xf * wrap_to_pi(f[xg+1, yg, ag] - f[xg, yg, ag])
        f2 = f[xg, yg+1, ag] + xf * wrap_to_pi(f[xg+1, yg+1, ag] - f[xg, yg+1, ag])
        g1 = f1 + yf * wrap_to_pi(f2 - f1)

        f1 = f[xg, yg, ag1] + xf * wrap_to_pi(f[xg+1, yg, ag1] - f[xg, yg, ag1])
        f2 = f[xg, yg+1, ag1] + xf * wrap_to_pi(f[xg+1, yg+1, ag1] - f[xg, yg+1, ag1])
        g2 = f1 + yf * wrap_to_pi(f2 - f1)

        return g1 + af * wrap_to_pi(g2 - g1)
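The repeated pattern f1 + w * wrap_to_pi(f2 - f1) in get_value_ang interpolates angular table entries along the shorter arc rather than through the raw numeric gap, and ag1 = (ag + 1) % self.nAlp makes the angle axis of the table circular. A small self-contained illustration of why the wrapped difference matters (table values chosen arbitrarily):

import numpy as np

def wrap_to_pi(a):   # as sketched after Example #1
    return (np.asarray(a) + np.pi) % (2.0 * np.pi) - np.pi

f1, f2, w = 0.9 * np.pi, -0.9 * np.pi, 0.5          # two stored angles and a blend weight
naive = f1 + w * (f2 - f1)                           # 0.0: cuts across the wrong side of the circle
wrapped = wrap_to_pi(f1 + w * wrap_to_pi(f2 - f1))   # -pi: the midpoint of the short arc through +/-pi
print(naive, wrapped)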
Example #6
    def get_value_ang(self, f, x, y, alp):
        xf = (x - self.x_offset) / self.dx
        yf = (y - self.y_offset) / self.dy
        af = (alp - self.alp_offset) / self.dalp
        xg = int(xf)
        yg = int(yf)
        ag = int(af)
        ag1 = (ag + 1) % self.nAlp

        xf -= xg
        yf -= yg
        af -= ag

        if xg + 1 >= self.nX or yg + 1 >= self.nY:
            print "xxx"
            return self.r_obs

        f1 = f[xg, yg, ag] + xf * wrap_to_pi(f[xg + 1, yg, ag] - f[xg, yg, ag])
        f2 = f[xg, yg + 1, ag] + xf * wrap_to_pi(f[xg + 1, yg + 1, ag] - f[xg, yg + 1, ag])
        g1 = f1 + yf * wrap_to_pi(f2 - f1)

        f1 = f[xg, yg, ag1] + xf * wrap_to_pi(f[xg + 1, yg, ag1] - f[xg, yg, ag1])
        f2 = f[xg, yg + 1, ag1] + xf * wrap_to_pi(f[xg + 1, yg + 1, ag1] - f[xg, yg + 1, ag1])
        g2 = f1 + yf * wrap_to_pi(f2 - f1)

        return g1 + af * wrap_to_pi(g2 - g1)
Example #7
    def update_alp(self, s):
        if self.s_last is None:
            return

        # compute measurement
        x_diff = s[:2] - self.s_last[:2]
        alp_m = np.arctan2(x_diff[1], x_diff[0])
        # alp_m += 0.5 * wrap_to_pi(s[2] - alp_m)

        print "alp_m is: ", alp_m, "state is: ", s[2], "\r"
        print "s_last is: ", self.s_last, "\r"

        d_inc = np.linalg.norm(x_diff)
        cov_m = max([0.05**2, self.cov_m_base - self.cov_m_k * d_inc])

        alp_m += (1.0 - self.cov_m_min / cov_m) * wrap_to_pi(s[2] - alp_m)

        K = self.alp_d_cov / (self.alp_d_cov + cov_m)
        self.alp_d_mean += K * wrap_to_pi(alp_m - self.alp_d_mean)
        self.alp_d_mean = wrap_to_pi(self.alp_d_mean)

        self.alp_d_cov = (1.0 - K) * (self.alp_d_cov + self.alp_d_process_cov)
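The update above is a one-dimensional Kalman-style correction applied on the circle: the innovation wrap_to_pi(alp_m - self.alp_d_mean) is wrapped before being scaled by the gain K, so a prior near +pi and a measurement near -pi pull the estimate across the seam instead of through zero. A self-contained numeric sketch of that correction (all numbers are illustrative):

import numpy as np

def wrap_to_pi(a):   # as sketched after Example #1
    return (np.asarray(a) + np.pi) % (2.0 * np.pi) - np.pi

alp_d_mean, alp_d_cov = 0.95 * np.pi, 0.3 ** 2   # prior belief over the desired heading
alp_m, cov_m = -0.95 * np.pi, 0.2 ** 2           # heading measurement and its variance

K = alp_d_cov / (alp_d_cov + cov_m)              # Kalman gain
alp_d_mean = wrap_to_pi(alp_d_mean + K * wrap_to_pi(alp_m - alp_d_mean))
alp_d_cov = (1.0 - K) * alp_d_cov                # process-noise term omitted in this sketch

print(alp_d_mean)   # about -0.98*pi: the mean crossed the +/-pi seam toward the measurement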
Example #8
    def update_alp(self, s):
        if self.s_last is None:
            return
            
        # compute measurement
        x_diff = s[:2] - self.s_last[:2]
        alp_m = np.arctan2(x_diff[1], x_diff[0])
        # alp_m += 0.5 * wrap_to_pi(s[2] - alp_m)
        
        print "alp_m is: ", alp_m, "state is: ", s[2], "\r"
        print "s_last is: ", self.s_last, "\r"

        d_inc = np.linalg.norm(x_diff)
        cov_m = max([0.05**2, self.cov_m_base - self.cov_m_k * d_inc])
        
        alp_m += (1.0 - self.cov_m_min / cov_m) * wrap_to_pi(s[2] - alp_m)

        K = self.alp_d_cov / (self.alp_d_cov + cov_m)
        self.alp_d_mean += K * wrap_to_pi(alp_m - self.alp_d_mean)
        self.alp_d_mean = wrap_to_pi(self.alp_d_mean)

        self.alp_d_cov = (1.0 - K) * (self.alp_d_cov + self.alp_d_process_cov)
Example #9
    def init_value_function(self, s_g):
        # set obstacle values
        for xg in range(self.nX):
            for yg in range(self.nY):
                for alp_g in range(self.nAlp):
                    self.V[xg, yg, alp_g] = self.r_obs * self.obs[xg, yg]

        # set goal value
        xgg, ygg, tmp = self.xy_to_grid(s_g[0], s_g[1], 0.0)
        self.V[xgg, ygg] += self.r_goal

        # perform a BFS to initialize values of all states and obtain an update sequence
        visited = np.zeros((self.nX, self.nY))
        nodes = deque()

        visited[xgg, ygg] = 1.0
        nodes.append((xgg, ygg))

        dx = [0, 1, 1, 1, 0, -1, -1, -1]
        dy = [1, 1, 0, -1, -1, -1, 0, 1]
        while nodes:
            xg, yg = nodes.popleft()

            for i in range(8):
                xg_next = xg + dx[i]
                yg_next = yg + dy[i]

                if xg_next < 0 or yg_next < 0 or xg_next >= self.nX or yg_next >= self.nY:
                    continue
                if self.obs[xg_next, yg_next] > 0:
                    continue

                if not visited[xg_next, yg_next]:
                    visited[xg_next, yg_next] = True
                    nodes.append((xg_next, yg_next))
                    self.update_q.append((xg_next, yg_next))

                    # update value and initial policy
                    for alp_g in range(self.nAlp):
                        # value is the same for all orientations
                        self.V[xg_next, yg_next, alp_g] = self.gamma * self.V[xg, yg, 0]

                        # apply naive policy
                        x, y, alp = self.grid_to_xy(xg_next, yg_next, alp_g)
                        self.policy[xg_next, yg_next, alp_g] = wrap_to_pi(
                            np.arctan2(-dy[i], -dx[i]) - alp)
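The naive policy assigned during the BFS points each newly reached cell back along the expansion edge, toward the parent cell that is one grid step closer to the goal, expressed relative to the agent's orientation alp at that cell. A small numeric illustration of the formula (grid step and heading chosen arbitrarily):

import numpy as np

def wrap_to_pi(a):   # as sketched after Example #1
    return (np.asarray(a) + np.pi) % (2.0 * np.pi) - np.pi

dx_i, dy_i = 1, 1        # the BFS expanded from (xg, yg) to (xg + 1, yg + 1)
alp = 0.5 * np.pi        # the agent at the new cell currently faces +y

# Absolute direction back toward the parent cell, made relative to the heading alp.
a_naive = wrap_to_pi(np.arctan2(-dy_i, -dx_i) - alp)
print(a_naive / np.pi)   # 0.75: turn 135 degrees counterclockwise to face the parent cell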
Example #10
    def init_value_function(self, s_g):
        # set obstacle values
        for xg in range(self.nX):
            for yg in range(self.nY):
                for alp_g in range(self.nAlp):
                    self.V[xg, yg, alp_g] = self.r_obs * self.obs[xg, yg]

        # set goal value
        xgg, ygg, tmp = self.xy_to_grid(s_g[0], s_g[1], 0.0)
        self.V[xgg, ygg] += self.r_goal

        # perform a BFS to initialize values of all states and obtain an update sequence
        visited = np.zeros((self.nX, self.nY))
        nodes = deque()

        visited[xgg, ygg] = 1.0
        nodes.append((xgg, ygg))

        dx = [0, 1, 1, 1, 0, -1, -1, -1]
        dy = [1, 1, 0, -1, -1, -1, 0, 1]
        while nodes:
            xg, yg = nodes.popleft()

            for i in range(8):
                xg_next = xg + dx[i]
                yg_next = yg + dy[i]

                if xg_next < 0 or yg_next < 0 or xg_next >= self.nX or yg_next >= self.nY:
                    continue
                if self.obs[xg_next, yg_next] > 0:
                    continue

                if not visited[xg_next, yg_next]:
                    visited[xg_next, yg_next] = True
                    nodes.append((xg_next, yg_next))
                    self.update_q.append((xg_next, yg_next))

                    # update value and initial policy
                    for alp_g in range(self.nAlp):
                        # value is the same for all orientations
                        self.V[xg_next, yg_next, alp_g] = self.gamma * self.V[xg, yg, 0]

                        # apply naive policy
                        x, y, alp = self.grid_to_xy(xg_next, yg_next, alp_g)
                        self.policy[xg_next, yg_next, alp_g] = wrap_to_pi(np.arctan2(-dy[i], -dx[i]) - alp)
Example #11
    def update_state(self, s_new, t_new):
        if self.s is None:
            self.s = s_new.copy()
            self.t = t_new
        else:
            x_inc = s_new[:2] - self.s[:2]
            v_new = np.linalg.norm(x_inc / (t_new - self.t))
            if v_new > 1.0:
                v_new = 1.0
                
            # print "v_new is: ", v_new, "\r"

            alp_inc = wrap_to_pi(s_new[2] - self.s[2])
            om_new = alp_inc / (t_new - self.t)

            # simple smoothing of velocities
            self.v = self.v_smooth_factor * v_new + (1.0 - self.v_smooth_factor) * self.v
            self.om = self.om_smooth_factor * om_new + (1.0 - self.om_smooth_factor) * self.om

            self.s = s_new.copy()
            self.t = t_new
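Wrapping the heading increment before dividing by the time step keeps the angular rate finite when the measured heading crosses the +/-pi seam between two updates; a small illustration with made-up numbers:

import numpy as np

def wrap_to_pi(a):   # as sketched after Example #1
    return (np.asarray(a) + np.pi) % (2.0 * np.pi) - np.pi

alp_prev, alp_new, dt = 0.98 * np.pi, -0.97 * np.pi, 0.1   # heading crossed the seam

om_naive = (alp_new - alp_prev) / dt               # about -61 rad/s, clearly spurious
om_wrapped = wrap_to_pi(alp_new - alp_prev) / dt   # about 1.6 rad/s, the actual turn rate
print(om_naive, om_wrapped)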
Example #12
    def compute_plan(self,
                     t_max=0.8,
                     flag_with_prediction=False,
                     flag_wait_for_t_max=False):
        if self.flag_initialized:
            s = self.s.copy()
            if flag_with_prediction:
                s[:2] += np.array([np.cos(s[2]), np.sin(s[2])]) * self.v * t_max
                s[2] += 0.5 * wrap_to_pi(self.alp_d_mean - s[2])

            print "belief is: ", self.alp_d_mean, self.alp_d_cov, "\r"
            print "predicted state is: ", s, "actual state is: ", self.s, "\r"
            print "goal is: ", self.mcts_policy.s_g, "\r"
            res, res_no_comm = self.mcts_policy.generate_policy_parallel(
                s, (self.alp_d_mean, self.alp_d_cov**0.5), t_max,
                flag_wait_for_t_max)
            a_opt, v_comm = res
            v_no_comm = res_no_comm

            # fig, axes = plt.subplots(1, 2, figsize=(10, 5))
            # self.mcts_policy.visualize_search_tree(self.mcts_policy.policy_tree_root, axes[0])
            # self.mcts_policy.visualize_search_tree(self.mcts_policy.root_no_comm, axes[1])
            # plt.show()

            print "Value of communication is: ", v_comm, ", no communication is: ", v_no_comm, "\r"

            if v_no_comm > v_comm:
                return None
            else:
                return a_opt
        else:
            self.mcts_policy.generate_policy(self.s, t_max)
            a_opt, v_comm = self.mcts_policy.get_policy()

            self.flag_initialized = True

            return a_opt
Example #13
    def update_state(self, s_new, t_new):
        if self.s is None:
            self.s = s_new.copy()
            self.t = t_new
        else:
            x_inc = s_new[:2] - self.s[:2]
            v_new = np.linalg.norm(x_inc / (t_new - self.t))
            if v_new > 1.0:
                v_new = 1.0

            # print "v_new is: ", v_new, "\r"

            alp_inc = wrap_to_pi(s_new[2] - self.s[2])
            om_new = alp_inc / (t_new - self.t)

            # simple smoothing of velocities
            self.v = self.v_smooth_factor * v_new + (
                1.0 - self.v_smooth_factor) * self.v
            self.om = self.om_smooth_factor * om_new + (
                1.0 - self.om_smooth_factor) * self.om

            self.s = s_new.copy()
            self.t = t_new
Example #14
 def get_value_func(self, s):
     return self.get_value(self.V, s[0], s[1], wrap_to_pi(s[2]))
Example #15
 def sample_policy(self, s):
     return self.get_value_ang(self.policy, s[0], s[1], wrap_to_pi(s[2]))
Example #16
 def get_value_func(self, s):
     return self.get_value(self.V, s[0], s[1], wrap_to_pi(s[2]))
Example #17
 def sample_state(self, a, dt, T):
     t, traj = self.sample_traj_single_action(a, dt, T)
     traj[-1, 2] = wrap_to_pi(traj[-1, 2])
     return traj[-1]
Example #18
 def get_n_comm(self, s):
     return self.get_value(self.n_comm, s[0], s[1], wrap_to_pi(s[2]))
Example #19
 def get_n_comm(self, s):
     return self.get_value(self.n_comm, s[0], s[1], wrap_to_pi(s[2]))
Example #20
    def compute_policy(self, s_g, modality, max_iter=50):
        if self.flag_policy_computed and self.modality == modality:
            goal_diff = np.linalg.norm(self.s_g[:2] - s_g[:2])
            if goal_diff < 0.5:
                print "No need to recompute policy"
                return

        self.s_g = s_g
        self.modality = modality

        if modality == "haptic":
            self.dt = 2.0
        else:
            self.dt = 3.0

        self.init_value_function(s_g)
        xgg, ygg, tmp = self.xy_to_grid(s_g[0], s_g[1], 0.0)

        counter_policy_not_updated = 0
        for i in range(max_iter):
            print "At iteration ", i, "..."

            flag_policy_update = False
            # V_curr = self.V
            # self.V = np.zeros((self.nX, self.nY, self.nAlp))

            if i <= self.a_dec_iter:
                a_range = self.a_range_init - (
                    self.a_range_init -
                    self.a_range_final) / self.a_dec_iter * i
            else:
                a_range = self.a_range_final

            da = a_range / self.nA
            print da

            # iterate over all states
            for xg, yg in self.update_q:
                for alp_g in range(self.nAlp):
                    # don't perform update on goal state
                    # if self.obs[xg, yg] > 0:
                    #     continue
                    if xg == xgg and yg == ygg:
                        continue

                    x, y, alp = self.grid_to_xy(xg, yg, alp_g)

                    # iterate over all actions
                    Q_max = -1000.0
                    a_opt = 0.0
                    n_comm_opt = 0.0

                    # only sample actions that make sense
                    a_list = wrap_to_pi(da * np.arange(0, self.nA) -
                                        a_range / 2.0 +
                                        self.policy[xg, yg, alp_g])

                    for ai, a in enumerate(a_list):
                        Vnext = 0.0
                        n_comm_next = 0.0
                        for k in range(self.n_samples):
                            # sample a new state
                            self.tmodel.set_state(x, y, alp)
                            s_next = self.tmodel.sample_state((modality, a),
                                                              0.5, self.dt)
                            if s_next[0] < self.x_range[0] or s_next[0] >= self.x_range[1] or \
                                            s_next[1] < self.y_range[0] or s_next[1] >= self.y_range[1]:
                                Vnext += self.r_obs
                                n_comm_next += 20
                            else:
                                Vnext += self.get_value(
                                    self.V, s_next[0], s_next[1], s_next[2])
                                n_comm_next += self.get_value(
                                    self.n_comm, s_next[0], s_next[1],
                                    s_next[2])

                        # if Vnext != 0:
                        #     print "here"

                        self.Q[xg, yg, alp_g, ai] = self.gamma * Vnext / self.n_samples
                        n_comm_next /= self.n_samples

                        if self.Q[xg, yg, alp_g, ai] > Q_max:
                            Q_max = self.Q[xg, yg, alp_g, ai]
                            n_comm_opt = n_comm_next
                            a_opt = a

                    # update value function and policy
                    # only update value for non-obstacle states
                    if not self.obs[xg, yg]:
                        self.V[xg, yg, alp_g] = Q_max
                    self.n_comm[xg, yg, alp_g] = n_comm_opt + 1

                    if self.policy[xg, yg, alp_g] != a_opt:
                        self.policy[xg, yg, alp_g] = a_opt
                        flag_policy_update = True

            if not flag_policy_update:
                counter_policy_not_updated += 1
                print "Policy not updated in ", counter_policy_not_updated, "iterations"
                if counter_policy_not_updated >= self.n_update_th:
                    print "Policy converged!"
                    break
            else:
                counter_policy_not_updated = 0

            # self.visualize_policy()
            # plt.show()

        self.flag_policy_computed = True
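The candidate actions evaluated at each state form a grid of nA headings centered on the current policy action, with the window width a_range shrinking linearly over the first a_dec_iter iterations so that later sweeps refine the policy locally. A self-contained sketch of that candidate grid (the parameter values below are illustrative, not taken from the source):

import numpy as np

def wrap_to_pi(a):   # as sketched after Example #1
    return (np.asarray(a) + np.pi) % (2.0 * np.pi) - np.pi

nA = 5                    # number of candidate actions per state
a_range = 0.5 * np.pi     # current width of the search window
a_current = 0.9 * np.pi   # current policy action at this (xg, yg, alp_g)

da = a_range / nA
a_list = wrap_to_pi(da * np.arange(0, nA) - a_range / 2.0 + a_current)
print(a_list / np.pi)     # [0.65, 0.75, 0.85, 0.95, -0.95]: candidates bracket a_current and wrap past +/-pi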
Example #21
    def sample_policy(self, s):
        # point directly toward the goal
        alpha_d = np.arctan2(self.s_g[1] - s[1], self.s_g[0] - s[0])
        alpha_d = wrap_to_pi(alpha_d - s[2])

        return alpha_d
Example #22
    def compute_policy(self, s_g, modality, max_iter=50):
        if self.flag_policy_computed and self.modality == modality:
            goal_diff = np.linalg.norm(self.s_g[:2] - s_g[:2])
            if goal_diff < 0.5:
                print "No need to recompute policy"
                return

        self.s_g = s_g
        self.modality = modality

        if modality == "haptic":
            self.dt = 2.0
        else:
            self.dt = 3.0

        self.init_value_function(s_g)
        xgg, ygg, tmp = self.xy_to_grid(s_g[0], s_g[1], 0.0)

        counter_policy_not_updated = 0
        for i in range(max_iter):
            print "At iteration ", i, "..."

            flag_policy_update = False
            # V_curr = self.V
            # self.V = np.zeros((self.nX, self.nY, self.nAlp))

            if i <= self.a_dec_iter:
                a_range = self.a_range_init - (self.a_range_init - self.a_range_final) / self.a_dec_iter * i
            else:
                a_range = self.a_range_final

            da = a_range / self.nA
            print da

            # iterate over all states
            for xg, yg in self.update_q:
                for alp_g in range(self.nAlp):
                    # don't perform update on goal state
                    # if self.obs[xg, yg] > 0:
                    #     continue
                    if xg == xgg and yg == ygg:
                        continue

                    x, y, alp = self.grid_to_xy(xg, yg, alp_g)

                    # iterate over all actions
                    Q_max = -1000.0
                    a_opt = 0.0
                    n_comm_opt = 0.0

                    # only sample actions that make sense
                    a_list = wrap_to_pi(da * np.arange(0, self.nA) - a_range / 2.0 + self.policy[xg, yg, alp_g])

                    for ai, a in enumerate(a_list):
                        Vnext = 0.0
                        n_comm_next = 0.0
                        for k in range(self.n_samples):
                            # sample a new state
                            self.tmodel.set_state(x, y, alp)
                            s_next = self.tmodel.sample_state((modality, a), 0.5, self.dt)
                            if s_next[0] < self.x_range[0] or s_next[0] >= self.x_range[1] or \
                                            s_next[1] < self.y_range[0] or s_next[1] >= self.y_range[1]:
                                Vnext += self.r_obs
                                n_comm_next += 20
                            else:
                                Vnext += self.get_value(self.V, s_next[0], s_next[1], s_next[2])
                                n_comm_next += self.get_value(self.n_comm, s_next[0], s_next[1], s_next[2])

                        # if Vnext != 0:
                        #     print "here"

                        self.Q[xg, yg, alp_g, ai] = self.gamma * Vnext / self.n_samples
                        n_comm_next /= self.n_samples

                        if self.Q[xg, yg, alp_g, ai] > Q_max:
                            Q_max = self.Q[xg, yg, alp_g, ai]
                            n_comm_opt = n_comm_next
                            a_opt = a

                    # update value function and policy
                    # only update value for non-obstacle states
                    if not self.obs[xg, yg]:
                        self.V[xg, yg, alp_g] = Q_max
                    self.n_comm[xg, yg, alp_g] = n_comm_opt + 1

                    if self.policy[xg, yg, alp_g] != a_opt:
                        self.policy[xg, yg, alp_g] = a_opt
                        flag_policy_update = True

            if not flag_policy_update:
                counter_policy_not_updated += 1
                print "Policy not updated in ", counter_policy_not_updated, "iterations"
                if counter_policy_not_updated >= self.n_update_th:
                    print "Policy converged!"
                    break
            else:
                counter_policy_not_updated = 0

            # self.visualize_policy()
            # plt.show()

        self.flag_policy_computed = True
Example #23
 def sample_policy(self, s):
     return self.get_value_ang(self.policy, s[0], s[1], wrap_to_pi(s[2]))
Example #24
    def sample_policy(self, s):
        # point directly toward the goal
        alpha_d = np.arctan2(self.s_g[1] - s[1], self.s_g[0] - s[0])
        alpha_d = wrap_to_pi(alpha_d - s[2])

        return alpha_d