Example #1
def get_policy(weights, n_iter, n_time):
    global model
    global r, v, index_x, index_y, index_vel_theta, index_speed, state_x, state_y, state_vel_theta, state_speed
    model = mdp(np.array([0, 0, 5, 0], dtype='float64'))

    x = np.linspace(-1.5, 1.5, 301, dtype='float64')
    vtheta = np.linspace(-math.pi, math.pi, 101, dtype='float64')
    s = np.linspace(0, 0.1, 11, dtype='float64')

    print('Creating state space...')
    state_x, state_y, state_vel_theta, state_speed = np.meshgrid(
        x, x, vtheta, s)
    print('State space created.')

    # plot_x = np.linspace(-1.5,1.5,21, dtype = 'float32')
    # plot_z = np.linspace(0.9, 1, 3, dtype = 'float32')
    # plot_xv, plot_yv, plot_zv = np.meshgrid(plot_x, plot_x, plot_z)
    # print xv.shape

    action_set = []
    for j1 in [-0.01, 0, 0.01]:
        for j2 in [-0.01, 0, 0.01]:
            for j3 in [-0.1, 0, 0.1]:
                # for j4 in [-0.01, 0, 0.01]:
                action_set.append(np.array([j1, j2, j3, 0]))

    # print len(action_set)
    # r = reward(state_x, state_y, state_vel_theta, state_speed)

    r, f = features.reward(state_x, state_y, state_vel_theta, state_speed,
                           weights)

    index_x, index_y, index_vel_theta, index_speed = get_indices(
        state_x, state_y, state_vel_theta, state_speed)

    policy = []
    for iter in range(0, n_iter):
        action_value = []
        policy = []
        print "Policy Iteration:", iter
        # start_time = t.time()

        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            if iter == 0:
                func = initial_loop
            else:
                func = main_loop
            for q, p in executor.map(func, action_set):
                action_value.append(q)
                policy.append(p)

        print "Evaluating Policy..."
        policy = policy / sum(policy)
        v = sum(policy * action_value)
        # end_time = t.time()
        # print end_time-start_time

    # mu = np.empty([301,301,101,11])
    print "Final Policy evaluated."
    print "Calulating State Visitation Frequency..."
    mu = np.exp(-(state_x + 0.15)**2 / 0.25**2) * np.exp(
        -(state_y - 0.27)**2 / 0.5**2) * np.exp(0.004 * state_speed)
    mu_reshape = np.reshape(mu, [301 * 301 * 101 * 11, 1])
    mu = mu / sum(mu_reshape)
    mu_last = mu
    print "Initial State Frequency calculated..."
    for time in range(0, n_time):
        s = np.zeros([301, 301, 101, 11])
        for act_index, action in enumerate(action_set):

            new_state_x, new_state_y, new_state_vel_theta, new_state_speed = model.get_next_state(
                state_x, state_y, state_vel_theta, state_speed, action)

            new_index_x, new_index_y, new_index_vel_theta, new_index_speed = get_indices(
                new_state_x, new_state_y, new_state_vel_theta, new_state_speed)

            p = policy[act_index, index_x, index_y, index_vel_theta,
                       index_speed]
            s = s + p * mu_last[new_index_x, new_index_y, new_index_vel_theta,
                                new_index_speed]
        mu_last = s
        mu = mu + mu_last
    mu = mu / n_time
    state_visitation = mu_last * f
    print "State Visitation Frequency calculated."
    return np.sum(state_visitation.reshape(2, 301 * 301 * 101 * 11),
                  axis=1), policy
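
The state visitation frequency loop in this example runs over a 301 x 301 x 101 x 11 grid, which makes it hard to trace. The snippet below is a small, self-contained sketch of the same idea on a hypothetical 1-D grid with a fixed stochastic policy; the grid size, transition rule, and uniform policy are illustrative assumptions, not values taken from the example.

import numpy as np

# Toy setting: 5 states on a line, two actions (step left / step right).
n_states, n_time = 5, 10
actions = [-1, +1]
policy = np.full((len(actions), n_states), 0.5)   # uniform stochastic policy

mu = np.zeros(n_states)
mu[0] = 1.0                                       # all probability mass starts in state 0
mu_sum = mu.copy()
for _ in range(n_time):
    nxt = np.zeros(n_states)
    for a_idx, a in enumerate(actions):
        new_states = np.clip(np.arange(n_states) + a, 0, n_states - 1)
        # mass leaving each state under this action lands on its successor state
        np.add.at(nxt, new_states, policy[a_idx] * mu)
    mu = nxt
    mu_sum += mu
svf = mu_sum / n_time                             # averaged state visitation frequency
print(svf)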
Example #2
Z = np.empty([0, 1])
trajectories_probability = np.empty([len(state_trajectories), 1],
                                    dtype='float32')
for n in range(0, n_iterations):
    print "Iteration: ", n
    trajectories_reward = []
    trajectories_features = []
    for state_trajectory in state_trajectories:
        trajectory_reward = np.zeros([1, 1], dtype='float32')
        trajectory_features = np.zeros([2, 1], dtype='float32')
        for iter in range(0, state_trajectory.shape[0]):
            x = np.atleast_2d(state_trajectory[iter, 0])
            y = np.atleast_2d(state_trajectory[iter, 1])
            vtheta = np.atleast_2d(state_trajectory[iter, 2])
            speed = np.atleast_2d(state_trajectory[iter, 3])
            r, f = features.reward(x, y, vtheta, speed, weights)
            trajectory_reward = trajectory_reward + r
            trajectory_features = trajectory_features + np.vstack((f[0], f[1]))
        trajectories_reward.append(trajectory_reward)
        trajectories_features.append(trajectory_features)
    # print trajectory_features
    # print len(trajectories_reward)
    trajectories_probability = np.exp(trajectories_reward)
    feature_state, policy = cudatrial.get_policy(weights, rl_iter, svf_iter)
    # print sum(feature_state.reshape(301*301*101*11,1))
    Z = np.vstack((Z, sum(trajectories_reward)))
    # # trajectories_probability.reshape((len(trajectories_reward),1))
    # L=np.vstack((L,sum(trajectories_reward)/n_traj - np.log(Z)))
    # # if L[n]<L[n-1]:
    # #     break
    #
    print "Iteration: ", n
    trajectories_reward = []
    trajectories_features = []
    trajectory_reward = np.zeros([1, 1], dtype='float32')
    trajectory_features = np.zeros([2, 1], dtype='float32')
    for iter in range(0, state_trajectories.shape[0]):
        rot_par_r = state_trajectories[iter, 0]
        rot_par_p = state_trajectories[iter, 1]
        rot_par_y = state_trajectories[iter, 2]
        end_pos_x = state_trajectories[iter, 3]
        end_pos_y = state_trajectories[iter, 4]
        end_pos_z = state_trajectories[iter, 5]

        r, f = features.reward(
            np.array([
                rot_par_r, rot_par_p, rot_par_y, end_pos_x, end_pos_y,
                end_pos_z
            ]), weights)
        trajectory_reward = trajectory_reward + r
        trajectory_features = trajectory_features + np.vstack((f[0], f[1]))
    trajectories_reward.append(trajectory_reward)
    trajectories_features.append(trajectory_features)
    # print trajectory_features
    # print len(trajectories_reward)
    trajectories_probability = np.exp(trajectories_reward)
    feature_state, policy = mdp_obj.get_policy(weights, rl_iter, svf_iter)
    # print sum(feature_state.reshape(301*301*101*11,1))
    Z = np.vstack((Z, sum(trajectories_reward)))
    # # trajectories_probability.reshape((len(trajectories_reward),1))
    # L=np.vstack((L,sum(trajectories_reward)/n_traj - np.log(Z)))
    # # if L[n]<L[n-1]:
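
This loop accumulates the demonstrated trajectories' rewards and feature counts and queries get_policy for the current policy's feature expectations, but the weight update itself is not part of the snippet. A minimal sketch of a MaxEnt-IRL style gradient step is shown below with made-up numbers; the learning rate, the two-feature dimensionality, and the plain gradient-ascent rule are assumptions.

import numpy as np

learning_rate = 0.05                      # assumed step size
weights = np.array([0.5, -0.3])           # current reward weights for two features

# Average feature counts of the demonstrated trajectories
# (the role played by trajectories_features above).
expert_features = np.array([1.8, 0.9])

# Feature expectations of the current policy
# (the role played by feature_state returned from get_policy above).
policy_features = np.array([1.2, 1.4])

# MaxEnt IRL gradient: demonstration feature counts minus the policy's
# expected feature counts under the current reward weights.
gradient = expert_features - policy_features
weights = weights + learning_rate * gradient
print(weights)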
Example #4
    def get_policy(self, weights, n_iter, n_time):

        print('Creating state space...')
        self.model_state_values = np.meshgrid(self.model_rot_r_val, self.model_rot_p_val, self.model_rot_y_val,
                                              self.model_pos_x_val, self.model_pos_y_val, self.model_pos_z_val,
                                              sparse=True)
        print('State space created.')

        # Creating action set
        # The rotation values have accuracy of 0.01 and position values have 0.001 accuracy
        for rot_r in [-0.01, 0, 0.01]:
            for rot_p in [-0.01, 0, 0.01]:
                for rot_y in [-0.01, 0, 0.01]:
                    for pos_x in [-0.001, 0, 0.001]:
                        for pos_y in [-0.001, 0, 0.001]:
                            for pos_z in [-0.001, 0, 0.001]:
                                self.action_set.append(np.array([rot_r, rot_p, rot_y, pos_x, pos_y, pos_z]))

        # Get the reward and feature values for all the model state values
        self.r, self.f = features.reward(self.model_state_values, weights)

        # Get the index value for each of the model state value
        self.model_index_values = self.get_indices(self.model_state_values)

        policy = []
        for iter in range(0, n_iter):
            action_value = []
            policy = []
            print "Policy Iteration:", iter
            # start_time = t.time()

            with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
                if iter == 0:
                    func = self.initial_loop
                else:
                    func = self.main_loop
                for q, p in executor.map(func, self.action_set):
                    action_value.append(q)
                    policy.append(p)

            print "Evaluating Policy..."
            policy = policy/sum(policy)
            self.v = sum(policy*action_value)
            # end_time = t.time()
            # print end_time-start_time

        # mu = np.empty([301,301,101,11])
        print "Final Policy evaluated."
        print "Calulating State Visitation Frequency..."
        # Gaussian bump centred at the origin of the six-dimensional state grid;
        # broadcasting over the sparse meshgrid arrays yields the full 11**6 grid.
        mu = np.exp(-self.model_state_values[0]**2) * np.exp(-self.model_state_values[1]**2) * \
             np.exp(-self.model_state_values[2]**2) * np.exp(-self.model_state_values[3]**2) * \
             np.exp(-self.model_state_values[4]**2) * np.exp(-self.model_state_values[5]**2)
        mu_reshape = np.reshape(mu, [11*11*11*11*11*11, 1])
        mu = mu/sum(mu_reshape)
        mu_last = mu
        print "Initial State Frequency calculated..."
        for time in range(0, n_time):
            s = np.zeros([11, 11, 11, 11, 11, 11])
            for act_index, action in enumerate(self.action_set):
                new_state_values = self.get_next_state(self.model_state_values, action)

                new_index_values = self.get_indices(new_state_values)

                p = policy[act_index, self.model_index_values]
                s = s + p*mu_last[new_index_values]
            mu_last = s
            mu = mu + mu_last
        mu = mu/n_time
        state_visitation = mu_last*self.f
        print "State Visitation Frequency calculated."
        return np.sum(state_visitation.reshape(2, 11*11*11*11*11*11), axis=1), policy
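
The six nested loops that build self.action_set are a Cartesian product of the per-dimension increments. An equivalent, more compact construction is sketched below, assuming the same increment values as in the example.

import itertools
import numpy as np

rot_steps = [-0.01, 0, 0.01]      # roll / pitch / yaw increments
pos_steps = [-0.001, 0, 0.001]    # x / y / z position increments

action_set = [np.array(a) for a in itertools.product(rot_steps, rot_steps, rot_steps,
                                                     pos_steps, pos_steps, pos_steps)]
print(len(action_set))            # 3**6 = 729 actions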