def find_expected_svf(self, discount, total_states, trajectories, reward):
        # Create the robot Markov model object
        robot_mdp = RobotMarkovModel()
        # Get the state and action values of the expert trajectories
        state_values_from_trajectory, _ = robot_mdp.return_trajectories_data()
        # Find the terminal state value from the expert trajectory:
        # index 0 selects the first trajectory and total_states - 1 selects its
        # last state, which is used as the terminal state
        terminal_state_val_from_trajectory = state_values_from_trajectory[0][
            total_states - 1]
        # Create the state space model for the required grid size
        env_obj = RobotStateUtils(self.grid_size, discount,
                                  terminal_state_val_from_trajectory)
        states = env_obj.create_state_space_model_func()
        action = env_obj.create_action_set_func()
        # print "State space created is ", states
        # V = {}
        # for state in env_obj.states:
        #     V[state] = 0
        # # Reinitialize policy
        # policy = {}
        # for state in env_obj.states:
        #     policy[state] = [key for key in env_obj.action_space]
        # values_of_states, policy = env_obj.value_iteration(V, policy, 0.01)
        # policy = np.random.randint(27, size=1331)
        # print "policy is ", policy
        # Compute the expected state visitation frequency for the expert trajectories
        expected_svf = env_obj.compute_state_visitation_frequency(trajectories)

        # Return the expected state visitation frequency
        return expected_svf
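# --- Illustrative sketch (not part of the class above) ---------------------------
# RobotStateUtils.compute_state_visitation_frequency is not included in this
# listing. The sketch below shows the usual MaxEnt IRL forward pass (Ziebart et
# al., 2008) for a deterministic grid world: start from the distribution over the
# expert start states, push it through the current policy for the length of a
# trajectory, then sum over time. The names and shapes used here (transition_mat as
# an (n_states, n_actions) array of successor indices, trajectories as sequences of
# state indices) are assumptions made for illustration, not the RobotStateUtils API.
import numpy as np  # the surrounding code assumes numpy is already imported as np

def expected_svf_sketch(n_states, transition_mat, policy, trajectories):
    horizon = len(trajectories[0])
    # mu[s, t] = probability of being in state s at time step t
    mu = np.zeros((n_states, horizon))
    # Start-state distribution estimated from the expert trajectories
    for traj in trajectories:
        mu[int(traj[0]), 0] += 1.0 / len(trajectories)
    # Forward pass: propagate the distribution through the current policy
    for t in range(1, horizon):
        for s in range(n_states):
            if mu[s, t - 1] > 0:
                s_next = transition_mat[s, int(policy[s])]
                mu[s_next, t] += mu[s, t - 1]
    # Expected state visitation frequency: sum of the per-step distributions
    return mu.sum(axis=1)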
    def max_ent_irl(self, trajectories, discount, n_trajectories, epochs,
                    learning_rate):
        # Find the number of states in a trajectory (index 0 selects the first expert trajectory)
        total_states = len(trajectories[0])
        # The last state of that trajectory is used as the terminal state value
        terminal_state_val_from_trajectory = trajectories[0][total_states - 1]
        print "Terminal state value is ", terminal_state_val_from_trajectory
        # Create the environment object for the robot workspace
        env_obj = RobotStateUtils(self.grid_size, discount,
                                  terminal_state_val_from_trajectory)
        # Create the states and actions for the environment
        states = env_obj.create_state_space_model_func()
        action = env_obj.create_action_set_func()
        print "states is ", states
        print "action are ", action
        filename = "/home/vvarier/dvrk_automated_suturing/max_ent_trial4_parallel_compute_grid_size/trial4_grid11_parallel/weights_grid11.txt"
        # Create the feature matrix for the given robot environment
        # Currently an identity feature matrix is used
        feat_map = np.eye(self.grid_size**3)
        # feat_map = env_obj.get_feature_matrix()
        # Initialize with random weights based on the dimensionality of the states
        weights = np.random.uniform(size=(feat_map.shape[1], ))
        # Find the feature expectations: sum of the features over the expert trajectories divided by the number of trajectories
        feature_expectations = self.find_feature_expectations(
            feat_map, trajectories)
        # print "feature array is ", feature_array_all_trajectories[0][0:total_states]
        # Gradient update on the weights
        for i in range(epochs):
            if i % 20 == 0:
                print "Epoch running is ", i
                print "weights is ", weights
                file_open = open(filename, 'a')
                savetxt(file_open,
                        weights,
                        delimiter=',',
                        fmt="%10.5f",
                        newline=", ")
                file_open.write("\n \n \n \n")
                file_open.close()
            reward = np.dot(feat_map, weights)
            optimal_policy, expected_svf = self.find_expected_svf(
                weights, discount, total_states, trajectories, reward)
            # Computes the gradient
            grad = feature_expectations - feat_map.T.dot(expected_svf)
            # Decay the learning rate after the first 50 iterations
            if i < 50:
                learning_rate = 0.1
            else:
                learning_rate = 0.01
            # Gradient descent for finding new value of weights
            weights += learning_rate * np.transpose(grad)
            # print "weights is ", weight
        # Compute the reward of the trajectory based on the weights value calculated
        trajectory_reward = np.dot(feat_map, weights)

        return trajectory_reward, weights
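# --- Illustrative sketch (not part of the class above) ---------------------------
# find_feature_expectations is called in max_ent_irl but not shown in this listing.
# With an identity feat_map (one row per state) the expert feature expectation is
# simply how often each state is visited, averaged over the expert trajectories.
# This sketch assumes each trajectory is a sequence of state indices; in the code
# above the trajectories hold 3D coordinates, which would first have to be mapped
# to grid indices (e.g. with RobotStateUtils.get_state_val_index).
def find_feature_expectations_sketch(feat_map, trajectories):
    feature_expectations = np.zeros(feat_map.shape[1])
    for trajectory in trajectories:
        for state_idx in trajectory:
            feature_expectations += feat_map[int(state_idx)]
    # Average the summed features over the number of expert trajectories
    return feature_expectations / len(trajectories)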
    def find_expected_svf(self, weights, discount):
        # Create the robot state utils object for the required grid size
        env_obj = RobotStateUtils(11, weights)
        states = env_obj.create_state_space_model_func()
        action = env_obj.create_action_set_func()
        # print "State space created is ", states
        P_a = env_obj.get_transition_mat_deterministic()
        # print "P_a is ", P_a
        # print "shape of P_a ", P_a.shape
        # Calculate the reward and feature vector for every state in the state space
        rewards = []
        state_features = []
        for i in range(len(states)):
            r, f = env_obj.reward_func(states[i][0], states[i][1], states[i][2], weights)
            rewards.append(r)
            state_features.append(f)
        # print "rewards is ", rewards
        value, policy = env_obj.value_iteration(P_a, rewards, gamma=discount)
        # policy = np.random.randint(27, size=1331)
        print "policy is ", policy
        robot_mdp = RobotMarkovModel()
        # Get the expert trajectory states
        expert_trajectory_states, _ = robot_mdp.return_trajectories_data()
        expected_svf = env_obj.compute_state_visitation_frequency(P_a, expert_trajectory_states, policy)
        # Formats the features array in a way that it can be multiplied with the svf values
        state_features = np.array(state_features).T  # shape: (feature_dim, n_states)
        print "svf is ", expected_svf
        print "svf shape is ", expected_svf.shape

        # Return the policy, the state features of the state space and the expected state visitation frequency
        return policy, state_features, expected_svf
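# --- Illustrative sketch (not part of the class above) ---------------------------
# RobotStateUtils.value_iteration is not shown in this listing (and is called with
# different signatures in the different versions of the code here). The sketch
# below is one plausible version for a deterministic grid world, assuming P_a is an
# (n_states, n_actions) array of successor state indices and rewards holds one
# entry per state; it returns the converged values and the greedy policy.
def value_iteration_sketch(P_a, rewards, gamma=0.9, tol=1e-4):
    rewards = np.asarray(rewards, dtype=float)
    n_states, n_actions = P_a.shape
    values = np.zeros(n_states)
    while True:
        # Q[s, a] = r(s) + gamma * V(successor state of s under action a)
        q_values = rewards[:, None] + gamma * values[P_a]
        new_values = q_values.max(axis=1)
        delta = np.max(np.abs(new_values - values))
        values = new_values
        if delta < tol:
            break
    # Greedy policy with respect to the converged value function
    policy = q_values.argmax(axis=1)
    return values, policy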
    def max_ent_irl(self, trajectories, discount, n_trajectories, epochs,
                    learning_rate):
        # Find the number of states in a trajectory (index 0 selects the first expert trajectory)
        total_states = len(trajectories[0])
        # The last state of that trajectory is used as the terminal state value
        terminal_state_val_from_trajectory = trajectories[0][total_states - 1]
        print "Terminal state value is ", terminal_state_val_from_trajectory
        # Create the environment object for the robot workspace
        env_obj = RobotStateUtils(self.grid_size, discount,
                                  terminal_state_val_from_trajectory)
        # Create the states and actions for the environment
        states = env_obj.create_state_space_model_func()
        action = env_obj.create_action_set_func()
        print "states is ", states
        print "action are ", action
        # Create the feature matrix for the given robot environment
        # Currently an identity feature matrix is used
        feat_map = np.eye(self.grid_size**3)
        # feat_map = env_obj.get_feature_matrix()
        # Initialize with random weights based on the dimensionality of the states
        weights = np.random.uniform(size=(feat_map.shape[1], ))
        # Find the feature expectations: sum of the features over the expert trajectories divided by the number of trajectories
        feature_expectations = self.find_feature_expectations(
            feat_map, trajectories)
        # print "feature array is ", feature_array_all_trajectories[0][0:total_states]
        # Gradient update on the weights
        for i in range(epochs):
            print "Epoch running is ", i
            # Multiply the feature map by the current weights to obtain the per-state reward
            reward = np.dot(feat_map, weights)
            # Function which returns the policy being followed and the state visitation frequency
            optimal_policy, expected_svf = self.find_expected_svf(
                weights, discount, total_states, trajectories, reward)
            # Computes the gradient
            grad = feature_expectations - feat_map.T.dot(expected_svf)
            # Change the learning rate based on the iteration running
            if i < 100:
                learning_rate = 0.1
            else:
                learning_rate = 0.01
            # Gradient descent for finding new value of weights
            weights += learning_rate * np.transpose(grad)
            print "weights is ", weights
        # Compute the reward of the trajectory based on the weights value calculated
        trajectory_reward = np.dot(feat_map, weights)

        return trajectory_reward, weights
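# --- Illustrative sketch --------------------------------------------------------
# The gradient used above, feature_expectations - feat_map.T.dot(expected_svf), is
# the standard MaxEnt IRL gradient: expert feature expectations minus the feature
# expectations induced by the current policy. With an identity feature map this
# reduces to a difference of state visitation frequencies. The toy numbers below
# are made up purely to illustrate the direction of the weight update.
def maxent_gradient_toy_example():
    feat_map = np.eye(4)                                      # identity features, 4 states
    feature_expectations = np.array([0.5, 0.25, 0.25, 0.0])   # expert visitation
    expected_svf = np.array([0.4, 0.4, 0.1, 0.1])             # learner visitation
    grad = feature_expectations - feat_map.T.dot(expected_svf)
    # States the expert visits more often than the learner get their reward weight
    # increased; states the learner over-visits get it decreased.
    return grad   # array([ 0.1 , -0.15,  0.15, -0.1 ])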
    def find_expected_svf(self, weights, discount, total_states, trajectories,
                          reward):
        # Create the robot Markov model object
        robot_mdp = RobotMarkovModel()
        # Get the state and action values of the expert trajectories
        state_values_from_trajectory, _ = robot_mdp.return_trajectories_data()
        # Find the terminal state value from the expert trajectory:
        # index 0 selects the first trajectory and total_states - 1 selects its
        # last state, which is used as the terminal state
        terminal_state_val_from_trajectory = state_values_from_trajectory[0][
            total_states - 1]
        # Create the state space model for the required grid size
        env_obj = RobotStateUtils(self.grid_size, discount,
                                  terminal_state_val_from_trajectory)
        states = env_obj.create_state_space_model_func()
        action = env_obj.create_action_set_func()
        # print "State space created is ", states
        P_a = env_obj.get_transition_mat_deterministic()
        # print "P_a is ", P_a
        # Run value iteration on the current reward to obtain the policy
        policy = env_obj.value_iteration(reward)
        # policy = np.random.randint(27, size=1331)
        # print "policy is ", policy
        # Compute the expected state visitation frequency under the current policy
        expected_svf = env_obj.compute_state_visitation_frequency(
            trajectories, policy)

        # Return the policy and the expected state visitation frequency
        return policy, expected_svf
    def find_expected_svf(self, grid_size, weights, discount, total_states):
        # Create the robot Markov model object
        robot_mdp = RobotMarkovModel()
        # Get the state and action values of the expert trajectories
        state_values_from_trajectory, _ = robot_mdp.return_trajectories_data()
        # Find the terminal state value from the expert trajectory:
        # index 0 selects the first trajectory and total_states - 1 selects its
        # last state, which is used as the terminal state
        terminal_state_val_from_trajectory = state_values_from_trajectory[0][
            total_states - 1]
        # Create the state space model for the required grid size
        env_obj = RobotStateUtils(grid_size, weights,
                                  terminal_state_val_from_trajectory)
        states = env_obj.create_state_space_model_func()
        action = env_obj.create_action_set_func()
        # print "State space created is ", states
        P_a = env_obj.get_transition_mat_deterministic()
        # print "P_a is ", P_a
        # Calculate the reward and feature vector for every state in the state space
        rewards = []
        state_features = []
        for i in range(len(states)):
            r, f = env_obj.reward_func(states[i][0], states[i][1],
                                       states[i][2], weights)
            rewards.append(r)
            state_features.append(f)
        # print "rewards is ", rewards
        value, policy = env_obj.value_iteration(P_a, rewards, gamma=discount)
        # policy = np.random.randint(27, size=1331)
        print "policy is ", policy
        # Get the expert trajectory states
        expert_trajectory_states, _ = robot_mdp.return_trajectories_data()
        expected_svf = env_obj.compute_state_visitation_frequency(
            P_a, expert_trajectory_states, policy)
        # Formats the features array in a way that it can be multiplied with the svf values
        state_features = np.array(state_features).T  # shape: (feature_dim, n_states)
        print "svf is ", expected_svf
        print "svf shape is ", expected_svf.shape

        # Return the policy, the state features of the state space and the expected state visitation frequency
        return policy, state_features, expected_svf
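# --- Illustrative sketch (not part of the class above) ---------------------------
# RobotStateUtils.get_transition_mat_deterministic is not shown in this listing.
# The commented-out "np.random.randint(27, size=1331)" above suggests 11**3 grid
# states and 27 actions, consistent with moves of {-1, 0, +1} along each of x, y
# and z. The layout below (an (n_states, n_actions) array of successor indices) is
# an assumption made for illustration only.
def transition_mat_sketch(grid_size=11):
    n_states = grid_size ** 3
    moves = [(dx, dy, dz) for dz in (-1, 0, 1)
             for dy in (-1, 0, 1)
             for dx in (-1, 0, 1)]                 # 27 actions
    P_a = np.zeros((n_states, len(moves)), dtype=int)
    for s in range(n_states):
        x = s % grid_size
        y = (s // grid_size) % grid_size
        z = s // grid_size ** 2
        for a, (dx, dy, dz) in enumerate(moves):
            nx, ny, nz = x + dx, y + dy, z + dz
            # Moves that would leave the grid keep the agent in place
            if 0 <= nx < grid_size and 0 <= ny < grid_size and 0 <= nz < grid_size:
                P_a[s, a] = nx + ny * grid_size + nz * grid_size ** 2
            else:
                P_a[s, a] = s
    return P_a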
Example #7
import numpy as np
from robot_state_utils import RobotStateUtils
from numpy import savetxt
from robot_markov_model import RobotMarkovModel

if __name__ == '__main__':
    # Location to store the computed policy
    file_name = "/home/vignesh/Desktop/individual_trials/version4/data1/policy_grid11.txt"
    # term_state = np.random.randint(0, grid_size ** 3)]
    goal = np.array([0.005, 0.055, -0.125])
    # Create objects of classes
    env_obj = RobotStateUtils(11, 0.01, goal)
    mdp_obj = RobotMarkovModel()
    # Store the expert trajectories
    trajectories = mdp_obj.generate_trajectories()
    index_vals = np.zeros(len(trajectories[0]))
    for i in range(len(trajectories[0])):
        index_vals[i] = env_obj.get_state_val_index(trajectories[0][i])
    # print index_vals
    states = env_obj.create_state_space_model_func()
    action = env_obj.create_action_set_func()
    rewards = np.zeros(len(states))
    index = env_obj.get_state_val_index(goal)
    for ele in index_vals:
        if ele == index:
            rewards[int(ele)] = 10
        else:
            rewards[int(ele)] = 1

    policy = env_obj.value_iteration(rewards)
    file_open = open(file_name, 'a')
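# --- Illustrative sketch --------------------------------------------------------
# get_state_val_index (from RobotStateUtils) maps a continuous 3D point to its grid
# cell index but is not shown in this listing. The version below is only a guess at
# that mapping; the workspace lower bound and resolution (min_val, step) are
# placeholder values, not the ones actually used by RobotStateUtils.
def get_state_val_index_sketch(point, grid_size=11, min_val=-0.03, step=0.005):
    # Convert each coordinate into a bin number in [0, grid_size - 1]
    bins = np.clip(np.round((np.asarray(point, dtype=float) - min_val) / step),
                   0, grid_size - 1).astype(int)
    # Flatten the (x, y, z) bin numbers into a single state index
    return int(bins[0] + bins[1] * grid_size + bins[2] * grid_size ** 2)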
Example #8
    # policy = np.load(folder_dir + 'second_best_policy.npy')
    # Third best policy learnt by Q learning
    # policy = np.load(folder_dir + 'third_best_policy.npy')
    # Miscellaneous good policies learnt by Q learning
    # policy = np.load(folder_dir + 'misc_policy1.npy')
    # policy = np.load(folder_dir + 'misc_policy2.npy')

    # Define the starting 3D coordinate position of agent
    starting_point = np.array([-0.025, 0.05, -0.135])
    # Define the goal position (3D coordinate) of agent
    goal_point = np.array([0.005, 0.055, -0.125])
    # Initialize an array of visited states by agent
    visited_states = starting_point
    # Create the Object of class which creates the state space and action space
    # Pass the required grid size, discount and terminal state value from the trajectory
    env_obj = RobotStateUtils(11, 0.1, goal_point)
    states = env_obj.create_state_space_model_func()
    action = env_obj.create_action_set_func()
    # Get the starting and goal position index values in the grid world
    start_index = env_obj.get_state_val_index(starting_point)
    goal_index = env_obj.get_state_val_index(goal_point)
    done = False
    current_state_idx = start_index
    print "policy is ", policy[current_state_idx]
    # Repeat until the agent reaches the goal state
    limit_val = 0
    while not done:
        next_state = []
        # Find the action to take based on the policy learnt
        action_idx = policy[current_state_idx]
        print "action to be taken ", action[action_idx]
Example #9
trajectories_z0 = [[0, 0, 0, 0] for i in range(1331)]
default_points = np.array(
    [-0.03, -0.025, -0.02, -0.015, -0.01, -0.005, 0, 0.005, 0.010])

i = 0
# Build the 3D grid points from default_points (so that the grid size/spacing can be varied)
for z in default_points:
    for y in default_points:
        for x in default_points:
            trajectories_z0[i] = [x, y, z, 0.]
            i += 1

# Define the goal position (3D coordinate) of agent
goal = np.array([0.005, 0.055, -0.125])
# Create the Object of class which creates the state space and action space
env_obj = RobotStateUtils(11, 0.1, goal)
states = env_obj.create_state_space_model_func()
action = env_obj.create_action_set_func()
policy = np.zeros(len(states))
rewards = np.zeros(len(states))
index = env_obj.get_state_val_index(goal)
mdp_obj = RobotMarkovModel()
trajectories = mdp_obj.generate_trajectories()
visited_states_index_vals = np.zeros(len(trajectories[0]))
# Find the index values of the visited states
for i in range(len(trajectories[0])):
    visited_states_index_vals[i] = env_obj.get_state_val_index(
        trajectories[0][i])
for ele in visited_states_index_vals:
    if ele == index:
        rewards[int(ele)] = 10