def logLikelihoodOfTrajectory(trajectory, theta, feature_matrix, gw, discount):
    """Log-likelihood of a trajectory under the policy induced by reward theta."""
    n_states, d_states = feature_matrix.shape
    r = feature_matrix.dot(theta)
    transition_probability = gw.transition_probability
    n_actions = gw.n_actions
    # Stochastic policy: Q[s, a] is the probability of taking action a in state s.
    Q = value_iteration.find_policy(n_states, n_actions,
                                    transition_probability, r, discount)
    logLike = 0.0
    for i in range(np.size(trajectory, 0) - 1):
        start_state = trajectory[i][0]
        next_state = trajectory[i + 1][0]
        # Recover the action most likely to have caused the observed transition.
        mostProbableAction = np.argmax(
            transition_probability[start_state, :, next_state])
        actProb = Q[start_state][mostProbableAction]
        logLike += math.log(actProb)
    return logLike
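# A minimal usage sketch (assumed setup, not from the original): `gw` is any
# gridworld object exposing `transition_probability` of shape (N, A, N) and
# `n_actions`; `feature_matrix` has one row of features per state; the
# trajectory is a sequence of (state, action) pairs.
import numpy as np
trajectory = np.array([[0, 1], [1, 1], [2, 0]])  # (state, action) pairs
theta = np.zeros(feature_matrix.shape[1])        # Initial reward-weight guess
ll = logLikelihoodOfTrajectory(trajectory, theta, feature_matrix, gw,
                               discount=0.9)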
def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, which is a proxy for how good a
    recovered reward function is.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    reward: Reward vector mapping state int to reward. Shape (N,).
    discount: Discount factor. float.
    p_start_state: Probability vector with the ith component as the
        probability that the ith state is the start state. Shape (N,).
    optimal_value: Value vector for the ground reward with optimal policy.
        The ith component is the value of the ith state. Shape (N,).
    true_reward: True reward vector. Shape (N,).
    -> Expected value difference. float.
    """
    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)
    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd
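# A small worked example for expected_value_difference, assuming the
# `value_iteration` module used above is importable. The 2-state MDP and all
# numbers below are illustrative only.
import numpy as np

n_states, n_actions = 2, 2
P = np.zeros((n_states, n_actions, n_states))
P[:, 0, 0] = 1.0                        # Action 0 always leads to state 0
P[:, 1, 1] = 1.0                        # Action 1 always leads to state 1
true_reward = np.array([0.0, 1.0])      # Ground-truth reward
recovered_reward = np.array([0.1, 0.9])
p_start = np.array([0.5, 0.5])
optimal_value = value_iteration.value(
    np.array([1, 1]),                   # Optimal policy: always take action 1
    n_states, P, true_reward, 0.9)
evd = expected_value_difference(n_states, n_actions, P, recovered_reward,
                                0.9, p_start, optimal_value, true_reward)
# A recovered reward that preserves the true ordering should give EVD ~= 0.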
def calculate_policy(self, n_actions, transition_probability, feature_matrix,
                     trajectories, valid_actions={}, return_rewards=False):
    ## IRL rewards
    rewards, _, feature_rewards = self.irl(n_actions, transition_probability,
                                           feature_matrix, trajectories,
                                           valid_actions)
    ## Reconstruct policy based on learned rewards
    policy = find_policy(self.n_states, self.n_actions,
                         self.transition_probability, rewards, self.discount,
                         stochastic=False, valid_actions=self.valid_actions,
                         consider_valid_only=True)
    if return_rewards:
        return policy, feature_rewards
    else:
        return policy
def find_expected_svf(n_states, r, n_actions, discount,
                      transition_probability, trajectories):
    """Find the expected state visitation frequencies."""
    n_trajectories = trajectories.shape[0]
    trajectory_length = trajectories.shape[1]

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, r, discount)

    # Empirical start-state distribution from the demonstrations.
    start_state_count = np.zeros(n_states)
    for trajectory in trajectories:
        start_state_count[trajectory[0, 0]] += 1
    p_start_state = start_state_count / n_trajectories

    expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T
    for t in range(1, trajectory_length):
        expected_svf[:, t] = 0
        for i, j, k in product(range(n_states), range(n_actions),
                               range(n_states)):
            expected_svf[k, t] += (expected_svf[i, t - 1] *
                                   policy[i, j] *  # Stochastic policy
                                   transition_probability[i, j, k])

    return expected_svf.sum(axis=1)
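# The triple loop above costs O(N^2 * A) Python-level iterations per time
# step. A minimal vectorized sketch of the same recursion (an illustration,
# not part of the original code): one einsum contracts the previous
# visitation vector with the stochastic policy and the transition tensor.
def find_expected_svf_vectorized(p_start_state, policy, transition_probability,
                                 trajectory_length):
    import numpy as np
    n_states = p_start_state.shape[0]
    expected_svf = np.zeros((n_states, trajectory_length))
    expected_svf[:, 0] = p_start_state
    for t in range(1, trajectory_length):
        # D_t[k] = sum_{i,j} D_{t-1}[i] * policy[i, j] * P[i, j, k]
        expected_svf[:, t] = np.einsum('i,ij,ijk->k', expected_svf[:, t - 1],
                                       policy, transition_probability)
    return expected_svf.sum(axis=1)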
def get_policy(self):
    ## Calculate optimal policy for gridworld using value iteration
    rewards = np.dot(self.colors, self.reward_weights)
    policy, multiple_state_indices = find_policy(self.n_states,
                                                 self.n_actions,
                                                 self.transition_probas,
                                                 rewards, discount=0.9,
                                                 stochastic=False,
                                                 valid_actions=self.valid_actions,
                                                 return_multiple=True)
    return policy, multiple_state_indices
def find_expected_svf_MC(self, r, p_start_state):
    ## Calculates probability for each action and state - stochastic policy
    ## (lines 1-3, alg. 1)
    policy = find_policy(self.n_states, self.n_actions,
                         self.transition_probability, r, self.discount,
                         valid_actions=self.valid_actions,
                         consider_valid_only=True)

    ## Initialize SVF matrix and lock
    expected_svf = np.zeros((self.n_states, self.rollout_horizon))
    expected_svf[:, 0] = p_start_state
    if self.par:
        lock_expected_svf = Lock()

    ## Run MC rollouts to estimate state visitation frequencies
    ## (at most 100 threads at a time in the parallel case)
    rollouts = []
    for j in range(0, self.mc_rollouts, 100):
        if self.par:
            par_rollouts = min(100, self.mc_rollouts - j)
            threads = []
            for i in range(par_rollouts):
                rollout_i = Thread(name='rollout' + str(i),
                                   target=self.MC_rollout_par,
                                   args=(p_start_state, policy, expected_svf,
                                         lock_expected_svf))
                threads.append(rollout_i)
            ## Start all threads, then wait for all of them to finish
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
        else:
            ## Serial fallback: run this chunk's rollouts one at a time
            for _ in range(min(100, self.mc_rollouts - j)):
                rollouts.append(self.MC_rollout(p_start_state, policy))

    if not self.par:
        for state_visitation_counts in rollouts:
            ## Accumulate per-rollout visit counts into expected_svf in place
            np.add(expected_svf, state_visitation_counts, expected_svf)

    expected_svf[:, 1:] = expected_svf[:, 1:] / self.mc_rollouts
    return expected_svf.sum(axis=1), policy
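# MC_rollout itself is not shown in this section; the sketch below is a
# hypothetical single-rollout implementation under the stated assumptions
# (stochastic policy rows sum to 1, transition_probability has shape
# (N, A, N)). Note that because of CPython's GIL, Thread-based rollouts only
# overlap I/O, not CPU-bound sampling.
import numpy as np

def mc_rollout_sketch(p_start_state, policy, transition_probability,
                      rollout_horizon, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    n_states, n_actions, _ = transition_probability.shape
    counts = np.zeros((n_states, rollout_horizon))
    state = rng.choice(n_states, p=p_start_state)
    for t in range(rollout_horizon):
        counts[state, t] += 1  # One visit per time step
        action = rng.choice(n_actions, p=policy[state])
        state = rng.choice(n_states, p=transition_probability[state, action])
    return counts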
def maxent_irl(sample_paths, feature_matrix, transition_probability, discount,
               iterations, learning_rate):
    """
    Find the reward function from the list of games (sample_paths).

    sample_paths: A list of paths. One path = one game.
    feature_matrix: NxD matrix (N = number of states, D = number of features).
    transition_probability: NxAxN array (N = number of states, A = number of
        actions); each element contains P(next state | current state, action).
    discount: Discount factor for the MDP.
    iterations: Number of gradient descent steps.
    learning_rate: Gradient descent rate.
    -> Reward weight vector theta of size D; dot with feature_matrix for the
       per-state reward vector of size N.
    """
    N_STATES, N_ACTIONS, _ = np.shape(transition_probability)

    # Initialize the reward weights randomly; the gradient steps below adjust
    # them to match the demonstrated feature expectations.
    theta = rn.uniform(size=(feature_matrix.shape[1],))

    # Calculate empirical feature expectations from the demonstrations.
    feature_expectations = np.zeros(feature_matrix.shape[1])
    for path in sample_paths:
        for state, _, _ in path:
            feature_expectations += feature_matrix[state]
    feature_expectations /= len(sample_paths)  # Average over paths

    for _ in range(iterations):
        # 1. Solve for the optimal policy w.r.t. the current rewards with
        #    value iteration.
        rewards = feature_matrix.dot(theta)  # Vector of reward values
        policy = value_iteration.find_policy(N_STATES, N_ACTIONS,
                                             transition_probability, rewards,
                                             discount)
        # 2. Solve for the state visitation frequencies P(s | theta, T).
        svf = compute_svf(sample_paths, transition_probability, discount,
                          policy)
        # 3. Compute the gradient.
        gradient = feature_expectations - feature_matrix.T.dot(svf)
        # 4. Update theta with one gradient step.
        theta += learning_rate * gradient

    # return feature_matrix.dot(theta).reshape((N_STATES,))
    return theta
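# A minimal usage sketch for maxent_irl, assuming value_iteration and
# compute_svf are importable as used above. The toy MDP (2 states, 2 actions,
# one-hot state features) and the (state, action, reward) triple format of a
# path are illustrative assumptions, matching the three-way unpacking above.
import numpy as np

n_states, n_actions = 2, 2
feature_matrix = np.eye(n_states)      # One-hot state features
transition_probability = np.zeros((n_states, n_actions, n_states))
transition_probability[:, 0, 0] = 1.0  # Action 0 always moves to state 0
transition_probability[:, 1, 1] = 1.0  # Action 1 always moves to state 1
# One demonstration that stays in state 1.
sample_paths = [[(1, 1, 0.0), (1, 1, 0.0), (1, 1, 0.0)]]
theta = maxent_irl(sample_paths, feature_matrix, transition_probability,
                   discount=0.9, iterations=100, learning_rate=0.01)
rewards = feature_matrix.dot(theta)    # Per-state reward estimates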
def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, which is a proxy for how good a
    recovered reward function is.
    """
    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)
    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd
def find_expected_svf(n_states, r, n_actions, discount,
                      transition_probability, trajectories):
    """
    Find the expected state visitation frequencies using algorithm 1 from
    Ziebart et al. 2008.

    n_states: Number of states N. int.
    r: Reward. NumPy array with shape (N,).
    n_actions: Number of actions A. int.
    discount: Discount factor of the MDP. float.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    trajectories: 3D array of state/action pairs. States are ints, actions
        are ints. NumPy array with shape (T, L, 2) where T is the number of
        trajectories and L is the trajectory length.
    -> Expected state visitation frequencies vector with shape (N,).
    """
    n_trajectories = trajectories.shape[0]
    trajectory_length = trajectories.shape[1]

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, r, discount)

    start_state_count = np.zeros(n_states)
    for trajectory in trajectories:
        start_state_count[trajectory[0, 0]] += 1
    p_start_state = start_state_count / n_trajectories

    expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T
    # The time loop must be outermost so that column t - 1 is complete before
    # column t is computed.
    for t in range(1, trajectory_length):
        for s in range(n_states):
            # This variant assumes a deterministic policy: policy[pre_s] is
            # the single action taken in state pre_s.
            expected_svf[s, t] = sum(
                expected_svf[pre_s, t - 1] *
                transition_probability[pre_s, int(policy[pre_s]), s]
                for pre_s in range(n_states))

    return expected_svf.sum(axis=1)
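# With a deterministic policy the recursion collapses to repeated
# multiplication by the policy-induced transition matrix. A minimal
# vectorized sketch (an illustration, not part of the original code):
def find_expected_svf_deterministic(p_start_state, policy,
                                    transition_probability, trajectory_length):
    import numpy as np
    n_states = p_start_state.shape[0]
    # P_pi[i, k] = P(k | i, policy[i])
    P_pi = transition_probability[np.arange(n_states), policy.astype(int)]
    expected_svf = np.zeros((n_states, trajectory_length))
    expected_svf[:, 0] = p_start_state
    for t in range(1, trajectory_length):
        expected_svf[:, t] = expected_svf[:, t - 1] @ P_pi
    return expected_svf.sum(axis=1)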
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld
    MDP. Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
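# Example invocation with illustrative hyperparameters (assumed values, not
# taken from the original): a 10x10 objectworld, discount 0.9, 15 objects,
# 2 colours, 20 demonstrations, 50 gradient epochs, learning rate 0.01.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 50, 0.01)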
def find_expected_svf(self, r, p_start_state):
    ## Calculates probability for each action and state (lines 1-3, alg. 1)
    policy = find_policy(self.n_states, self.n_actions,
                         self.transition_probability, r, self.discount,
                         valid_actions=self.valid_actions,
                         consider_valid_only=True)

    expected_svf = np.tile(p_start_state, (self.rollout_horizon, 1)).T
    for t in range(1, self.rollout_horizon):
        expected_svf[:, t] = 0
        for i, j, k in product(range(self.n_states), range(self.n_actions),
                               range(self.n_states)):
            # Line 5, alg. 1
            expected_svf[k, t] += (expected_svf[i, t - 1] *
                                   policy[i, j] *  ## Stochastic policy
                                   self.transition_probability[i, j, k])

    return expected_svf.sum(axis=1), policy
def repeat_find_policy(N):
    # Benchmark helper: run value iteration N times on the module-level MDP.
    # Note the (n_states, r, n_actions, discount, transition_probability)
    # argument order of this find_policy variant.
    for _ in range(N):
        find_policy(N_STATES, rewards, N_ACTIONS, 0.99, tprob)
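# A minimal timing harness for the benchmark helper above, assuming N_STATES,
# N_ACTIONS, rewards, and tprob are defined at module scope (they are not
# shown in this section).
import timeit
elapsed = timeit.timeit(lambda: repeat_find_policy(10), number=1)
print('10 find_policy calls: %.3f s' % elapsed)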