def test_tab_featb_functions():
    env = feature_make('FrozenLake8x8-v0')
    params = np.zeros(64)
    params[-1] = 1.
    rf = FeatureBasedRewardFunction(env, params)
    domain = rf.domain()
    rf2 = TabularRewardFunction(env, params)
    rf_true = make_true_reward('FrozenLake8x8-v0')
    rew1 = rf.reward(domain)
    rew2 = rf2.reward(domain)
    rew_true = rf_true.reward(domain)
    assert np.all(rew_true == rew1)
    assert np.all(rew1 == rew2)
    assert rew_true.shape == rew1.shape
    assert rew1.shape == rew2.shape
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int):
    """Train algorithm. See abstract base class for parameter types."""
    # calculate expert feature expectations
    expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)

    # start with an agent and a random reward function estimate
    agent = self.rl_alg_factory(self.env)
    reward_function = FeatureBasedRewardFunction(self.env, 'random')
    self.env.update_reward_function(reward_function)
    theta = reward_function.parameters

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1
        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))

        # compute policy
        agent.train(no_rl_episodes_per_irl_iteration)
        policy = agent.policy_array()

        # compute state visitation frequencies, discard absorbing state
        svf = self.expected_svf(policy)[:-1]

        # compute gradients
        grad = expert_feature_count - self.feat_map.T.dot(svf)

        # update params
        theta += self.config['lr'] * grad
        reward_function.update_parameters(theta)

        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_function
        }
        self.evaluate_metrics(evaluation_input)

    return theta
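# A note on the gradient step above: MaxEnt IRL ascends the log-likelihood,
# whose gradient is the expert feature expectation minus the feature
# expectation induced by the current policy's state visitation frequencies.
# The following is a minimal, self-contained numpy sketch of that computation
# with toy values (all names below are illustrative, not the library's API):

import numpy as np

n_states, n_features = 4, 4
feat_map = np.eye(n_states, n_features)        # one-hot state features
expert_feature_count = np.array([0.5, 0.3, 0.1, 0.1])
svf = np.array([0.4, 0.4, 0.1, 0.1])           # current state visitation freq.

grad = expert_feature_count - feat_map.T.dot(svf)
theta = np.zeros(n_features)
theta += 0.1 * grad                            # one gradient ascent step
print(grad)  # positive where the expert visits states more often than the policy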
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int):
    """Train algorithm. See abstract base class for parameter types."""
    sa_visit_count, P0 = self.sa_visitations()

    # calculate feature expectations
    expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)

    # initialize the parameters
    reward_function = FeatureBasedRewardFunction(self.env, 'random')
    theta = reward_function.parameters

    agent = self.rl_alg_factory(self.env)

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1
        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))

        reward_wrapper = unwrap_env(self.env, RewardWrapper)
        reward_wrapper.update_reward_parameters(theta)

        # compute policy
        agent.train(no_rl_episodes_per_irl_iteration)
        policy = agent.policy_array()
        state_values = agent.state_values
        q_values = agent.q_values

        # occupancy measure, discard absorbing state
        d = self.occupancy_measure(policy=policy, initial_state_dist=P0)[:-1]

        # negative log-likelihood gradient
        grad = -(expert_feature_count - np.dot(self.feat_map.T, d))

        # gradient descent step
        theta -= self.config['lr'] * grad

        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_function
        }
        self.evaluate_metrics(evaluation_input)

    return theta
def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake8x8-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
def train(self, step_size=1e-2, time_limit=60, n_trajs=10000, verbose=False):
    '''Train for at most time_limit seconds w/ n_trajs non-expert trajs.

    Args:
      step_size -- `float`, size of each gradient ascent step
      time_limit -- `int`, number of seconds to train
      n_trajs -- `int`, number of non-expert trajs to be collected
      verbose -- `bool`, if true print gradient norms and reward weights

    Returns nothing.
    '''
    t0 = time.time()
    reward_coefficients = self.reward_function.parameters
    trajs = collect_trajs(self.env, self.baseline_agent, n_trajs,
                          self.horizon)

    # Estimate subgradient based on collected trajectories, then
    # update reward coefficients.
    if verbose:
        print('Starting subgradient ascent...')
    iteration_counter = 0
    while time.time() < t0 + time_limit:
        # replace the previous with the following line when using pdb
        # for _ in range(50):
        subgrads = self.subgradients(trajs, reward_coefficients)
        reward_coefficients += step_size * subgrads
        reward_coefficients /= np.linalg.norm(reward_coefficients)
        iteration_counter += 1
        if verbose and iteration_counter < 10:
            print('ITERATION ' + str(iteration_counter) + ' grad norm: ' +
                  str(np.linalg.norm(subgrads)))
            print('ITERATION ' + str(iteration_counter) +
                  ' reward coefficients: ' + str(reward_coefficients))
    if verbose:
        print('Final reward coefficients: ' + str(reward_coefficients))
    self.reward_function = FeatureBasedRewardFunction(
        self.env_rew, reward_coefficients)
    self.env_rew.update_reward_function(self.reward_function)
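# The loop above performs subgradient ascent followed by renormalization of
# the reward weights to unit L2 norm. A minimal, self-contained sketch of a
# single such step with toy values (the helper name is illustrative, not the
# library's API):

import numpy as np

def normalized_ascent_step(weights, subgradient, step_size=1e-2):
    """One subgradient ascent step, then rescale the weights to unit L2 norm."""
    weights = weights + step_size * subgradient
    return weights / np.linalg.norm(weights)

# toy usage:
w = np.array([0.5, -0.5, 1.0])
g = np.array([0.1, 0.0, -0.2])
w = normalized_ascent_step(w, g)
assert np.isclose(np.linalg.norm(w), 1.0)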
def reward_function_factory(env):
    return FeatureBasedRewardFunction(env, 'random')
# a one-hot encoding of the state space as features.
env = feature_wrapper.make('FrozenLake-v0')

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# You can comment out the previous block if expert data has already
# been generated, and load the trajectories from file by uncommenting
# the next 2 lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide random reward function as initial reward estimate.
# This probably isn't really required.
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run projection algorithm for up to 10 minutes.
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
appr_irl.train(
    time_limit=600,
    rl_time_per_iteration=45,
    eps=0,
    no_trajs=100,
    max_steps_per_episode=100,
    verbose=True)
def quick_run_alg(alg_class, config={}):
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    def one_hot(state):
        """One-hot encoding of a FrozenLake-v0 state as a feature vector."""
        feature_vector = np.zeros(16)
        feature_vector[state] = 1.
        return feature_vector

    # Two hard-coded example expert trajectories. The feature vectors are
    # one-hot encodings of the successor state of each transition.
    states_1 = [
        0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9, 8,
        8, 9, 10, 14, 15
    ]
    states_2 = [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15]

    expert_trajs = [{
        'states': states_1,
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,
            3, 3, 1, 0, 1
        ],
        'rewards': [
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            1.0
        ],
        'true_rewards': [],
        'features': [one_hot(state) for state in states_1[1:]]
    }, {
        'states': states_2,
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        'true_rewards': [],
        'features': [one_hot(state) for state in states_2[1:]]
    }]

    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    alg.train(2, 2, 2)
def train(self,
          time_limit=300,
          rl_time_per_iteration=30,
          eps=0,
          no_trajs=1000,
          max_steps_per_episode=1000,
          verbose=False):
    '''Accumulate feature counts and estimate reward function.

    Args:
      time_limit: total training time in seconds.
      rl_time_per_iteration: RL training time per step in seconds.
      eps: terminate if distance to expert feature counts is below eps.
      no_trajs: number of trajectories collected per iteration.
      max_steps_per_episode: maximum number of steps per episode.
      verbose: more verbose prints at runtime if true.

    Returns nothing.
    '''
    t0 = time.time()
    if verbose:
        alg_mode = 'projection' if self.proj else 'SVM'
        print('Running Apprenticeship IRL in mode: ' + alg_mode)

    # start with random agent:
    agent = RandomAgent(self.env)

    iteration_counter = 0
    while time.time() < t0 + time_limit:
        iteration_counter += 1
        if verbose:
            print('ITERATION ' + str(iteration_counter))
        trajs = collect_trajs(
            self.env,
            agent,
            no_episodes=no_trajs,
            max_steps_per_episode=max_steps_per_episode)
        if verbose:
            print('Average true reward per episode: ' +
                  str(true_reward_per_traj(trajs)))

        current_feature_count = self.feature_count(trajs)
        self.feature_counts.append(current_feature_count)
        self.labels.append(-1.0)

        feature_counts = np.array(self.feature_counts)
        labels = np.array(self.labels)

        if self.proj:
            # using projection version of the algorithm
            if iteration_counter == 1:
                feature_count_bar = feature_counts[1]
            else:
                line = feature_counts[-1] - feature_count_bar
                feature_count_bar += np.dot(
                    line, feature_counts[0] - feature_count_bar) / np.dot(
                        line, line) * line
            reward_coefficients = feature_counts[0] - feature_count_bar
            distance = np.linalg.norm(reward_coefficients)
        else:
            # using SVM version of the algorithm ("max-margin" in
            # the paper, not to be confused with max-margin planning)
            w = cvx.Variable(feature_counts.shape[1])
            b = cvx.Variable()
            objective = cvx.Minimize(cvx.norm(w, 2))
            constraints = [
                cvx.multiply(labels, (feature_counts * w + b)) >= 1
            ]
            problem = cvx.Problem(objective, constraints)
            problem.solve()

            if w.value is None:
                print('NO MORE SVM SOLUTION!!')
                return

            y_result = feature_counts.dot(w.value) + b.value
            support_vector_rows = np.where(np.isclose(np.abs(y_result), 1))[0]

            reward_coefficients = w.value
            distance = 2 / problem.value

            if verbose:
                print('The support vectors are from iterations number ' +
                      str(support_vector_rows))
        if verbose:
            print('Reward coefficients: ' + str(reward_coefficients))
            print('Distance: ' + str(distance))

        self.distances.append(distance)

        self.reward_function = FeatureBasedRewardFunction(
            self.env, reward_coefficients)
        self.env.update_reward_function(self.reward_function)

        if distance <= eps:
            if verbose:
                print("Feature counts matched within " + str(eps) + ".")
            break

        if time.time() + rl_time_per_iteration >= t0 + time_limit:
            break

        agent = self.rl_alg_factory(self.env)
        agent.train(rl_time_per_iteration)
def train(self, feat_map, time_limit=300, rl_time_per_iteration=15,
          verbose=False):
    """Maximum Entropy Inverse Reinforcement Learning (MaxEnt IRL).

    Args:
      feat_map: NxD matrix - the features for each of the N states
      time_limit: total training time in seconds
      rl_time_per_iteration: RL training time per iteration in seconds
      verbose: print progress information if true

    Returns:
      theta: D-dimensional vector of recovered reward parameters
    """
    t0 = time.time()

    # init parameters
    theta = np.random.uniform(size=(feat_map.shape[1], ))

    # calc expert feature expectations
    feat_exp = np.zeros([feat_map.shape[1]])
    for episode in self.expert_trajs:
        for state in episode['states']:
            feat_exp += feat_map[state]
    feat_exp = feat_exp / len(self.expert_trajs)

    agent = self.rl_alg_factory(self.env)

    # training
    iteration_counter = 0
    while time.time() < t0 + time_limit:
        iteration_counter += 1
        if verbose:
            print('iteration: {}'.format(iteration_counter))
        reward_function_estimate = FeatureBasedRewardFunction(
            self.env, theta)
        self.env.update_reward_function(reward_function_estimate)

        # compute policy
        agent.train(time_limit=rl_time_per_iteration)
        policy = agent.pi

        # compute state visitation frequencies
        svf = self.expected_svf(policy)

        # negative log-likelihood gradient
        grad = -(feat_exp - feat_map.T.dot(svf))

        # update params (gradient descent on the negative log-likelihood)
        theta -= self.lr * grad

    self.reward_function = reward_function_estimate
    return theta
def reward_function_factory(env):
    return FeatureBasedRewardFunction(env, true_rews[:-1])
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int):
    """Train algorithm. See abstract base class for parameter types."""
    sa_visit_count, P0 = self.sa_visitations()
    # mean_s_visit_count = np.sum(sa_visit_count, 1) / len(self.expert_trajs)

    # calculate feature expectations
    expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)
    # mean_feature_count = np.dot(self.feat_map.T, expert_feature_count)

    # initialize the parameters
    # theta = np.random.rand(self.feat_map.shape[1])
    reward_function = FeatureBasedRewardFunction(self.env, 'random')
    theta = reward_function.parameters

    agent = self.rl_alg_factory(self.env)

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1
        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))

        reward_function_estimate = FeatureBasedRewardFunction(
            self.env, theta)
        self.env.update_reward_function(reward_function_estimate)

        # compute policy
        agent.train(no_rl_episodes_per_irl_iteration)
        policy = agent.policy_array()
        state_values = agent.state_values
        q_values = agent.q_values

        # log-likelihood of the expert data under the current policy:
        # l = np.sum(sa_visit_count * (q_values - state_values.T))
        # check: broadcasting works as intended or not

        # occupancy measure, discard absorbing state
        d = self.occupancy_measure(policy=policy, P0=P0)[:-1]

        # negative log-likelihood gradient
        grad = -(expert_feature_count - np.dot(self.feat_map.T, d))

        # gradient descent step
        theta -= self.config['lr'] * grad

        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_function_estimate
        }
        self.evaluate_metrics(evaluation_input)

    return theta
import gym
import numpy as np

from irl_benchmark.irl.algorithms.maxent.me_irl import MaxEnt
from irl_benchmark.irl.collect import collect_trajs
from irl_benchmark.irl.feature.feature_wrapper import FrozenLakeFeatureWrapper
from irl_benchmark.irl.reward.reward_function import FeatureBasedRewardFunction
from irl_benchmark.irl.reward.reward_wrapper import RewardWrapper
from irl_benchmark.rl.algorithms.value_iteration import ValueIteration
from irl_benchmark.utils.utils import get_transition_matrix

store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 1000

env = gym.make('FrozenLake8x8-v0')
env = FrozenLakeFeatureWrapper(env)
initial_reward_function_estimate = FeatureBasedRewardFunction(
    env=env, parameters=np.zeros(64))
env = RewardWrapper(env=env, reward_function=initial_reward_function_estimate)

# Generate expert trajectories.
expert_agent = ValueIteration(env)
print('Training expert agent...')
expert_agent.train(30)
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

feat_map = np.eye(64)
transition_dynamics = get_transition_matrix(env)


def rl_alg_factory(env):
def reward_function_factory(env):
    return FeatureBasedRewardFunction(env, np.zeros(16))
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int
          ) -> Tuple[BaseRewardFunction, BaseRLAlgorithm]:
    """Train the apprenticeship learning IRL algorithm.

    Parameters
    ----------
    no_irl_iterations: int
        The number of iterations the algorithm should be run for.
    no_rl_episodes_per_irl_iteration: int
        The number of episodes the RL algorithm is allowed to run in
        each iteration of the IRL algorithm.
    no_irl_episodes_per_irl_iteration: int
        The number of episodes permitted to be run in each iteration
        to update the current reward estimate (e.g. to estimate state
        frequencies of the currently optimal policy).

    Returns
    -------
    Tuple[BaseRewardFunction, BaseRLAlgorithm]
        The estimated reward function and an RL agent trained for this
        estimate.
    """
    # Initialize training with a random agent.
    agent = RandomAgent(self.env)

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1

        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))

        # Estimate feature count of current agent.
        trajs = collect_trajs(
            self.env,
            agent,
            no_trajectories=no_irl_episodes_per_irl_iteration)
        current_feature_count = self.feature_count(
            trajs, gamma=self.config['gamma'])

        # add new feature count to list of feature counts
        self.feature_counts.append(current_feature_count)
        # for SVM mode:
        self.labels.append(-1.)

        # convert to numpy arrays:
        feature_counts = np.array(self.feature_counts)
        labels = np.array(self.labels)

        # update reward coefficients based on mode specified in config:
        if self.config['mode'] == 'projection':
            # projection mode:
            if irl_iteration_counter == 1:
                # initialize feature_count_bar in first iteration,
                # set to first non-expert feature count:
                feature_count_bar = feature_counts[1]
            else:
                # not the first iteration:
                # calculate line through last feature_count_bar and
                # last non-expert feature count:
                line = feature_counts[-1] - feature_count_bar
                # new feature_count_bar is orthogonal projection of
                # expert's feature count onto the line:
                feature_count_bar += np.dot(
                    line, feature_counts[0] - feature_count_bar) / np.dot(
                        line, line) * line
            reward_coefficients = feature_counts[0] - feature_count_bar
            # compute distance as L2 norm of reward coefficients
            # (t^(i) in the paper):
            distance = np.linalg.norm(reward_coefficients, ord=2)

        elif self.config['mode'] == 'svm':
            # svm mode:
            # create quadratic programming problem definition:
            weights = cvx.Variable(feature_counts.shape[1])
            bias = cvx.Variable()
            objective = cvx.Minimize(cvx.norm(weights, 2))
            constraints = [
                cvx.multiply(labels, (feature_counts * weights + bias)) >= 1
            ]
            problem = cvx.Problem(objective, constraints)
            # solve quadratic program:
            problem.solve()

            if weights.value is None:
                # TODO: we need to handle empty solution better.
                raise RuntimeError(
                    'Empty solution set for linearly separable SVM.')

            if self.config['verbose']:
                # print support vectors
                # (which past iterations were relevant for the current result?)
                svm_classifications = feature_counts.dot(
                    weights.value) + bias.value
                support_vectors = np.where(
                    np.isclose(np.abs(svm_classifications), 1))[0]
                print('The support vectors are from iterations number ' +
                      str(support_vectors))

            reward_coefficients = weights.value
            distance = 2 / problem.value

        else:
            raise NotImplementedError()

        if self.config['verbose']:
            print('Distance: ' + str(distance))

        self.distances.append(distance)

        # create new reward function with current coefficient estimate
        reward_function = FeatureBasedRewardFunction(self.env,
                                                     reward_coefficients)

        # update reward function
        assert isinstance(self.env, RewardWrapper)
        self.env.update_reward_function(reward_function)

        # TODO: see messages with max about order of training & deducing

        # check stopping criterion:
        if distance <= self.config['epsilon']:
            if self.config['verbose']:
                print("Feature counts matched within " +
                      str(self.config['epsilon']) + ".")
            break

        # create new RL agent
        agent = self.rl_alg_factory(self.env)
        # train agent (with new reward function)
        agent.train(no_rl_episodes_per_irl_iteration)

        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_function
        }
        self.evaluate_metrics(evaluation_input)

    return reward_function, agent
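# The projection branch above is the core of the projection variant of
# apprenticeship learning: the running estimate feature_count_bar is moved to
# the orthogonal projection of the expert feature count onto the line through
# the previous estimate and the newest policy's feature count. A minimal,
# self-contained numpy sketch of that single step with toy vectors (the helper
# name is illustrative, not part of the library):

import numpy as np

def project_feature_count_bar(feature_count_bar, expert_count, newest_count):
    """Project expert_count onto the line from feature_count_bar to newest_count."""
    line = newest_count - feature_count_bar
    step = np.dot(line, expert_count - feature_count_bar) / np.dot(line, line)
    return feature_count_bar + step * line

# toy usage:
expert = np.array([1.0, 1.0])
bar = np.array([0.0, 0.0])
newest = np.array([2.0, 0.0])
bar = project_feature_count_bar(bar, expert, newest)
print(bar, np.linalg.norm(expert - bar))  # new estimate and distance t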
def __init__(self,
             env,
             expert_trajs,
             rl_alg_factory,
             baseline_agent=None,
             gamma=.8,
             horizon=20,
             delta=.05,
             eps=None):
    '''Set environment, RL agent factory, expert trajectories, and parameters.

    Args:
      env -- wrapped environment; unwrap_env must get envs of the
        following types from it: gym.Env, FeatureWrapper, RewardWrapper
      expert_trajs -- `list` of expert trajectories
      rl_alg_factory -- function that takes an environment and returns
        an RL agent
      baseline_agent -- `RLAlgorithm`, used to get non-optimal
        trajectories. If None, a RandomAgent will be used.
      gamma -- `float`, discount factor; note that large values won't
        work well for environments like FrozenLake where discounting
        is the only incentive to quickly reach the goal state
      horizon -- `int`, fixed length of trajectories to be considered
      delta -- confidence that feature count difference between output
        policy and expert policy is less than 2 * epsilon
      eps -- `float` or None; if None, then epsilons will be calculated
        to guarantee matching expert feature counts within epsilon with
        confidence delta (via Hoeffding's inequality). But this
        requires the range of feature values.

    NOTE: Performance of the algorithm might depend on epsilons in a
    way I don't currently understand, as the epsilons occur in the
    expression used to approximate the relevant subgradient
    (ibid., p. 187, eq. 7).
    '''
    # Initialize base class and put remaining args into attributes.
    super(RelEnt, self).__init__(env, expert_trajs, rl_alg_factory)
    self.gamma = gamma
    self.horizon = horizon
    self.delta = delta

    # Compute remaining attributes.
    # Set gym.Env and FeatureWrapper envs as attributes.
    self.env_gym = unwrap_env(self.env, gym.Env)
    self.env_feat = unwrap_env(self.env, FeatureWrapper)
    self.env_rew = unwrap_env(self.env, RewardWrapper)
    if baseline_agent is not None:
        self.baseline_agent = baseline_agent
    else:
        self.baseline_agent = RandomAgent(self.env_gym)

    # Set expert trajs, and features.
    self.n_trajs = len(self.expert_trajs)
    self.n_features = self.env_feat.feature_shape()[0]
    assert isinstance(self.n_features, int)  # Should be dim of vector.

    # Initialize random reward function.
    self.reward_function = FeatureBasedRewardFunction(
        self.env_rew, np.random.randn(self.n_features))

    # Calculate expert feature counts.
    self.expert_feature_count = self.feature_count(self.expert_trajs)

    # Set tolerance epsilon (one per feature) for not matching
    # expert feature counts.
    self.epsilons = np.zeros(self.n_features)
    if eps is not None:
        self.epsilons = eps
    else:
        # Calculate epsilons via Hoeffding (ibid., p. 184).
        max_features = self.env_feat.feature_range()[1]
        min_features = self.env_feat.feature_range()[0]
        self.epsilons = max_features - min_features
        scale = np.sqrt(-np.log(1 - self.delta) / (2 * self.n_trajs))
        scale *= (self.gamma**(self.horizon + 1) - 1) / (self.gamma - 1)
        self.epsilons *= scale
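# The epsilon computation at the end of the constructor follows Hoeffding's
# inequality: the per-feature range is scaled by a confidence term in delta and
# n_trajs and by the discounted-horizon factor (gamma^(H+1) - 1) / (gamma - 1).
# A small self-contained sketch with toy numbers (the function name is
# illustrative, not the library's API):

import numpy as np

def hoeffding_epsilons(feature_range, delta, n_trajs, gamma, horizon):
    """Per-feature tolerance for matching expert feature counts (sketch)."""
    scale = np.sqrt(-np.log(1 - delta) / (2 * n_trajs))
    scale *= (gamma**(horizon + 1) - 1) / (gamma - 1)
    return feature_range * scale

# toy usage:
eps = hoeffding_epsilons(
    feature_range=np.ones(3), delta=0.05, n_trajs=100, gamma=0.8, horizon=20)
print(eps)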
def reward_function_factory(env):
    params = np.zeros(64)
    params[-1] = 1.
    return FeatureBasedRewardFunction(env, params)
def test_value_iteration():
    # gamma = 1.0
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.93 and state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be the state just before the frisbee
    # (15 is the final state, 16 is the absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.63 and state_values[14] < 0.65
    # holes and frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # check some q values:
    # go right in second to last state
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # check softmax policy
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # at least the initial state should now have a lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # ordering of probabilities should stay the same with softmax
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # check that the true reward isn't leaked:
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, np.zeros(16))
    env = RewardWrapper(env, reward_function)
    agent = ValueIteration(env, {})
    agent.train(10)
    assert np.sum(agent.state_values == 0)
import gym
import numpy as np

from irl_benchmark.irl.algorithms.maxent.me_irl import MaxEnt
from irl_benchmark.irl.collect import collect_trajs
from irl_benchmark.irl.feature.feature_wrapper import FrozenLakeFeatureWrapper
from irl_benchmark.irl.reward.reward_function import FeatureBasedRewardFunction
from irl_benchmark.irl.reward.reward_wrapper import RewardWrapper
from irl_benchmark.metrics.inverse_learning_error import ILE
from irl_benchmark.rl.algorithms.value_iteration import ValueIteration
from irl_benchmark.utils.utils import get_transition_matrix

store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 1000

env = gym.make('FrozenLake8x8-v0')
env = FrozenLakeFeatureWrapper(env)
initial_reward_function_estimate = FeatureBasedRewardFunction(
    env, np.zeros(64))
env = RewardWrapper(env, initial_reward_function_estimate)

# Generate expert trajectories.
expert_agent = ValueIteration(env)
print('Training expert agent...')
expert_agent.train(10)
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

feat_map = np.eye(64)
transition_dynamics = get_transition_matrix(env)


def rl_alg_factory(env):
def maze_world_0(env):
    parameters = np.array(
        [REWARD_MOVE, REWARD_SMALL, REWARD_MEDIUM, REWARD_LARGE])
    print('Create env for true reward function')
    return FeatureBasedRewardFunction(env, parameters, action_in_domain=True)
def test_random_featb_function():
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    rf = FeatureBasedRewardFunction(env, 'random')