def generate_traj_if_not_exists(self, evaluation_input: dict):
    assert 'irl_agent' in evaluation_input.keys()
    if 'irl_trajs' not in evaluation_input:
        print('generating new trajs for metrics')
        evaluation_input['irl_trajs'] = collect_trajs(
            self.env, evaluation_input['irl_agent'], 100)
    else:
        print('reuse generated trajs for metric')
    return evaluation_input['irl_trajs']
def test_tabular_function():
    def reward_function_factory(env):
        params = np.zeros(64)
        params[-1] = 1.
        return TabularRewardFunction(env, params)

    env = make_wrapped_env('FrozenLake8x8-v0',
                           reward_function_factory=reward_function_factory,
                           with_model_wrapper=True)
    agent = ValueIteration(env)
    agent.train(1)
    trajs = collect_trajs(env, agent, 10)
    for traj in trajs:
        for i in range(len(traj['rewards'])):
            assert np.isclose(traj['rewards'][i], traj['true_rewards'][i])
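# For reference, the assertions above rely on collect_trajs returning a list
# of trajectory dicts with parallel 'rewards' and 'true_rewards' lists
# (reward seen through the wrapper vs. reward of the unwrapped env). The
# sketch below only illustrates that assumed layout; keys other than
# 'rewards' and 'true_rewards' are guesses, not confirmed by the test above.

example_traj = {
    'states': [0, 1, 2],              # hypothetical: visited states
    'actions': [1, 1, 0],             # hypothetical: actions taken
    'rewards': [0.0, 0.0, 1.0],       # rewards seen through the reward wrapper
    'true_rewards': [0.0, 0.0, 1.0],  # rewards of the underlying environment
}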
def train(self, step_size=1e-2, time_limit=60, n_trajs=10000,
          verbose=False):
    '''Train for at most time_limit seconds with n_trajs non-expert trajs.

    Args:
      step_size -- `float`, size of each gradient ascent step
      time_limit -- `int`, number of seconds to train
      n_trajs -- `int`, number of non-expert trajs to be collected
      verbose -- `bool`, if true print gradient norms and reward weights

    Returns nothing.
    '''
    t0 = time.time()

    reward_coefficients = self.reward_function.parameters
    trajs = collect_trajs(self.env, self.baseline_agent, n_trajs,
                          self.horizon)

    # Estimate subgradient based on collected trajectories, then
    # update reward coefficients.
    if verbose:
        print('Starting subgradient ascent...')
    iteration_counter = 0
    while time.time() < t0 + time_limit:
        # When debugging with pdb, replace the while condition above with:
        # for _ in range(50):
        subgrads = self.subgradients(trajs, reward_coefficients)
        reward_coefficients += step_size * subgrads
        reward_coefficients /= np.linalg.norm(reward_coefficients)
        iteration_counter += 1
        if verbose and iteration_counter < 10:
            print('ITERATION ' + str(iteration_counter) + ' grad norm: ' +
                  str(np.linalg.norm(subgrads)))
            print('ITERATION ' + str(iteration_counter) +
                  ' reward coefficients: ' + str(reward_coefficients))
    if verbose:
        print('Final reward coefficients: ' + str(reward_coefficients))
    self.reward_function = FeatureBasedRewardFunction(
        self.env_rew, reward_coefficients)
    self.env_rew.update_reward_function(self.reward_function)
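# The loop above is normalized subgradient ascent: take a step along the
# subgradient, then rescale the coefficients to unit L2 norm. A minimal,
# self-contained sketch of a single update follows; `subgradient_fn` is a
# hypothetical stand-in for self.subgradients and is not part of the
# original code.

import numpy as np


def subgradient_ascent_step(coefficients, subgradient_fn, step_size=1e-2):
    """One ascent step followed by projection onto the unit sphere."""
    subgrad = subgradient_fn(coefficients)
    coefficients = coefficients + step_size * subgrad
    # keep the reward scale fixed by normalizing to unit length
    return coefficients / np.linalg.norm(coefficients)


# Example usage with a dummy subgradient:
# theta = np.random.normal(size=16)
# theta = subgradient_ascent_step(theta, lambda w: -w + 1.0)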
def generate_traj_if_not_exists(self, evaluation_input: dict):
    """Generate trajectories and store them in the evaluation input.

    If the evaluation input already contains trajectories, reuse them.

    Parameters
    ----------
    evaluation_input: dict
        Dictionary shared between metrics; must contain the key 'irl_agent'.

    Returns
    -------
    list
        The trajectories stored under evaluation_input['irl_trajs'].
    """
    assert 'irl_agent' in evaluation_input.keys()
    if 'irl_trajs' not in evaluation_input:
        print('generating new trajs for metrics')
        evaluation_input['irl_trajs'] = collect_trajs(
            self.env, evaluation_input['irl_agent'], self.no_trajs)
    else:
        print('reuse generated trajs for metric')
    return evaluation_input['irl_trajs']
    in the IRL loop.'''
    return TabularQ(env)


# Apprenticeship IRL assumes that rewards are linear in features.
# However, FrozenLake doesn't provide features. It is sufficiently small
# to work with tabular methods. Therefore, we just use a wrapper that uses
# a one-hot encoding of the state space as features.
env = feature_wrapper.make('FrozenLake-v0')

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# You can comment out the previous block if expert data has already been
# generated, and load the trajectories from file by uncommenting the
# next 2 lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide a random reward function as initial reward estimate.
# This probably isn't really required.
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run the projection algorithm for up to 10 minutes.
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
appr_irl.train(time_limit=600,
def train(self, time_limit=300, rl_time_per_iteration=30, eps=0,
          no_trajs=1000, max_steps_per_episode=1000, verbose=False):
    '''Accumulate feature counts and estimate reward function.

    Args:
      time_limit: total training time in seconds.
      rl_time_per_iteration: RL training time per iteration in seconds.
      eps: terminate if distance to expert feature counts is below eps.
      no_trajs: number of trajectories to collect per iteration.
      max_steps_per_episode: maximum number of steps per episode.
      verbose: more verbose prints at runtime if true.

    Returns nothing.
    '''
    t0 = time.time()

    if verbose:
        alg_mode = 'projection' if self.proj else 'SVM'
        print('Running Apprenticeship IRL in mode: ' + alg_mode)

    # Start with a random agent:
    agent = RandomAgent(self.env)

    iteration_counter = 0
    while time.time() < t0 + time_limit:
        iteration_counter += 1
        if verbose:
            print('ITERATION ' + str(iteration_counter))
        trajs = collect_trajs(self.env, agent,
                              no_episodes=no_trajs,
                              max_steps_per_episode=max_steps_per_episode)
        if verbose:
            print('Average true reward per episode: ' +
                  str(true_reward_per_traj(trajs)))

        current_feature_count = self.feature_count(trajs)
        self.feature_counts.append(current_feature_count)
        self.labels.append(-1.0)

        feature_counts = np.array(self.feature_counts)
        labels = np.array(self.labels)

        if self.proj:
            # Using the projection version of the algorithm.
            if iteration_counter == 1:
                feature_count_bar = feature_counts[1]
            else:
                line = feature_counts[-1] - feature_count_bar
                feature_count_bar += np.dot(
                    line, feature_counts[0] - feature_count_bar) / np.dot(
                        line, line) * line
            reward_coefficients = feature_counts[0] - feature_count_bar
            distance = np.linalg.norm(reward_coefficients)

        else:
            # Using the SVM version of the algorithm ("max-margin" in
            # the paper, not to be confused with max-margin planning).
            w = cvx.Variable(feature_counts.shape[1])
            b = cvx.Variable()

            objective = cvx.Minimize(cvx.norm(w, 2))
            constraints = [
                cvx.multiply(labels, (feature_counts * w + b)) >= 1
            ]

            problem = cvx.Problem(objective, constraints)
            problem.solve()

            if w.value is None:
                print('NO MORE SVM SOLUTION!!')
                return

            svm_classifications = feature_counts.dot(w.value) + b.value
            support_vector_rows = np.where(
                np.isclose(np.abs(svm_classifications), 1))[0]

            reward_coefficients = w.value
            distance = 2 / problem.value

            if verbose:
                print('The support vectors are from iterations number ' +
                      str(support_vector_rows))

        if verbose:
            print('Reward coefficients: ' + str(reward_coefficients))
            print('Distance: ' + str(distance))

        self.distances.append(distance)

        self.reward_function = FeatureBasedRewardFunction(
            self.env, reward_coefficients)
        self.env.update_reward_function(self.reward_function)

        if distance <= eps:
            if verbose:
                print('Feature counts matched within ' + str(eps) + '.')
            break

        if time.time() + rl_time_per_iteration >= t0 + time_limit:
            break

        agent = self.rl_alg_factory(self.env)
        agent.train(rl_time_per_iteration)
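# A self-contained toy version of the max-margin QP solved in the SVM branch
# above, using three made-up 2-D feature counts: row 0 plays the role of the
# expert (label +1), the remaining rows are non-expert feature counts
# (label -1). This is only an illustrative sketch of the cvxpy call pattern,
# not part of the library.

import cvxpy as cvx
import numpy as np

feature_counts = np.array([[0.9, 0.8],    # expert feature count
                           [0.2, 0.1],    # non-expert, iteration 1
                           [0.4, 0.3]])   # non-expert, iteration 2
labels = np.array([1.0, -1.0, -1.0])

w = cvx.Variable(feature_counts.shape[1])
b = cvx.Variable()
objective = cvx.Minimize(cvx.norm(w, 2))
constraints = [cvx.multiply(labels, feature_counts @ w + b) >= 1]
problem = cvx.Problem(objective, constraints)
problem.solve()

print('reward coefficients:', w.value)
print('margin-based distance:', 2 / problem.value)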
# Run this script to generate all expert data.

# FROZEN LAKE:
env = feature_wrapper.make('FrozenLake-v0')


def rl_alg_factory(env):
    return ValueIteration(env, {'gamma': 0.9})


expert_agent = rl_alg_factory(env)
expert_agent.train(None)
expert_trajs = collect_trajs(env, expert_agent, 10000, None,
                             'data/frozen/expert/', verbose=True)

# FROZEN LAKE 8x8:
env = feature_wrapper.make('FrozenLake8x8-v0')


def rl_alg_factory(env):
    return ValueIteration(env, {'gamma': 0.9})


expert_agent = rl_alg_factory(env)
expert_agent.train(None)
expert_trajs = collect_trajs(env, expert_agent,
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int
          ) -> Tuple[BaseRewardFunction, BaseRLAlgorithm]:
    """Train the apprenticeship learning IRL algorithm.

    Parameters
    ----------
    no_irl_iterations: int
        The number of iterations the algorithm should be run for.
    no_rl_episodes_per_irl_iteration: int
        The number of episodes the RL algorithm is allowed to run in
        each iteration of the IRL algorithm.
    no_irl_episodes_per_irl_iteration: int
        The number of episodes permitted to be run in each iteration
        to update the current reward estimate (e.g. to estimate state
        frequencies of the currently optimal policy).

    Returns
    -------
    Tuple[BaseRewardFunction, BaseRLAlgorithm]
        The estimated reward function and an RL agent trained for this
        estimate.
    """
    # Initialize training with a random agent.
    agent = RandomAgent(self.env)

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1

        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))

        # Estimate feature count of current agent.
        trajs = collect_trajs(
            self.env, agent,
            no_trajectories=no_irl_episodes_per_irl_iteration)
        current_feature_count = self.feature_count(
            trajs, gamma=self.config['gamma'])

        # Add the new feature count to the list of feature counts.
        self.feature_counts.append(current_feature_count)
        # For SVM mode:
        self.labels.append(-1.)

        # Convert to numpy arrays:
        feature_counts = np.array(self.feature_counts)
        labels = np.array(self.labels)

        # Update reward coefficients based on the mode specified in the config:
        if self.config['mode'] == 'projection':
            # Projection mode:
            if irl_iteration_counter == 1:
                # Initialize feature_count_bar in the first iteration;
                # set it to the first non-expert feature count:
                feature_count_bar = feature_counts[1]
            else:
                # Not the first iteration.
                # Calculate the line through the last feature_count_bar
                # and the last non-expert feature count:
                line = feature_counts[-1] - feature_count_bar
                # The new feature_count_bar is the orthogonal projection
                # of the expert's feature count onto that line:
                feature_count_bar += np.dot(
                    line, feature_counts[0] - feature_count_bar) / np.dot(
                        line, line) * line
            reward_coefficients = feature_counts[0] - feature_count_bar
            # Compute the distance as the L2 norm of the reward
            # coefficients (t^(i) in the paper):
            distance = np.linalg.norm(reward_coefficients, ord=2)

        elif self.config['mode'] == 'svm':
            # SVM mode:
            # Create the quadratic programming problem definition:
            weights = cvx.Variable(feature_counts.shape[1])
            bias = cvx.Variable()
            objective = cvx.Minimize(cvx.norm(weights, 2))
            constraints = [
                cvx.multiply(labels, (feature_counts * weights + bias)) >= 1
            ]
            problem = cvx.Problem(objective, constraints)
            # Solve the quadratic program:
            problem.solve()

            if weights.value is None:
                # TODO: we need to handle empty solution better.
                raise RuntimeError(
                    'Empty solution set for linearly separable SVM.')

            if self.config['verbose']:
                # Print the support vectors
                # (which past iterations were relevant for the current result?):
                svm_classifications = feature_counts.dot(
                    weights.value) + bias.value
                support_vectors = np.where(
                    np.isclose(np.abs(svm_classifications), 1))[0]
                print('The support vectors are from iterations number ' +
                      str(support_vectors))

            reward_coefficients = weights.value
            distance = 2 / problem.value

        else:
            raise NotImplementedError()

        if self.config['verbose']:
            print('Distance: ' + str(distance))

        self.distances.append(distance)

        # Create a new reward function with the current coefficient estimate.
        reward_function = FeatureBasedRewardFunction(self.env,
                                                     reward_coefficients)
        # Update the reward function.
        assert isinstance(self.env, RewardWrapper)
        self.env.update_reward_function(reward_function)

        # TODO: see messages with max about order of training & deducing

        # Check the stopping criterion:
        if distance <= self.config['epsilon']:
            if self.config['verbose']:
                print('Feature counts matched within ' +
                      str(self.config['epsilon']) + '.')
            break

        # Create a new RL agent.
        agent = self.rl_alg_factory(self.env)
        # Train the agent (with the new reward function).
        agent.train(no_rl_episodes_per_irl_iteration)

        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_function
        }
        self.evaluate_metrics(evaluation_input)

    return reward_function, agent
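# A standalone numpy sketch of the projection update used in 'projection'
# mode above: feature_counts[0] is the expert's feature count, the remaining
# rows are the non-expert feature counts collected so far. It mirrors the
# loop body under that assumed layout and is illustrative only.

import numpy as np


def projection_update(feature_counts, feature_count_bar=None):
    """Return (new feature_count_bar, reward coefficients, distance)."""
    if feature_count_bar is None:
        # first IRL iteration: start at the first non-expert feature count
        feature_count_bar = np.array(feature_counts[1], dtype=float)
    else:
        # project the expert's feature count onto the line through the old
        # feature_count_bar and the newest non-expert feature count
        line = feature_counts[-1] - feature_count_bar
        feature_count_bar = feature_count_bar + np.dot(
            line, feature_counts[0] - feature_count_bar) / np.dot(
                line, line) * line
    reward_coefficients = feature_counts[0] - feature_count_bar
    distance = np.linalg.norm(reward_coefficients, ord=2)
    return feature_count_bar, reward_coefficients, distance


# Example usage: expert at row 0, then two non-expert feature counts.
# mu = np.array([[0.9, 0.8], [0.2, 0.1], [0.4, 0.3]])
# bar, coeffs, t = projection_update(mu[:2])
# bar, coeffs, t = projection_update(mu, bar)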
    return TabularQ(env)


# RelEnt IRL assumes that rewards are linear in features.
# However, FrozenLake doesn't provide features. It is sufficiently small
# to work with tabular methods. Therefore, we just use a wrapper that uses
# a one-hot encoding of the state space as features.
env = gym.make('FrozenLake-v0')
env = FrozenLakeFeatureWrapper(env)

# Generate expert trajectories.
expert_agent = rl_alg_factory(env, lp=True)
print('Training expert agent...')
expert_agent.train(600)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)
expert_performance = avg_undiscounted_return(expert_trajs)
print('The expert reached the goal in a fraction ' +
      str(expert_performance) + ' of trajectories.')

# You can comment out the previous block if expert data has already been
# generated, and load the trajectories from file by uncommenting the
# next 2 lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide a random reward function as initial reward estimate.
# This probably isn't really required.
n_features = unwrap_env(env, FeatureWrapper).feature_shape()[0]
reward_function = FeatureBasedRewardFunction(env,
                                             np.random.normal(size=n_features))
# TODO: this is an example for Sayan, delete later.
store_to = 'data/frozen/expert/'
no_episodes = 500


def rl_alg_factory(env):
    return ValueIteration(env, {'gamma': 0.9})


env = feature_wrapper.make('FrozenLake-v0')
expert_agent = rl_alg_factory(env)
expert_agent.train(15)
expert_trajs = collect_trajs(env, expert_agent, no_episodes, None, store_to)

# Wrap the env in a random reward function to prevent leaking the true reward:
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run the projection algorithm for up to 50 IRL iterations.
irl_config = {'gamma': 0.9, 'verbose': True}
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, irl_config)
reward_function, rl_agent = appr_irl.train(
    no_irl_iterations=50,
    no_rl_episodes_per_irl_iteration=no_episodes,
    no_irl_episodes_per_irl_iteration=no_episodes)
print(reward_function.parameters)