def _kifdd_common(
    agent_class,
    domain,
    kernel_resolution,
    threshold=1.0,
    lambda_=0.3,
    initial_learn_rate=0.1,
    boyan_N0=100,
    kernel="gaussian",
):
    kernel_width = (
        domain.statespace_limits[:, 1] - domain.statespace_limits[:, 0]
    ) / kernel_resolution
    kifdd = KernelizediFDD(
        domain,
        sparsify=True,
        kernel=getattr(representations, kernel),
        kernel_args=[kernel_width],
        active_threshold=0.01,
        discover_threshold=threshold,
        normalization=True,
        max_active_base_feat=10,
        max_base_feat_sim=0.5,
    )
    return agent_class(
        eGreedy(kifdd, epsilon=0.1),
        kifdd,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
def _make_experiment(exp_id=1, path="./Results/Tmp/test_PST"): """ Each file specifying an experimental setup should contain a make_experiment function which returns an instance of the Experiment class with everything set up. @param id: number used to seed the random number generators @param path: output directory where logs and results are stored """ # Domain: NUM_UAV = 3 domain = PST(NUM_UAV=NUM_UAV) # Representation # discretization only needed for continuous state spaces, discarded otherwise representation = IncrementalTabular(domain) # Policy policy = eGreedy(representation, epsilon=0.1) # Agent agent = SARSA( representation=representation, policy=policy, discount_factor=domain.discount_factor, initial_learn_rate=0.1, ) checks_per_policy = 2 max_steps = 30 num_policy_checks = 2 experiment = Experiment(**locals()) return experiment
def _rbf_common(
    agent_class,
    domain,
    seed=1,
    num_rbfs=96,
    resolution=21,
    initial_learn_rate=0.1,
    lambda_=0.3,
    boyan_N0=100,
):
    rbf = RBF(
        domain,
        num_rbfs=num_rbfs,
        resolution_max=resolution,
        resolution_min=resolution,
        const_feature=False,
        normalize=True,
        seed=seed,
    )
    return agent_class(
        eGreedy(rbf, epsilon=0.1),
        rbf,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
def _solve_impl(self):
    """Solve the domain MDP."""
    self.bellman_updates = 0
    self.policy_improvement_iteration = 0
    self.start_time = clock()

    # Initialize the policy.
    # Copy the representation so that weight changes during evaluation
    # do not change the policy.
    policy = eGreedy(deepcopy(self.representation), epsilon=0, deterministic=True)

    # Start with policy_changes = True so the while loop runs at least once
    policy_changes = True
    while policy_changes and self.has_time():
        # Evaluate the policy
        if self.policy_evaluation(policy):
            self.logger.info("Converged!")
        # Improve the policy
        self.policy_improvement_iteration += 1
        policy, policy_changes = self.policy_improvement(policy)

    self.log_value()
def _ifddk_common(
    agent_class,
    domain,
    epsilon=0.1,
    discretization=20,
    threshold=1.0,
    lambda_=0.3,
    initial_learn_rate=0.1,
    boyan_N0=100,
):
    ifddk = iFDDK(
        domain,
        discovery_threshold=threshold,
        initial_representation=IndependentDiscretization(
            domain, discretization=discretization
        ),
        sparsify=True,
        useCache=True,
        lazy=True,
        lambda_=lambda_,
    )
    return agent_class(
        eGreedy(ifddk, epsilon=epsilon),
        ifddk,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
def tabular_q(
    domain,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    discretization=20,
    lambda_=0.3,
    initial_learn_rate=0.1,
    boyan_N0=100,
    incremental=False,
):
    if incremental:
        tabular = IncrementalTabular(domain, discretization=discretization)
    else:
        tabular = Tabular(domain, discretization=discretization)
    return Q_Learning(
        eGreedy(
            tabular,
            epsilon=epsilon,
            epsilon_decay=epsilon_decay,
            epsilon_min=epsilon_min,
        ),
        tabular,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
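# Illustrative sketch (the helper name is hypothetical): tabular_q only needs
# an already-instantiated domain. Passing incremental=True swaps the fixed-size
# Tabular table for IncrementalTabular, which adds table entries lazily as new
# states are first visited, at the cost of a hashed lookup.
def _tabular_q_example(domain):
    # Q-Learning with a decaying exploration rate and an incrementally built table
    return tabular_q(
        domain,
        epsilon=0.5,
        epsilon_decay=1e-3,
        epsilon_min=0.05,
        incremental=True,
    )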
def _fourier_common(
    agent_class,
    domain,
    order=3,
    scaling=False,
    initial_learn_rate=0.1,
    lambda_=0.3,
    boyan_N0=100,
):
    fourier = Fourier(domain, order=order, scaling=scaling)
    return agent_class(
        eGreedy(fourier, epsilon=0.1),
        fourier,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
def tabular_ucbvi(
    domain,
    seed,
    show_reward=False,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    vi_threshold=1e-6,
):
    tabular = Tabular(domain, discretization=20)
    policy = eGreedy(
        tabular,
        epsilon=epsilon,
        epsilon_decay=epsilon_decay,
        epsilon_min=epsilon_min,
    )
    return UCBVI(
        policy,
        tabular,
        domain.discount_factor,
        seed=seed,
        show_reward=show_reward,
    )
def tile_ggq(domain, res_mat, lambda_=0.3, initial_learn_rate=0.1, boyan_N0=100):
    tile = TileCoding(
        domain,
        memory=2000,
        num_tilings=[1] * res_mat.shape[0],
        resolution_matrix=res_mat,
        safety="none",
    )
    return GreedyGQ(
        eGreedy(tile, epsilon=0.1),
        tile,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        boyan_N0=boyan_N0,
    )
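# Hedged usage sketch (the helper name is hypothetical). The exact semantics of
# res_mat depend on TileCoding's resolution_matrix argument; the call above uses
# one tiling group per row, so the assumption here is rows = tiling groups and
# columns = per-dimension grid resolutions.
def _tile_ggq_example(domain):
    res_mat = np.array([[20, 20]])  # assumed: a single 20x20 tiling over a 2-D state space
    return tile_ggq(domain, res_mat, lambda_=0.3)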
def _make_experiment(domain, exp_id=1, path="./Results/Tmp/test_InfTrackCartPole"):
    ## Representation
    # discretization is only needed for continuous state spaces; it is ignored otherwise
    representation = Tabular(domain)

    ## Policy
    policy = eGreedy(representation, epsilon=0.2)

    ## Agent
    agent = SARSA(
        representation=representation,
        policy=policy,
        discount_factor=domain.discount_factor,
        initial_learn_rate=0.1,
    )
    checks_per_policy = 3
    max_steps = 50
    num_policy_checks = 3
    experiment = Experiment(**locals())
    return experiment
def test_qlearn_valfun_chain():
    """
    Check that Q-Learning computes the value function of a simple Markov chain
    correctly. This only tests value function estimation; only one action is
    possible.
    """
    rep = MockRepresentation()
    pol = eGreedy(rep)
    agent = Q_Learning(pol, rep, 0.9, lambda_=0.0)
    for i in range(1000):
        if i % 4 == 3:
            continue
        agent.learn(
            np.array([i % 4]),
            [0],
            0,
            1.0,
            np.array([(i + 1) % 4]),
            [0],
            0,
            (i + 2) % 4 == 0,
        )
    V_true = np.array([2.71, 1.9, 1, 0])
    np.testing.assert_allclose(rep.weight_vec, V_true)
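# Why V_true = [2.71, 1.9, 1, 0]: every transition in the chain pays reward 1.0,
# the discount factor is 0.9, and the step out of state 2 is terminal, so
#   V(3) = 0
#   V(2) = 1 + 0.9 * V(3) = 1
#   V(1) = 1 + 0.9 * V(2) = 1.9
#   V(0) = 1 + 0.9 * V(1) = 2.71
# The same chain and target values are used by test_ggq_valfun_chain below.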
def tabular_mbie_eb(
    domain,
    seed,
    show_reward=False,
    beta=0.1,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    vi_threshold=1e-6,
):
    tabular = Tabular(domain, discretization=20)
    policy = eGreedy(
        tabular,
        epsilon=epsilon,
        epsilon_decay=epsilon_decay,
        epsilon_min=epsilon_min,
    )
    return MBIE_EB(
        policy,
        tabular,
        domain.discount_factor,
        beta=beta,
        seed=seed,
        show_reward=show_reward,
    )
def tabular_opt_psrl(
    domain,
    seed,
    show_reward=False,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    n_samples=10,
    vi_threshold=1e-6,
):
    tabular = Tabular(domain, discretization=20)
    policy = eGreedy(
        tabular,
        epsilon=epsilon,
        epsilon_decay=epsilon_decay,
        epsilon_min=epsilon_min,
    )
    return OptimisticPSRL(
        policy,
        tabular,
        domain.discount_factor,
        seed=seed,
        show_reward=show_reward,
        n_samples=n_samples,
    )
def test_ggq_valfun_chain():
    """
    Check that Greedy-GQ computes the value function of a simple Markov chain
    correctly. This only tests value function estimation; only one action is
    possible.
    """
    rep = MockRepresentation()
    pol = eGreedy(rep)
    agent = GreedyGQ(pol, rep, lambda_=0.0, discount_factor=0.9)
    for i in range(1000):
        if i % 4 == 3:
            agent.episode_terminated()
            continue
        agent.learn(
            np.array([i % 4]),
            [0],
            0,
            1.0,
            np.array([(i + 1) % 4]),
            [0],
            0,
            (i + 2) % 4 == 0,
        )
    V_true = np.array([2.71, 1.9, 1, 0])
    np.testing.assert_allclose(rep.weight_vec, V_true)
def solve_in_matrix_format(self):
    # while delta_weight_vec > threshold:
    #     1. Gather data following an e-greedy policy
    #     2. Calculate A and b estimates
    #     3. Calculate new_weight_vec and delta_weight_vec
    # return the greedy policy w.r.t. the last weight_vec
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et al. FTML 2012 paper.
    self.samples_num = 1000

    self.start_time = clock()  # Used to track the total time for solving
    samples = 0
    converged = False
    iteration = 0
    while self.has_time() and not converged:
        # 1. Gather samples following an e-greedy policy
        S, Actions, NS, R, T = self.collect_samples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.num_actions
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor
        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        for i in range(self.samples_num):
            phi_s_a = self.representation.phi_sa(
                S[i], T[i], Actions[i, 0]
            ).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples
            ).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d.T)
            self.b += phi_s_a * R[i, 0]

        # 3. Calculate new_weight_vec and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
        iteration += 1
        if solve_time > 1:
            self.logger.info(
                "#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)"
                % (iteration, solve_time)
            )
        weight_diff = l_norm(new_weight_vec - self.representation.weight_vec)
        converged = weight_diff < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec

        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                samples,
                weight_diff,
                perf_return,
            )
        )
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["samples"].append(samples)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
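# Note on step 2/3 above (restating what the loop computes, nothing new is assumed):
# the sample loop accumulates the linear system
#     A ~= sum_i phi(s_i, a_i) * (phi(s_i, a_i) - gamma * E[phi(s'_i, a'_i)])^T
#     b ~= sum_i phi(s_i, a_i) * r_i
# and new_weight_vec solves the regularized system A * w = b, i.e. the standard
# least-squares fixed-point estimate used for policy evaluation in LSPI-style solvers.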
def test_deepcopy():
    rep = MockRepresentation()
    pol = eGreedy(rep)
    agent = SARSA(pol, rep, 0.9, lambda_=0.0)
    copied_agent = copy.deepcopy(agent)
    assert agent.lambda_ == copied_agent.lambda_
def tabular_sarsa(domain, discretization=20, lambda_=0.3):
    tabular = Tabular(domain, discretization=discretization)
    policy = eGreedy(tabular, epsilon=0.1)
    return SARSA(policy, tabular, domain.discount_factor, lambda_=lambda_)
def _solve_impl(self):
    """Solve the domain MDP."""
    self.start_time = clock()  # Used to track the total time for solving
    self.bellman_updates = 0
    converged = False
    PI_iteration = 0

    # The policy is maintained as a separate copy of the representation.
    # This way, as the representation is updated, the policy remains intact.
    policy = eGreedy(deepcopy(self.representation), epsilon=0, deterministic=True)
    a_num = self.domain.num_actions

    while self.has_time() and not converged:
        # Policy evaluation: update the representation of the value function
        self.traj_based_policy_evaluation(policy)
        PI_iteration += 1

        # Theta can grow if the representation is expanded,
        # hence pad the policy's weight vector with zeros.
        additional_dim = (
            self.representation.features_num
            - policy.representation.features_num
        )
        padded_theta = np.hstack(
            (policy.representation.weight, np.zeros((a_num, additional_dim)))
        )

        # Calculate the change in the weight_vec as an L2-norm
        weight_diff = np.linalg.norm(padded_theta - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # Update the underlying value function of the policy
        policy.representation = deepcopy(self.representation)

        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
            "Return=%0.3f, steps=%d, features=%d"
            % (
                PI_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                weight_diff,
                perf_return,
                perf_steps,
                self.representation.features_num,
            )
        )
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(self.bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["policy_improvement_iteration"].append(PI_iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
def tabular_lspi(domain, max_steps, discretization=20):
    tabular = Tabular(domain, discretization=discretization)
    policy = eGreedy(tabular, epsilon=0.1)
    return LSPI(policy, tabular, domain.discount_factor, max_steps, 1000)