def policyEvaluation(self, policy):
    '''
    Evaluate the given policy by applying the Bellman backup over all
    states until the change in the weight vector is less than the
    convergence threshold.

    Returns: convergence status as a boolean.
    '''
    converged = False
    policy_evaluation_iteration = 0
    while (not converged and
           self.hasTime() and
           policy_evaluation_iteration < self.max_PE_iterations):
        policy_evaluation_iteration += 1

        # Sweep the state space
        for i in range(self.representation.agg_states_num):

            # Check for remaining solver time
            if not self.hasTime():
                break

            # Map a state ID to a state
            s = self.representation.stateID2state(i)

            # Skip terminal states and states with no possible action
            possible_actions = self.domain.possibleActions(s=s)
            if self.domain.isTerminal(s) or len(possible_actions) == 0:
                continue

            # Apply the Bellman backup for the policy's action
            self.BellmanBackup(s,
                               policy.pi(s, False, possible_actions),
                               self.ns_samples,
                               policy)

            # Update the number of backups
            self.bellmanUpdates += 1

            # Periodically log performance
            if self.bellmanUpdates % self.log_interval == 0:
                performance_return = self.performanceRun()[0]
                self.logger.info(
                    '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                    (hhmmss(deltaT(self.start_time)),
                     self.bellmanUpdates,
                     performance_return))

        # Check for convergence: L_infinity norm of the difference between
        # the policy's weight vector and the representation's weight vector
        weight_vec_change = l_norm(
            policy.representation.weight_vec - self.representation.weight_vec,
            np.inf)
        converged = weight_vec_change < self.convergence_threshold

        # Log status
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
            (policy_evaluation_iteration,
             hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates,
             weight_vec_change))

        # Show plots
        if self.show:
            self.domain.showDomain(s=s, filename="policy")
            self.domain.showLearning(self.representation, filename="policy")

    return converged
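# --- Illustrative sketch (not part of the solver code above) ----------------
# The backup inside the sweep above is the Bellman expectation backup for a
# fixed policy. Below is a minimal, self-contained toy example of iterative
# policy evaluation on a hypothetical 3-state, 2-action MDP; P, R, and every
# name here are illustrative assumptions, not this class's API.
import numpy as np

P = {0: np.array([[0.9, 0.1, 0.0], [0.0, 0.9, 0.1], [0.0, 0.0, 1.0]]),
     1: np.array([[0.1, 0.9, 0.0], [0.1, 0.0, 0.9], [0.0, 0.0, 1.0]])}  # P[a][s, s']
R = {0: np.array([0.0, 0.0, 0.0]), 1: np.array([-1.0, 10.0, 0.0])}      # R[a][s]
policy = np.array([1, 1, 0])   # deterministic action for each state
discount, threshold = 0.9, 1e-6

V = np.zeros(3)
converged = False
while not converged:
    V_old = V.copy()
    for s in range(3):
        a = int(policy[s])
        # Bellman expectation backup: V(s) <- R(s,a) + gamma * sum_s' P(s'|s,a) V(s')
        V[s] = R[a][s] + discount * P[a][s].dot(V_old)
    # Same convergence test as above: L-infinity norm of the change
    converged = np.max(np.abs(V - V_old)) < threshold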
def solve(self):
    """Solve the domain MDP."""
    self.start_time = clock()  # Track the total solve time
    bellmanUpdates = 0         # Count of Bellman backups performed
    converged = False
    iteration = 0

    # Value Iteration requires a tabular representation
    if not self.IsTabularRepresentation():
        self.logger.error("Value Iteration works only with a tabular representation.")
        return 0

    no_of_states = self.representation.agg_states_num

    while self.hasTime() and not converged:
        iteration += 1

        # Store the weight vector for the convergence check
        prev_weight_vec = self.representation.weight_vec.copy()

        # Sweep the state space
        for i in range(no_of_states):
            s = self.representation.stateID2state(i)

            # Sweep through possible actions
            for a in self.domain.possibleActions(s):

                # Check for available planning time
                if not self.hasTime():
                    break

                self.BellmanBackup(s, a, ns_samples=self.ns_samples)
                bellmanUpdates += 1

                # Periodically log performance
                if bellmanUpdates % self.log_interval == 0:
                    performance_return, _, _, _ = self.performanceRun()
                    self.logger.info(
                        '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                        (hhmmss(deltaT(self.start_time)),
                         bellmanUpdates,
                         performance_return))

        # Check for convergence: L_infinity norm of the weight-vector change
        weight_vec_change = l_norm(
            prev_weight_vec - self.representation.weight_vec, np.inf)
        converged = weight_vec_change < self.convergence_threshold

        # Log the stats
        performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
        self.logger.info(
            'VI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.4f, Steps=%d' %
            (iteration,
             hhmmss(deltaT(self.start_time)),
             bellmanUpdates,
             weight_vec_change,
             performance_return,
             performance_steps))

        # Show the domain and value function
        if self.show:
            self.domain.show(a, s=s, representation=self.representation)

        # Store stats
        self.result["bellman_updates"].append(bellmanUpdates)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')
    super(ValueIteration, self).solve()
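# --- Illustrative sketch (not part of the solver code above) ----------------
# The core of solve() above is the Bellman optimality backup applied to every
# state until the largest change in the value/weight vector falls below the
# convergence threshold. Minimal tabular sketch on the same kind of toy MDP;
# all names and numbers are hypothetical, not the solver's own API.
import numpy as np

P = {0: np.array([[0.9, 0.1, 0.0], [0.0, 0.9, 0.1], [0.0, 0.0, 1.0]]),
     1: np.array([[0.1, 0.9, 0.0], [0.1, 0.0, 0.9], [0.0, 0.0, 1.0]])}  # P[a][s, s']
R = {0: np.array([0.0, 0.0, 0.0]), 1: np.array([-1.0, 10.0, 0.0])}      # R[a][s]
discount, threshold = 0.9, 1e-6

V = np.zeros(3)
converged = False
while not converged:
    V_old = V.copy()
    for s in range(3):
        # Bellman optimality backup: V(s) <- max_a [R(s,a) + gamma * sum_s' P(s'|s,a) V(s')]
        V[s] = max(R[a][s] + discount * P[a][s].dot(V_old) for a in P)
    # Converged when the L-infinity norm of the change is below the threshold,
    # mirroring the weight_vec_change test in solve() above
    converged = np.max(np.abs(V - V_old)) < threshold

# Greedy policy with respect to the converged values
greedy_policy = [max(P, key=lambda a: R[a][s] + discount * P[a][s].dot(V))
                 for s in range(3)]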
def policyEvaluation(self, policy):
    '''
    Evaluate the given policy by applying the Bellman backup over all
    states until the change in the weight vector is less than the
    convergence threshold.

    Returns: convergence status as a boolean.
    '''
    converged = False
    policy_evaluation_iteration = 0
    while (not converged and
           self.hasTime() and
           policy_evaluation_iteration < self.max_PE_iterations):
        policy_evaluation_iteration += 1

        # Sweep the state space
        for i in range(self.representation.agg_states_num):

            # Check for remaining solver time
            if not self.hasTime():
                break

            # Map a state ID to a state
            s = self.representation.stateID2state(i)

            # Skip terminal states and states with no possible action
            possible_actions = self.domain.possibleActions(s=s)
            if self.domain.isTerminal(s) or len(possible_actions) == 0:
                continue

            # Apply the Bellman backup for the policy's action
            self.BellmanBackup(s,
                               policy.pi(s, False, possible_actions),
                               self.ns_samples,
                               policy)

            # Update the number of backups
            self.bellmanUpdates += 1

            # Periodically log performance
            if self.bellmanUpdates % self.log_interval == 0:
                performance_return = self.performanceRun()[0]
                self.logger.info(
                    '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                    (hhmmss(deltaT(self.start_time)),
                     self.bellmanUpdates,
                     performance_return))

        # Check for convergence: L_infinity norm of the difference between
        # the policy's weight vector and the representation's weight vector
        weight_vec_change = l_norm(
            policy.representation.weight_vec - self.representation.weight_vec,
            np.inf)
        converged = weight_vec_change < self.convergence_threshold

        # Log status
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
            (policy_evaluation_iteration,
             hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates,
             weight_vec_change))

        # Show plots
        if self.show:
            self.domain.show(policy.pi(s, False, possible_actions),
                             self.representation,
                             s=s)

    return converged
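# --- Illustrative sketch (not part of the solver code above) ----------------
# policyEvaluation above is one half of policy iteration; the other half is
# greedy policy improvement on the evaluated values. Minimal sketch of how the
# two alternate on the same hypothetical toy MDP; names are illustrative, not
# this class's solve method.
import numpy as np

P = {0: np.array([[0.9, 0.1, 0.0], [0.0, 0.9, 0.1], [0.0, 0.0, 1.0]]),
     1: np.array([[0.1, 0.9, 0.0], [0.1, 0.0, 0.9], [0.0, 0.0, 1.0]])}  # P[a][s, s']
R = {0: np.array([0.0, 0.0, 0.0]), 1: np.array([-1.0, 10.0, 0.0])}      # R[a][s]
discount, threshold = 0.9, 1e-6

def evaluate(policy):
    # Iterative policy evaluation, as in policyEvaluation above
    V = np.zeros(3)
    while True:
        V_old = V.copy()
        for s in range(3):
            V[s] = R[policy[s]][s] + discount * P[policy[s]][s].dot(V_old)
        if np.max(np.abs(V - V_old)) < threshold:
            return V

policy = [0, 0, 0]
stable = False
while not stable:
    V = evaluate(policy)
    # Greedy improvement with respect to the evaluated values
    new_policy = [max(P, key=lambda a: R[a][s] + discount * P[a][s].dot(V))
                  for s in range(3)]
    stable = (new_policy == policy)
    policy = new_policy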
def solveInMatrixFormat(self):
    # While delta_weight_vec > threshold:
    #   1. Gather data following an e-greedy policy
    #   2. Calculate A and b estimates
    #   3. Calculate new_weight_vec and delta_weight_vec
    # Return the greedy policy w.r.t. the last weight_vec
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples used for each policy evaluation phase (L1 in
    # Geramifard et al., FTML 2012)
    self.samples_num = 1000

    self.start_time = clock()  # Track the total solve time
    samples = 0
    converged = False
    iteration = 0
    while self.hasTime() and not converged:

        # 1. Gather samples following an e-greedy policy
        S, Actions, NS, R, T = self.collectSamples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.actions_num
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor
        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        for i in range(self.samples_num):
            phi_s_a = self.representation.phi_sa(
                S[i], T[i], Actions[i, 0]).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d.T)
            self.b += phi_s_a * R[i, 0]

        # 3. Calculate new_weight_vec and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
        iteration += 1
        if solve_time > 1:
            self.logger.info(
                '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                (iteration, solve_time))
        delta_weight_vec = l_norm(
            new_weight_vec - self.representation.weight_vec, np.inf)
        converged = delta_weight_vec < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec
        performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
        self.logger.info(
            '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f' %
            (iteration,
             hhmmss(deltaT(self.start_time)),
             samples,
             delta_weight_vec,
             performance_return))
        if self.show:
            self.domain.show(S[-1], Actions[-1], self.representation)

        # Store stats
        self.result["samples"].append(samples)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedPolicyIteration, self).solve()
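# --- Illustrative sketch (not part of the solver code above) ----------------
# The A and b accumulation above is a sample-based estimate of the LSTD-style
# linear system A w = b, with A ~ sum_i phi(s_i,a_i) (phi(s_i,a_i) -
# gamma * E[phi(s'_i,a'_i)])^T and b ~ sum_i phi(s_i,a_i) r_i. Minimal NumPy
# sketch of building and solving that system from toy feature vectors; the
# ridge term and all data are hypothetical stand-ins for regularize()/solveLinear().
import numpy as np

rng = np.random.default_rng(0)
n, num_samples, discount = 8, 200, 0.9

# Hypothetical sample features phi(s, a), expected next features E[phi(s', a')], rewards
phi_sa = rng.normal(size=(num_samples, n))
expected_phi_ns_na = rng.normal(size=(num_samples, n))
rewards = rng.normal(size=num_samples)

A = np.zeros((n, n))
b = np.zeros(n)
for i in range(num_samples):
    d = phi_sa[i] - discount * expected_phi_ns_na[i]
    A += np.outer(phi_sa[i], d)     # A += phi(s,a) (phi(s,a) - gamma E[phi(s',a')])^T
    b += phi_sa[i] * rewards[i]     # b += phi(s,a) * r

# A small ridge term keeps A well conditioned (stand-in for regularize()),
# then solve A w = b for the new weight vector (stand-in for solveLinear())
ridge = 1e-6
new_weight_vec = np.linalg.solve(A + ridge * np.eye(n), b)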