def policyEvaluation(self, policy):
    """
    Evaluate a given policy by applying the Bellman backup over all states
    until the change is less than a given threshold.

    A sweep over the state space is repeated while the evaluation has not
    converged, ``self.hasTime()`` is true and fewer than
    ``self.max_PE_iterations`` sweeps have been run. After every sweep the
    L-infinity norm of the difference between the weight vector of
    ``policy.representation`` and the weight vector of
    ``self.representation`` is compared with ``self.convergence_threshold``.

    :param policy: policy to evaluate; its ``pi`` method selects the action
        that is backed up in every visited state.
    :return: (bool) convergence status of the last sweep; ``False`` when no
        sweep was run.
    """
    converged = False
    policy_evaluation_iteration = 0

    # State and action set most recently visited by a sweep. Both start as
    # None so the plotting step below is skipped, instead of raising a
    # NameError, when time runs out before any state has been visited.
    s = None
    possible_actions = None

    while (not converged and self.hasTime()
           and policy_evaluation_iteration < self.max_PE_iterations):
        policy_evaluation_iteration += 1

        # Sweep the state space
        for i in range(self.representation.agg_states_num):
            # Check for solver time
            if not self.hasTime():
                break

            # Map a state ID to a state
            s = self.representation.stateID2state(i)

            # Skip terminal states and states with no possible action
            possible_actions = self.domain.possibleActions(s=s)
            if self.domain.isTerminal(s) or len(possible_actions) == 0:
                continue

            # Apply Bellman backup for the action chosen by the policy
            self.BellmanBackup(s, policy.pi(s, False, possible_actions),
                               self.ns_samples, policy)

            # Update number of backups
            self.bellmanUpdates += 1

            # Periodically check the performance
            if self.bellmanUpdates % self.log_interval == 0:
                performance_return = self.performanceRun()[0]
                self.logger.info(
                    '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                    (hhmmss(deltaT(self.start_time)), self.bellmanUpdates,
                     performance_return))

        # Check for convergence: L_infinity norm of the difference between
        # the policy's weight vector and the representation's weight vector
        weight_vec_change = l_norm(
            policy.representation.weight_vec -
            self.representation.weight_vec, np.inf)
        converged = weight_vec_change < self.convergence_threshold

        # Log status
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
            (policy_evaluation_iteration, hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates, weight_vec_change))

        # Show plots for the last visited state (if any state was visited)
        if self.show and possible_actions is not None:
            self.domain.show(policy.pi(s, False, possible_actions),
                             self.representation, s=s)
    return converged
def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment.

    Runs ``self.checks_per_policy`` performance runs, appends their averaged
    statistics to ``self.result`` and logs a summary line. The global NumPy
    random state is saved before the runs and restored afterwards, also when
    a performance run raises.

    :param total_steps: (int) number of steps used in learning so far
    :param episode_number: (int) number of episodes used in learning so far
    :param visualize: (int) performance run number ``j`` (counted from 0) is
        visualized when ``visualize > j``
    """
    # TODO resolve this hack
    if className(self.agent) == 'PolicyEvaluation':
        # Policy Evaluation Case
        self.result = self.agent.STATS
        return

    random_state = np.random.get_state()
    try:
        elapsedTime = deltaT(self.start_time)
        performance_return = 0.
        performance_steps = 0.
        performance_term = 0.
        performance_discounted_return = 0.
        for j in range(self.checks_per_policy):
            p_ret, p_step, p_term, p_dret = self.performanceRun(
                total_steps, visualize=visualize > j)
            performance_return += p_ret
            performance_steps += p_step
            performance_term += p_term
            performance_discounted_return += p_dret

        # Average the statistics over all performance runs
        performance_return /= self.checks_per_policy
        performance_steps /= self.checks_per_policy
        performance_term /= self.checks_per_policy
        performance_discounted_return /= self.checks_per_policy

        self.result["learning_steps"].append(total_steps)
        self.result["return"].append(performance_return)
        self.result["learning_time"].append(self.elapsed_time)
        self.result["num_features"].append(
            self.agent.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["learning_episode"].append(episode_number)
        self.result["discounted_return"].append(
            performance_discounted_return)

        # reset start time such that performanceRuns don't count
        self.start_time = clock() - elapsedTime
        if total_steps > 0:
            remaining = hhmmss(
                elapsedTime * (self.max_steps - total_steps) / total_steps)
        else:
            # No step taken yet, so no rate to extrapolate from
            remaining = "?"
        self.logger.info(
            self.performance_log_template.format(
                total_steps=total_steps,
                elapsed=hhmmss(elapsedTime),
                remaining=remaining,
                totreturn=performance_return,
                steps=performance_steps,
                num_feat=self.agent.representation.features_num))
    finally:
        # Always put the global random state back so that evaluation never
        # changes the random numbers seen by the learning process.
        np.random.set_state(random_state)
def solve(self):
    """
    Solve the domain MDP.

    Alternates trajectory based policy evaluation with policy improvement
    until the L2-norm of the change of the weight vector between two
    consecutive policies drops below ``self.convergence_threshold`` or
    ``self.hasTime()`` turns false. After every iteration a performance run
    is executed and its statistics are appended to ``self.result``.
    """
    self.start_time = clock()  # Used to track the total time for solving
    self.bellmanUpdates = 0
    converged = False
    PI_iteration = 0

    # The policy is maintained as separate copy of the representation.
    # This way as the representation is updated the policy remains intact
    policy = eGreedy(deepcopy(self.representation), epsilon=0,
                     forcedDeterministicAmongBestActions=True)

    while self.hasTime() and not converged:
        self.trajectoryBasedPolicyEvaluation(policy)

        # Policy Improvement (Updating the representation of the value
        # function will automatically improve the policy)
        PI_iteration += 1

        # Theta can increase in size if the representation is expanded,
        # hence padding the weight vector with zeros
        paddedTheta = padZeros(policy.representation.weight_vec,
                               len(self.representation.weight_vec))

        # Calculate the change in the weight_vec as L2-norm
        delta_weight_vec = np.linalg.norm(
            paddedTheta - self.representation.weight_vec)
        converged = delta_weight_vec < self.convergence_threshold

        # Update the underlying value function of the policy
        policy.representation = deepcopy(self.representation)

        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.3f, steps=%d, features=%d'
            % (PI_iteration, hhmmss(deltaT(self.start_time)),
               self.bellmanUpdates, delta_weight_vec, performance_return,
               performance_steps, self.representation.features_num))

        if self.show:
            # No state or action is in scope here, so show a start state of
            # the domain together with the action the updated policy takes.
            # NOTE(review): s0() presumably restarts the domain; the next
            # evaluation starts a new trajectory anyway - confirm.
            s, terminal, p_actions = self.domain.s0()
            a = policy.pi(s, terminal, p_actions)
            self.domain.show(a, representation=self.representation, s=s)

        # store stats
        self.result["bellman_updates"].append(self.bellmanUpdates)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(
            self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(
            performance_discounted_return)
        # NOTE(review): the key is misspelt ("improvemnt"); it is kept as is
        # because consumers of the results may rely on this exact name.
        self.result["policy_improvemnt_iteration"].append(PI_iteration)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedPolicyIteration, self).solve()
def trajectoryBasedPolicyEvaluation(self, policy):
    """
    Evaluate the current policy by simulating trajectories and updating the
    value function along the visited states.

    Trajectories are generated until ``self.MIN_CONVERGED_TRAJECTORIES``
    consecutive trajectories each had a maximum absolute Bellman error below
    ``self.convergence_threshold``, ``self.hasTime()`` turns false, or
    ``self.max_PE_iterations`` trajectories have been generated.

    :param policy: the policy being evaluated; it is passed to
        ``self.sample_ns_na`` and ``Q_oneStepLookAhead``.
    """
    PE_iteration = 0
    evaluation_is_accurate = False
    converged_trajectories = 0
    while (not evaluation_is_accurate and self.hasTime()
           and PE_iteration < self.max_PE_iterations):

        # Generate a new episode with the current values
        max_Bellman_Error = 0
        step = 0

        s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)

        while (not terminal and step < self.domain.episodeCap
               and self.hasTime()):
            new_Q = self.representation.Q_oneStepLookAhead(
                s, a, self.ns_samples, policy)
            phi_s = self.representation.phi(s, terminal)
            phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s=phi_s)
            old_Q = np.dot(phi_s_a, self.representation.weight_vec)
            bellman_error = new_Q - old_Q

            # Update the value function using approximate bellman backup
            self.representation.weight_vec += (
                self.alpha * bellman_error * phi_s_a)
            self.bellmanUpdates += 1
            step += 1
            max_Bellman_Error = max(max_Bellman_Error, abs(bellman_error))

            # Discover features if the representation has the discover
            # method (None is the default when it is not an attribute).
            # The callable that was checked is the one that gets called;
            # previously `post_discover` was invoked behind a guard that
            # only verified `discover`.
            discover_func = getattr(self.representation, 'discover', None)
            if discover_func and callable(discover_func):
                discover_func(phi_s, bellman_error)

            # Move on to the next state and action of the trajectory
            s, a, terminal = self.sample_ns_na(policy, a)

        # check for convergence of policy evaluation
        PE_iteration += 1
        if max_Bellman_Error < self.convergence_threshold:
            converged_trajectories += 1
        else:
            # The streak of accurate trajectories is broken
            converged_trajectories = 0
        evaluation_is_accurate = (
            converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES)
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d'
            % (PE_iteration, hhmmss(deltaT(self.start_time)),
               self.bellmanUpdates, max_Bellman_Error,
               self.representation.features_num))
def policyImprovement(self, policy):
    """
    Improve a policy by taking the greedy action in each state based on the
    value function.

    Every action of every non-terminal state is backed up while
    ``self.hasTime()`` is true. A state counts as a policy change when the
    action chosen by ``policy`` differs from the best action of
    ``self.representation``. Afterwards the weights of the representation
    are copied into the policy, a performance run is executed and its
    statistics are appended to ``self.result``.

    :param policy: the policy to improve; its representation's weight vector
        is overwritten with a copy of ``self.representation.weight_vec``.
    :return: tuple ``(policy, policyChanges)`` with the updated policy and
        the number of states in which the chosen action changed.
    """
    changed_states = 0
    state_id = 0
    while state_id < self.representation.agg_states_num:
        if not self.hasTime():
            break
        s = self.representation.stateID2state(state_id)
        state_id += 1

        # Terminal states and states without actions are left untouched
        if self.domain.isTerminal(s) or not len(
                self.domain.possibleActions(s)):
            continue

        # Back up every action available in this state
        for a in self.domain.possibleActions(s):
            if not self.hasTime():
                break
            self.BellmanBackup(s, a, self.ns_samples, policy)

        # Compare the action of the old policy with the greedy action
        old_choice = policy.pi(
            s, False, self.domain.possibleActions(s=s))
        greedy_choice = self.representation.bestAction(
            s, False, self.domain.possibleActions(s=s))
        if old_choice != greedy_choice:
            changed_states += 1

    # This will cause the policy to be copied over
    policy.representation.weight_vec = self.representation.weight_vec.copy()

    run_return, run_steps, run_term, run_discounted = self.performanceRun()
    self.logger.info(
        'PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d'
        % (self.policy_improvement_iteration,
           hhmmss(deltaT(self.start_time)), self.bellmanUpdates,
           changed_states, run_return, run_steps))

    # store stats
    stats = self.result
    stats["bellman_updates"].append(self.bellmanUpdates)
    stats["return"].append(run_return)
    stats["planning_time"].append(deltaT(self.start_time))
    stats["num_features"].append(self.representation.features_num)
    stats["steps"].append(run_steps)
    stats["terminated"].append(run_term)
    stats["discounted_return"].append(run_discounted)
    stats["policy_improvement_iteration"].append(
        self.policy_improvement_iteration)
    return policy, changed_states
def run(self, visualize_performance=0, visualize_learning=False,
        visualize_steps=False, debug_on_sigurg=False):
    """
    Run the experiment, collect statistics and return the learned weights.

    The agent interacts with ``self.domain`` for ``self.max_steps`` steps.
    The agent is evaluated with ``self.evaluate`` once before learning and
    then every ``max_steps / num_policy_checks`` learning steps (at least
    every step).

    :param visualize_performance: (int) forwarded to ``self.evaluate`` as
        its ``visualize`` argument.
    :param visualize_learning: (boolean) call ``self.domain.showLearning``
        with the agent's representation before each performance evaluation.
    :param visualize_steps: (boolean) call ``self.domain.show`` for the
        steps taken during learning.
    :param debug_on_sigurg: (boolean) if true, ``ipdb_on_SIGURG()`` is
        called before the experiment starts.
    :return: ``self.agent.representation.weight_vec`` after learning.
    """
    if debug_on_sigurg:
        problems.rlpy.Tools.ipshell.ipdb_on_SIGURG()

    self.performance_domain = deepcopy(self.domain)
    self.seed_components()

    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0

    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.showLearning(self.agent.representation)

    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to show the total time took the process
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the initial policy
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.
    # Forces a new episode to be started in the first loop iteration
    terminal = True
    while total_steps < self.max_steps:
        if terminal or eps_steps >= self.domain.episodeCap:
            # Start a new episode
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

            # Reset the per-episode counters
            eps_return = 0
            eps_steps = 0
            episode_number += 1

        # Act, step
        r, ns, terminal, np_actions = self.domain.step(a)

        self._gather_transition_statistics(s, a, ns, r, learning=True)
        na = self.agent.policy.pi(ns, terminal, np_actions)

        total_steps += 1
        eps_steps += 1
        eps_return += r

        # Print current performance at the end of an episode, but not more
        # often than every `log_interval` (as measured by deltaT)
        if ((terminal or eps_steps == self.domain.episodeCap)
                and deltaT(start_log_time) > self.log_interval):
            start_log_time = clock()
            elapsedTime = deltaT(self.start_time)
            self.logger.info(
                self.log_template.format(
                    total_steps=total_steps,
                    elapsed=hhmmss(elapsedTime),
                    remaining=hhmmss(
                        elapsedTime * (self.max_steps - total_steps) /
                        total_steps),
                    totreturn=eps_return,
                    steps=eps_steps,
                    num_feat=self.agent.representation.features_num))

        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions
        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)

        # Check performance. The interval is clamped to at least 1 because
        # the quotient is 0 when max_steps < num_policy_checks, which would
        # make the modulo below raise ZeroDivisionError.
        check_interval = max(
            1, old_div(self.max_steps, self.num_policy_checks))
        if total_steps % check_interval == 0:
            self.elapsed_time = deltaT(
                self.start_time) - self.total_eval_time

            # show policy or value function
            if visualize_learning:
                self.domain.showLearning(self.agent.representation)

            self.evaluate(total_steps, episode_number,
                          visualize_performance)
            self.total_eval_time += deltaT(self.start_time) - \
                self.elapsed_time - \
                self.total_eval_time
            start_log_time = clock()

    # Visual
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info("Total Experiment_ql Duration %s" %
                     (hhmmss(deltaT(self.start_time))))
    return self.agent.representation.weight_vec
def run(self, visualize_performance=0, visualize_learning=False,
        visualize_steps=False, debug_on_sigurg=False):
    """
    Run the experiment and collect statistics / generate the results

    The agent is evaluated with ``self.evaluate`` once before learning and
    then every ``max_steps / num_policy_checks`` learning steps (at least
    every step).

    :param visualize_performance: (int)
        determines whether a visualization of the steps taken in
        performance runs are shown. 0 means no visualization is shown.
        A value n > 0 means that only the first n performance runs for a
        specific policy are shown (i.e., for n < checks_per_policy, not all
        performance runs are shown)
    :param visualize_learning: (boolean)
        show some visualization of the learning status before each
        performance evaluation (e.g. Value function)
    :param visualize_steps: (boolean)
        visualize all steps taken during learning
    :param debug_on_sigurg: (boolean)
        if true, the ipdb debugger is opened when the python process
        receives a SIGURG signal. This allows to enter a debugger at any
        time, e.g. to view data interactively or actual debugging.
        The feature works only in Unix systems. The signal can be sent
        with the kill command:

            kill -URG pid

        where pid is the process id of the python interpreter running this
        function.
    """
    if debug_on_sigurg:
        problems.rlpy.Tools.ipshell.ipdb_on_SIGURG()

    self.seed_components()

    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0

    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.showLearning(self.agent.representation)

    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to show the total time took the process
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the initial policy
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.
    # Forces a new episode to be started in the first loop iteration
    terminal = True
    while total_steps < self.max_steps:
        if terminal or eps_steps >= self.domain.episodeCap:
            # Start a new episode
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

            # Reset the per-episode counters
            eps_return = 0
            eps_steps = 0
            episode_number += 1

        # Act, step
        r, ns, terminal, np_actions = self.domain.step(a)

        self._gather_transition_statistics(s, a, ns, r, learning=True)
        na = self.agent.policy.pi(ns, terminal, np_actions)

        total_steps += 1
        eps_steps += 1
        eps_return += r

        # Print current performance at the end of an episode, but not more
        # often than every `log_interval` (as measured by deltaT)
        if ((terminal or eps_steps == self.domain.episodeCap)
                and deltaT(start_log_time) > self.log_interval):
            start_log_time = clock()
            elapsedTime = deltaT(self.start_time)
            self.logger.info(
                self.log_template.format(
                    total_steps=total_steps,
                    elapsed=hhmmss(elapsedTime),
                    remaining=hhmmss(
                        elapsedTime * (self.max_steps - total_steps) /
                        total_steps),
                    totreturn=eps_return,
                    steps=eps_steps,
                    num_feat=self.agent.representation.features_num))

        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions
        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)

        # Check performance. The interval is clamped to at least 1 because
        # the quotient is 0 when max_steps < num_policy_checks, which would
        # make the modulo below raise ZeroDivisionError.
        check_interval = max(
            1, old_div(self.max_steps, self.num_policy_checks))
        if total_steps % check_interval == 0:
            self.elapsed_time = deltaT(
                self.start_time) - self.total_eval_time

            # show policy or value function
            if visualize_learning:
                self.domain.showLearning(self.agent.representation)

            self.evaluate(total_steps, episode_number,
                          visualize_performance)
            self.total_eval_time += deltaT(self.start_time) - \
                self.elapsed_time - \
                self.total_eval_time
            start_log_time = clock()

    # Visual
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info("Total Experiment Duration %s" %
                     (hhmmss(deltaT(self.start_time))))
def solve(self):
    """
    Solve the domain MDP.

    Generates e-greedy trajectories with the current values and applies an
    approximate Bellman backup to every visited state-action pair. The
    solver stops once ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive
    trajectories had a maximum absolute Bellman error below
    ``self.convergence_threshold`` or ``self.hasTime()`` turns false. After
    every trajectory a performance run is executed and its statistics are
    appended to ``self.result``.
    """

    def pick_action(state, is_terminal, actions):
        # e-greedy choice: the random number is drawn first, then either the
        # best action of the representation or a random action is taken.
        if np.random.rand() > self.epsilon:
            return self.representation.bestAction(
                state, is_terminal, actions)
        return randSet(actions)

    # Used to show the total time took the process
    self.start_time = clock()
    backups = 0
    trajectory_count = 0
    converged = False
    # Number of consecutive trajectories with a very small observed
    # Bellman error
    accurate_streak = 0

    while self.hasTime() and not converged:
        # Generate a new episode e-greedy with the current values
        worst_error = 0
        step = 0
        s, terminal, p_actions = self.domain.s0()
        a = pick_action(s, terminal, p_actions)

        while (not terminal and step < self.domain.episodeCap
               and self.hasTime()):
            target_q = self.representation.Q_oneStepLookAhead(
                s, a, self.ns_samples)
            phi_s = self.representation.phi(s, terminal)
            phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s)
            current_q = np.dot(phi_s_a, self.representation.weight_vec)
            bellman_error = target_q - current_q

            # Approximate Bellman backup on the weight vector
            self.representation.weight_vec += (
                self.alpha * bellman_error * phi_s_a)
            backups += 1
            step += 1

            # Discover features if the representation has the discover
            # method (None is the default when it is not an attribute)
            discover_func = getattr(self.representation, 'discover', None)
            if discover_func and callable(discover_func):
                self.representation.discover(phi_s, bellman_error)

            worst_error = max(worst_error, abs(bellman_error))

            # Simulate new state and action on trajectory
            _, s, terminal, p_actions = self.domain.step(a)
            a = pick_action(s, terminal, p_actions)

        # check for convergence
        trajectory_count += 1
        if worst_error < self.convergence_threshold:
            accurate_streak += 1
        else:
            accurate_streak = 0

        run_return, run_steps, run_term, run_discounted = \
            self.performanceRun()
        converged = accurate_streak >= self.MIN_CONVERGED_TRAJECTORIES
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, Steps=%d, Features=%d'
            % (trajectory_count, hhmmss(deltaT(self.start_time)), backups,
               worst_error, run_return, run_steps,
               self.representation.features_num))
        if self.show:
            self.domain.show(a, representation=self.representation, s=s)

        # store stats
        stats = self.result
        stats["bellman_updates"].append(backups)
        stats["return"].append(run_return)
        stats["planning_time"].append(deltaT(self.start_time))
        stats["num_features"].append(self.representation.features_num)
        stats["steps"].append(run_steps)
        stats["terminated"].append(run_term)
        stats["discounted_return"].append(run_discounted)
        stats["iteration"].append(trajectory_count)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedValueIteration, self).solve()
def solveInMatrixFormat(self):
    """
    Solve the domain MDP by policy iteration in matrix format.

    While the change of the weight vector is above the threshold and
    ``self.hasTime()`` is true:

        1. Gather data following an e-greedy policy
        2. Calculate A and b estimates
        3. Calculate new_weight_vec and delta_weight_vec

    After every iteration the new weight vector is stored in the
    representation, a performance run is executed and its statistics are
    appended to ``self.result``. The method sets ``self.policy``,
    ``self.samples_num``, ``self.A`` and ``self.b`` and returns nothing.
    """
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et. al. FTML 2012 paper
    self.samples_num = 1000

    self.start_time = clock()  # Used to track the total time for solving
    samples = 0
    converged = False
    iteration = 0
    while self.hasTime() and not converged:

        # 1. Gather samples following an e-greedy policy
        S, Actions, NS, R, T = self.collectSamples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.actions_num
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor

        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        for i in range(self.samples_num):
            # Features are reshaped to column vectors
            phi_s_a = self.representation.phi_sa(
                S[i], T[i], Actions[i, 0]).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d.T)
            self.b += phi_s_a * R[i, 0]

        # 3. calculate new_weight_vec, and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
        iteration += 1
        # Only report solves that took noticeably long
        if solve_time > 1:
            self.logger.info(
                '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                (iteration, solve_time))
        delta_weight_vec = l_norm(
            new_weight_vec - self.representation.weight_vec, np.inf)
        converged = delta_weight_vec < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec

        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f'
            % (iteration, hhmmss(deltaT(self.start_time)), samples,
               delta_weight_vec, performance_return))

        if self.show:
            # Show the last gathered sample: action first, then the
            # representation, with the state passed as `s`. The action is
            # read from column 0, as in the sample loop above.
            self.domain.show(Actions[-1, 0],
                             representation=self.representation, s=S[-1])

        # store stats
        self.result["samples"].append(samples)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(
            self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(
            performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')

    super(TrajectoryBasedPolicyIteration, self).solve()
def solve(self):
    """
    Solve the domain MDP.

    Sweeps the whole state space, applying a Bellman backup to every
    possible action of every state, until the L-infinity norm of the change
    of the weight vector over a complete sweep is below
    ``self.convergence_threshold`` or ``self.hasTime()`` turns false. After
    every sweep a performance run is executed and its statistics are
    appended to ``self.result``.

    :return: 0 when the representation is not tabular (nothing is solved in
        that case); otherwise nothing is returned.
    """
    self.start_time = clock()  # Used to show the total time took the process
    bellmanUpdates = 0  # used to track the performance improvement.
    converged = False
    iteration = 0

    # Check for Tabular Representation
    if not self.IsTabularRepresentation():
        self.logger.error(
            "Value Iteration works only with a tabular representation.")
        return 0

    no_of_states = self.representation.agg_states_num

    # Last state and action touched by a sweep. They start as None so the
    # plotting step is skipped, instead of raising a NameError, when no
    # state-action pair has been visited yet.
    s = None
    a = None

    while self.hasTime() and not converged:

        iteration += 1

        # Store the weight vector for comparison
        prev_weight_vec = self.representation.weight_vec.copy()

        # Becomes False when the time budget cuts the sweep short
        sweep_completed = True

        # Sweep the state space
        for i in range(no_of_states):

            s = self.representation.stateID2state(i)

            # Sweep through possible actions
            for a in self.domain.possibleActions(s):

                # Check for available planning time
                if not self.hasTime():
                    sweep_completed = False
                    break

                self.BellmanBackup(s, a, ns_samples=self.ns_samples)
                bellmanUpdates += 1

                # Create log
                if bellmanUpdates % self.log_interval == 0:
                    performance_return, _, _, _ = self.performanceRun()
                    self.logger.info(
                        '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                        (hhmmss(deltaT(self.start_time)), bellmanUpdates,
                         performance_return))

        # Check for convergence. A sweep that ran out of time skipped
        # backups, so a small weight change there does not mean the values
        # have converged.
        weight_vec_change = l_norm(
            prev_weight_vec - self.representation.weight_vec, np.inf)
        converged = (sweep_completed and
                     weight_vec_change < self.convergence_threshold)

        # log the stats
        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.4f, Steps=%d'
            % (iteration, hhmmss(deltaT(self.start_time)), bellmanUpdates,
               weight_vec_change, performance_return, performance_steps))

        # Show the domain and value function
        if self.show and a is not None:
            self.domain.show(a, s=s, representation=self.representation)

        # store stats
        self.result["bellman_updates"].append(bellmanUpdates)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(
            performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')
    super(ValueIteration, self).solve()