def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment.

    Executes ``self.checks_per_policy`` performance runs, averages their
    return, step count, termination value and discounted return, appends the
    averages (together with bookkeeping values) to ``self.result`` and logs
    one summary line built from ``self.performance_log_template``.

    The global NumPy random state is captured on entry and restored on exit,
    including when a performance run or the logging raises. Only the state
    returned by ``np.random.get_state()`` is saved; any random state held by
    other objects is not touched here.

    ``self.start_time`` is shifted forward afterwards so that the time spent
    in the performance runs is not counted as elapsed time.

    :param total_steps: (int)
        number of steps used in learning so far
    :param episode_number: (int)
        number of episodes used in learning so far
    :param visualize: (int)
        performance run ``j`` (0-based) is called with
        ``visualize=visualize > j``; with the default 0 no run is visualized.
    """
    random_state = np.random.get_state()
    try:
        # Time elapsed before the performance runs start.
        elapsed = deltaT(self.start_time)
        num_checks = self.checks_per_policy

        avg_return = 0.0
        avg_steps = 0.0
        avg_term = 0.0
        avg_disc_return = 0.0
        for j in range(num_checks):
            p_ret, p_step, p_term, p_dret = self.performance_run(
                total_steps, visualize=visualize > j
            )
            avg_return += p_ret
            avg_steps += p_step
            avg_term += p_term
            avg_disc_return += p_dret
        avg_return /= num_checks
        avg_steps /= num_checks
        avg_term /= num_checks
        avg_disc_return /= num_checks

        num_features = self.agent.representation.features_num
        self.result["learning_steps"].append(total_steps)
        self.result["return"].append(avg_return)
        self.result["learning_time"].append(self.elapsed_time)
        self.result["num_features"].append(num_features)
        self.result["steps"].append(avg_steps)
        self.result["terminated"].append(avg_term)
        self.result["learning_episode"].append(episode_number)
        self.result["discounted_return"].append(avg_disc_return)

        # reset start time such that performanceRuns don't count
        self.start_time = clock() - elapsed

        if total_steps > 0:
            # Linear extrapolation of the time needed for the remaining steps.
            remaining = hhmmss(
                elapsed * (self.max_steps - total_steps) / total_steps
            )
        else:
            # No steps taken yet, so there is no rate to extrapolate from.
            remaining = "?"
        self.logger.info(
            self.performance_log_template.format(
                total_steps=total_steps,
                elapsed=hhmmss(elapsed),
                remaining=remaining,
                totreturn=avg_return,
                steps=avg_steps,
                num_feat=self.agent.representation.features_num,
            )
        )
    finally:
        # Always put the global RNG back, even when evaluation fails.
        np.random.set_state(random_state)
def traj_based_policy_evaluation(self, policy):
    """
    Evaluate ``policy`` from simulated trajectories.

    Trajectories are sampled with ``self.sample_ns_na``. For each visited
    state-action pair, ``self.representation.weight_vec`` is moved along
    ``phi_s_a`` scaled by ``self.alpha`` times the Bellman error returned by
    ``self._bellman_error``.

    The evaluation stops when any of the following holds:

    * ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive trajectories each had a
      maximum absolute Bellman error below ``self.convergence_threshold``;
    * ``self.has_time()`` is false;
    * ``self.max_pe_iterations`` trajectories have been processed.

    One log line is emitted per trajectory.

    :param policy: policy forwarded to ``self.sample_ns_na``.
    """
    trajectories_done = 0
    accurate = False
    small_error_streak = 0

    while (not accurate and self.has_time()
           and trajectories_done < self.max_pe_iterations):
        # Start a fresh trajectory under the given policy.
        s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)
        worst_error = 0
        step = 0

        while (not terminal and step < self.domain.episode_cap
               and self.has_time()):
            bellman_error, phi_s, phi_s_a = self._bellman_error(
                s, a, terminal)

            # Approximate Bellman backup on the value function weights.
            self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
            self.bellman_updates += 1
            step += 1

            # Track the largest absolute error seen on this trajectory.
            error_size = abs(bellman_error)
            if error_size > worst_error:
                worst_error = error_size

            # NOTE(review): the guard tests for ``discover`` while the call
            # goes to ``post_discover`` -- kept as found, confirm intent.
            if hasattr(self.representation, "discover"):
                self.representation.post_discover(phi_s, bellman_error)

            s, a, terminal = self.sample_ns_na(policy, a)

        # Convergence bookkeeping: count consecutive low-error trajectories.
        trajectories_done += 1
        if worst_error < self.convergence_threshold:
            small_error_streak = small_error_streak + 1
        else:
            small_error_streak = 0
        accurate = small_error_streak >= self.MIN_CONVERGED_TRAJECTORIES

        summary = (
            trajectories_done,
            hhmmss(deltaT(self.start_time)),
            self.bellman_updates,
            worst_error,
            self.representation.features_num,
        )
        self.logger.info(
            "PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d"
            % summary)
def policy_improvement(self, policy):
    """
    Improve ``policy`` greedily with respect to the current value function.

    Sweeps state ids ``0 .. self.representation.num_states_total - 1`` while
    ``self.has_time()`` holds. For every non-terminal state with at least one
    possible action, a Bellman backup is applied for each action, and the
    state is counted as changed when ``policy.pi`` disagrees with
    ``self.representation.best_action``.

    After the sweep the representation weights are copied into
    ``policy.representation``, a performance run is executed, and its
    statistics are logged and appended to ``self.result``.

    :param policy: policy to improve; its ``representation.weight`` is
        overwritten with a copy of ``self.representation.weight``.
    :return: tuple ``(policy, policyChanges)`` -- the same policy object and
        the number of states whose action differed from the greedy one.
    """
    policy_changes = 0
    for state_id in range(self.representation.num_states_total):
        if not self.has_time():
            break
        s = self.representation.stateID2state(state_id)
        # Queried once per state and reused below; this assumes
        # possible_actions depends only on ``s`` (policy_evaluation in this
        # file reuses a single query the same way).
        p_actions = self.domain.possible_actions(s)
        if self.domain.is_terminal(s) or len(p_actions) == 0:
            continue
        for a in p_actions:
            self.bellman_backup(s, a, self.ns_samples, policy)
        best_action = self.representation.best_action(s, False, p_actions)
        if policy.pi(s, False, p_actions) != best_action:
            policy_changes += 1

    # This will cause the policy to be copied over
    policy.representation.weight = self.representation.weight.copy()
    perf_return, perf_steps, perf_term, perf_disc_return = self.performance_run()
    self.logger.info(
        "PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d"
        % (
            self.policy_improvement_iteration,
            hhmmss(deltaT(self.start_time)),
            self.bellman_updates,
            policy_changes,
            perf_return,
            perf_steps,
        ))

    # store stats
    self.result["bellman_updates"].append(self.bellman_updates)
    self.result["return"].append(perf_return)
    self.result["planning_time"].append(deltaT(self.start_time))
    self.result["num_features"].append(self.representation.features_num)
    self.result["steps"].append(perf_steps)
    self.result["terminated"].append(perf_term)
    self.result["discounted_return"].append(perf_disc_return)
    self.result["policy_improvement_iteration"].append(
        self.policy_improvement_iteration)
    return policy, policy_changes
def run(self,
        visualize_performance=0,
        visualize_learning=False,
        visualize_steps=False):
    """
    Run the experiment and collect statistics / generate the results.

    The method copies the domain for performance runs, seeds the components,
    resets ``self.result`` and then alternates between learning steps and
    periodic calls to ``self.evaluate`` until ``self.max_steps`` learning
    steps have been taken.

    :param visualize_performance: (int)
        determines whether a visualization of the steps taken in
        performance runs are shown. 0 means no visualization is shown.
        A value n > 0 means that only the first n performance runs for a
        specific policy are shown (i.e., for n < checks_per_policy, not all
        performance runs are shown)
    :param visualize_learning: (boolean)
        show some visualization of the learning status before each
        performance evaluation (e.g. Value function)
    :param visualize_steps: (boolean)
        visualize all steps taken during learning
    """
    # Separate copy of the domain, flagged as the performance domain.
    self.performance_domain = deepcopy(self.domain)
    self.performance_domain.performance = True
    self.seed_components()
    # Every key defaults to a list; "seed" is overwritten with a plain value.
    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0
    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.show_learning(self.agent.representation)
    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to show the total time took the process
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the initial policy
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.0
    # Starting as "terminal" forces an episode reset on the first iteration.
    terminal = True
    while total_steps < self.max_steps:
        if terminal or eps_steps >= self.domain.episode_cap:
            # Begin a new episode: initial state and first action.
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)
            # Reset the per-episode counters.
            eps_return = 0
            eps_steps = 0
            episode_number += 1
        # Act,Step
        r, ns, terminal, np_actions = self.domain.step(a)
        self._gather_transition_statistics(s, a, ns, r, learning=True)
        na = self.agent.policy.pi(ns, terminal, np_actions)
        total_steps += 1
        eps_steps += 1
        eps_return += r
        # Print Current performance: only at the end of an episode and only
        # if more than log_interval has passed since the last log line.
        if (terminal or eps_steps == self.domain.episode_cap
            ) and deltaT(start_log_time) > self.log_interval:
            start_log_time = clock()
            elapsedTime = deltaT(self.start_time)
            self.logger.info(
                self.log_template.format(
                    total_steps=total_steps,
                    elapsed=hhmmss(elapsedTime),
                    # Linear extrapolation; total_steps >= 1 at this point.
                    remaining=hhmmss(elapsedTime *
                                     (self.max_steps - total_steps) /
                                     total_steps),
                    totreturn=eps_return,
                    steps=eps_steps,
                    num_feat=self.agent.representation.features_num,
                ))
        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        # The next state/action become the current ones for the next step.
        s, a, p_actions = ns, na, np_actions
        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)
        # Check Performance every max_steps // num_policy_checks steps.
        # NOTE(review): assuming integer attributes, the interval is 0 when
        # num_policy_checks > max_steps and the modulo then raises
        # ZeroDivisionError -- confirm callers never configure that.
        if total_steps % (self.max_steps // self.num_policy_checks) == 0:
            # Learning time so far, excluding time spent in evaluations.
            self.elapsed_time = deltaT(
                self.start_time) - self.total_eval_time
            # show policy or value function
            if visualize_learning:
                self.domain.show_learning(self.agent.representation)
            self.evaluate(total_steps, episode_number,
                          visualize_performance)
            # Algebraically this sets total_eval_time to
            # deltaT(start_time) - elapsed_time.
            self.total_eval_time += (deltaT(self.start_time) -
                                     self.elapsed_time -
                                     self.total_eval_time)
            start_log_time = clock()
    # Visual
    # NOTE(review): ``a`` is only bound inside the loop, so this would fail
    # if max_steps <= 0 and visualize_steps is set -- confirm that is
    # not a supported configuration.
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info("Total Experiment Duration %s" %
                     (hhmmss(deltaT(self.start_time))))
def solve_in_matrix_format(self):
    """
    Solve by repeatedly fitting the weight vector from sampled transitions.

    Outline of one iteration (repeated while ``self.has_time()`` holds and
    the weights have not converged):

    1. Gather ``self.samples_num`` samples with ``self.collect_samples``.
    2. Accumulate the estimates ``self.A`` and ``self.b`` from the samples.
    3. Solve the regularized linear system for the new weight vector and
       compare it with the previous one using ``l_norm``.

    After every iteration a performance run is executed and its statistics
    are logged and appended to ``self.result``. ``self.policy`` is set to an
    ``eGreedy`` policy over ``self.representation`` before the loop starts.
    """
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et. al. FTML 2012 paper
    self.samples_num = 1000

    # Used to track the total time for solving
    self.start_time = clock()
    total_samples = 0
    iteration = 0
    converged = False

    while self.has_time() and not converged:
        # Step 1: gather samples (the next-state batch is not used here).
        S, Actions, _next_states, R, T = self.collect_samples(
            self.samples_num)
        total_samples += self.samples_num

        # Step 2: accumulate the A and b estimates.
        num_actions = self.domain.num_actions
        num_features = self.representation.features_num
        discount_factor = self.domain.discount_factor
        dim = num_features * num_actions

        self.A = np.zeros((dim, dim))
        self.b = np.zeros((dim, 1))
        for i in range(self.samples_num):
            action = Actions[i, 0]
            phi_col = self.representation.phi_sa(
                S[i], T[i], action).reshape((-1, 1))
            expected_next = self.calculate_expected_phi_ns_na(
                S[i], action, self.ns_samples).reshape((-1, 1))
            feature_delta = phi_col - discount_factor * expected_next
            self.A += np.outer(phi_col, feature_delta.T)
            self.b += phi_col * R[i, 0]

        # Step 3: solve for the new weights and measure the change.
        new_weight_vec, solve_time = solveLinear(regularize(self.A),
                                                 self.b)
        iteration += 1
        if solve_time > 1:
            self.logger.info(
                "#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)"
                % (iteration, solve_time))
        weight_diff = l_norm(new_weight_vec -
                             self.representation.weight_vec)
        converged = weight_diff < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec

        run_stats = self.performance_run()
        perf_return, perf_steps, perf_term, perf_disc_return = run_stats
        self.logger.info(
            "#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                total_samples,
                weight_diff,
                perf_return,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["samples"].append(total_samples)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(
            self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
def _solve_impl(self):
    """
    Solve the domain MDP.

    Alternates trajectory-based policy evaluation with a refresh of the
    policy's own copy of the representation. The loop ends when
    ``self.has_time()`` is false or when the norm of the weight change
    between two consecutive rounds drops below
    ``self.convergence_threshold``. Statistics of a performance run are
    logged and appended to ``self.result`` after every round.
    """
    # Used to track the total time for solving
    self.start_time = clock()
    self.bellman_updates = 0
    pi_round = 0
    converged = False

    # The policy works on its own deep copy of the representation, so
    # updates to self.representation do not alter the policy until the copy
    # is refreshed at the end of a round.
    policy = eGreedy(deepcopy(self.representation),
                     epsilon=0,
                     deterministic=True)
    num_actions = self.domain.num_actions

    while self.has_time() and not converged:
        # Update the value representation from simulated trajectories.
        self.traj_based_policy_evaluation(policy)
        pi_round += 1

        # The representation may have gained features during evaluation;
        # pad the policy's (older) weights with zero columns to match.
        new_columns = (self.representation.features_num -
                       policy.representation.features_num)
        zero_block = np.zeros((num_actions, new_columns))
        old_weight_padded = np.hstack(
            (policy.representation.weight, zero_block))

        # Change of the weights, measured with np.linalg.norm.
        weight_diff = np.linalg.norm(old_weight_padded -
                                     self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # Refresh the value function the policy acts on.
        policy.representation = deepcopy(self.representation)

        run_stats = self.performance_run()
        perf_return, perf_steps, perf_term, perf_disc_return = run_stats
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
            "Return=%0.3f, steps=%d, features=%d" % (
                pi_round,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                weight_diff,
                perf_return,
                perf_steps,
                self.representation.features_num,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(self.bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(
            self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        # NOTE(review): the key below is spelled "improvemnt"; it is kept
        # unchanged because readers of the results may depend on it.
        self.result["policy_improvemnt_iteration"].append(pi_round)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
def policy_evaluation(self, policy):
    """
    Evaluate ``policy`` by sweeping Bellman backups over all states.

    Each iteration visits state ids ``0 .. num_states_total - 1``, skipping
    terminal states and states without possible actions, and applies one
    Bellman backup for the action chosen by ``policy.pi``. The sweep is cut
    short when ``self.has_time()`` turns false. Iterations repeat until the
    ``l_norm`` of the difference between the policy's weights and the
    representation's weights is below ``self.convergence_threshold``, time
    runs out, or ``self.max_pe_iterations`` iterations have been done.

    :param policy: policy being evaluated.
    :return: convergence status of the last iteration (``False`` if no
        iteration was run).
    """
    converged = False
    sweeps_done = 0
    while (not converged and self.has_time()
           and sweeps_done < self.max_pe_iterations):
        sweeps_done += 1

        # One pass over the state space.
        for state_id in range(0, self.representation.num_states_total):
            # Stop the sweep as soon as the solver is out of time.
            if not self.has_time():
                break

            s = self.representation.stateID2state(state_id)
            possible_actions = self.domain.possible_actions(s=s)

            # Only non-terminal states with at least one action are backed up.
            if not self.domain.is_terminal(s) and len(possible_actions) > 0:
                chosen_action = policy.pi(s, False, possible_actions)
                self.bellman_backup(s, chosen_action, self.ns_samples,
                                    policy)
                self.bellman_updates += 1

                # Periodic performance check, every log_interval backups.
                if self.bellman_updates % self.log_interval == 0:
                    performance_return = self.performance_run()[0]
                    self.logger.info(
                        "[%s]: BellmanUpdates=%d, Return=%0.4f" % (
                            hhmmss(deltaT(self.start_time)),
                            self.bellman_updates,
                            performance_return,
                        ))

        # Convergence test on the difference between the policy's weights
        # and the weights of this solver's representation.
        weight_diff = l_norm(policy.representation.weight -
                             self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        status = (
            sweeps_done,
            hhmmss(deltaT(self.start_time)),
            self.bellman_updates,
            weight_diff,
        )
        self.logger.info(
            "PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f"
            % status)

        # Show Plots
        if self._visualize_mode:
            self.domain.show_learning(self.representation)
    return converged
def _solve_impl(self):
    """
    Solve the domain MDP.

    Sweeps over all state ids and applies a Bellman backup for every
    possible action of every non-terminal state. Sweeps repeat until
    ``self.has_time()`` is false or the ``l_norm`` of the weight change over
    one sweep is below ``self.convergence_threshold``. Every
    ``self.log_interval`` backups a performance run is logged via
    ``self._log_updates``; after each sweep the performance statistics are
    logged and appended to ``self.result``.
    """
    # Used to show the total time took the process
    self.start_time = clock()
    bellman_updates = 0  # running count of backups across all sweeps
    iteration = 0
    converged = False
    num_states = self.representation.num_states_total

    while self.has_time() and not converged:
        iteration += 1

        # Snapshot of the weights to measure the change after the sweep.
        prev_weight = self.representation.weight.copy()

        for state_id in range(num_states):
            s = self.representation.stateID2state(state_id)
            if not self.domain.is_terminal(s):
                for a in self.domain.possible_actions(s):
                    self.bellman_backup(s, a, ns_samples=self.ns_samples)
                    bellman_updates += 1

                    # Periodic progress log.
                    if bellman_updates % self.log_interval == 0:
                        performance_return, _, _, _ = self.performance_run()
                        self._log_updates(performance_return,
                                          bellman_updates)

        # check for convergence
        weight_diff = l_norm(prev_weight - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # log the stats
        run_stats = self.performance_run()
        perf_return, perf_steps, perf_term, perf_disc_return = run_stats
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
            "Return=%0.4f, Steps=%d" % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                bellman_updates,
                weight_diff,
                perf_return,
                perf_steps,
            ))

        # Show the domain and value function
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(
            self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
def _log_updates(self, perf_return, bellman_updates):
    """
    Log the elapsed solving time, the Bellman update count and a return.

    :param perf_return: return of a performance run, formatted with
        ``%0.4f``.
    :param bellman_updates: number of Bellman updates so far, formatted
        with ``%d``.
    """
    # ``start_time`` is the attribute set at the beginning of
    # ``_solve_impl``; the previous spelling ``start_tim`` does not exist
    # and raised AttributeError on every call.
    elapsed = hhmmss(deltaT(self.start_time))
    self.logger.info("[%s]: BellmanUpdates=%d, Return=%0.4f" %
                     (elapsed, bellman_updates, perf_return))
def _solve_impl(self):
    """
    Solve the domain MDP.

    Generates trajectories from ``self.domain.s0()`` with actions picked by
    ``self.eps_greedy`` and, at every step, moves
    ``self.representation.weight_vec`` along ``phi_s_a`` scaled by
    ``self.alpha`` times the Bellman error. The solver stops when
    ``self.has_time()`` is false or when
    ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive trajectories each had a
    maximum absolute Bellman error below ``self.convergence_threshold``.
    After each trajectory a performance run is logged and its statistics are
    appended to ``self.result``.
    """
    # Used to show the total time took the process
    self.start_time = clock()
    bellman_updates = 0
    iteration = 0
    converged = False
    # Number of consecutive trajectories with a very small observed
    # Bellman error.
    small_error_streak = 0

    while self.has_time() and not converged:
        worst_error = 0
        step = 0
        s, terminal, p_actions = self.domain.s0()

        # Roll out one episode, acting e-greedily on the current values.
        while (not terminal and step < self.domain.episode_cap
               and self.has_time()):
            a = self.eps_greedy(s, terminal, p_actions)
            bellman_error, phi_s, phi_s_a = self._bellman_error(
                s, a, terminal)

            # Update Parameters
            self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
            bellman_updates += 1
            step += 1

            # NOTE(review): the guard tests for ``discover`` while the call
            # goes to ``post_discover`` -- kept as found, confirm intent.
            if hasattr(self.representation, "discover"):
                self.representation.post_discover(phi_s, bellman_error)

            # Track the largest absolute error seen on this trajectory.
            error_size = abs(bellman_error)
            if error_size > worst_error:
                worst_error = error_size

            # Advance the trajectory; the reward is not needed here.
            _, s, terminal, p_actions = self.domain.step(a)

        # check for convergence
        iteration += 1
        if worst_error < self.convergence_threshold:
            small_error_streak = small_error_streak + 1
        else:
            small_error_streak = 0

        run_stats = self.performance_run()
        perf_return, perf_steps, perf_term, perf_disc_return = run_stats
        converged = small_error_streak >= self.MIN_CONVERGED_TRAJECTORIES
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f,"
            "Steps=%d, Features=%d" % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                bellman_updates,
                worst_error,
                perf_return,
                perf_steps,
                self.representation.features_num,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(
            self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()