def run_value_iteration(self, solver, epoch):
    """Run one epoch: plan with exact value iteration, then execute greedily.

    The solver is first run to convergence over the model's transition,
    observation and reward matrices; the resulting policy is then executed
    from the initial belief for up to ``max_steps`` steps, accumulating both
    undiscounted and discounted return, which are folded into the per-epoch
    and experiment-wide result statistics.

    :param solver: value-iteration solver exposing ``value_iteration`` and
        ``select_action`` (also read: ``solver.gamma``)
    :param epoch: epoch index, used only for displaying results
    """
    run_start_time = time.time()

    reward = 0
    discounted_reward = 0
    discount = 1.0  # running discount factor, gamma^t

    # Plan offline over the full model matrices up to the planning horizon.
    solver.value_iteration(self.model.get_transition_matrix(),
                           self.model.get_observation_matrix(),
                           self.model.get_reward_matrix(),
                           self.model.planning_horizon)

    b = self.model.get_initial_belief_state()

    for i in range(self.model.max_steps):

        # TODO: record average V(b) per epoch
        action, v_b = solver.select_action(b, solver.gamma)

        step_result = self.model.generate_step(action)

        # Only update the belief while the episode is still running; a
        # terminal observation has no successor belief to track.
        if not step_result.is_terminal:
            b = self.model.belief_update(b, action, step_result.observation)

        reward += step_result.reward
        discounted_reward += discount * step_result.reward
        discount *= self.model.discount

        # show the step result
        self.display_step_result(i, step_result)

        if step_result.is_terminal:
            console(3, module, 'Terminated after episode step ' + str(i + 1))
            break

    # TODO: add belief state History sequence

    self.results.time.add(time.time() - run_start_time)
    self.results.update_reward_results(reward, discounted_reward)

    # Pretty Print results
    self.results.show(epoch)
    console(3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
    print_divider('medium')

    # Fold this epoch's statistics into the experiment-wide aggregates.
    # NOTE(review): the extra `count - 1` presumably compensates for the
    # +1 that the subsequent add() contributes — confirm against the
    # Statistic/Results implementation.
    self.experiment_results.time.add(self.results.time.running_total)
    self.experiment_results.undiscounted_return.count += (
        self.results.undiscounted_return.count - 1)
    self.experiment_results.undiscounted_return.add(
        self.results.undiscounted_return.running_total)
    self.experiment_results.discounted_return.count += (
        self.results.discounted_return.count - 1)
    self.experiment_results.discounted_return.add(
        self.results.discounted_return.running_total)
def show(self):
    """Pretty print the history sequence, pausing between entries.

    Fix: the original used Python 2 ``print`` statements, which are a
    syntax error under Python 3 and inconsistent with the other ``show``
    implementations in this file that already call ``print()``. Output is
    unchanged (the comma-separated form and ``print(a, b)`` both emit a
    single separating space).
    """
    print("Displaying history sequence...")
    for entry in self.entry_sequence:
        print_divider("medium")
        print("id: ", entry.id)
        print("action: ", entry.action.to_string())
        print("observation: ", entry.observation.to_string())
        print("next state: ", entry.state.to_string())
        print("reward: ", entry.reward)
        time.sleep(2)  # pause for 2 seconds so each entry can be read
def show(self):
    """Dump every entry of the history sequence to stdout."""
    print_divider("medium")
    print("\tDisplaying history sequence")
    for hist_entry in self.entry_sequence:
        print_divider("medium")
        # Label/value pairs, printed in the fixed display order.
        fields = (
            ("id: ", hist_entry.id),
            ("action: ", hist_entry.action.to_string()),
            ("observation: ", hist_entry.observation.to_string()),
            ("next state: ", hist_entry.state.to_string()),
            ("reward: ", hist_entry.reward),
        )
        for label, value in fields:
            print(label, value)
def show(self, epoch):
    """Pretty print this epoch's discounted and undiscounted return stats.

    :param epoch: epoch index shown in the header
    """
    print_divider('large')
    print('\tEpoch #' + str(epoch) + ' RESULTS')
    print_divider('large')
    sections = (
        ('discounted return statistics', self.discounted_return),
        ('undiscounted return statistics', self.undiscounted_return),
    )
    for index, (label, statistic) in enumerate(sections):
        # A medium divider separates consecutive sections (none trailing).
        if index:
            print_divider('medium')
        console(2, module, label)
        print_divider('medium')
        statistic.show()
def display_step_result(step_num, step_result):
    """
    Pretty prints step result information

    :param step_num: index of the current episode step
    :param step_result: step result holding action, observation,
        next state and reward
    :return: None
    """
    print_divider("large")
    # One console line per field, in a fixed display order.
    messages = (
        "Step Number = " + str(step_num),
        "Step Result.Action = " + step_result.action.to_string(),
        "Step Result.Observation = " + step_result.observation.to_string(),
        "Step Result.Next_State = " + step_result.next_state.to_string(),
        "Step Result.Reward = " + str(step_result.reward),
    )
    for message in messages:
        console(2, module, message)
def show():
    """Pretty print the aggregated run results (returns and timing).

    Fix: the original used a Python 2 ``print`` statement
    (``print "\\tRUN RESULTS"``), which is a syntax error under Python 3
    and inconsistent with the ``print()`` calls used elsewhere in this
    file. Everything else is unchanged.
    """
    print_divider("large")
    print("\tRUN RESULTS")
    print_divider("large")
    console(2, module, "Discounted Return statistics")
    print_divider("medium")
    Results.discounted_return.show()
    print_divider("medium")
    console(2, module, "Un-discounted Return statistics")
    print_divider("medium")
    Results.undiscounted_return.show()
    print_divider("medium")
    console(2, module, "Time")
    print_divider("medium")
    Results.time.show()
    print_divider("medium")
def run(self, num_steps=None):
    """Execute one MCTS run: build a solver, then step the model forward.

    Per step it selects an action via the solver, applies it to the model,
    accumulates reward statistics, updates the solver's belief, and extends
    the history sequence; results are pretty-printed at the end.

    :param num_steps: number of steps to run; defaults to
        ``sys_cfg["num_steps"]`` from the model configuration
    """
    run_start_time = time.time()
    discount = 1.0
    if num_steps is None:
        num_steps = self.model.sys_cfg["num_steps"]

    # Reset the running total for each statistic for this run
    self.results.reset_running_totals()

    # Create a new solver
    solver = self.solver_factory(self, self.model)

    # Perform sim behaviors that must done for each run
    self.model.reset_for_run()

    console(2, module, "num of particles generated = " +
            str(solver.belief_tree.root.state_particles.__len__()))

    if solver.on_policy:
        solver.policy_iteration()

    # Monte-Carlo start state
    state = self.model.sample_an_init_state()
    console(2, module, "Initial search state: " + state.to_string())

    for i in range(num_steps):
        start_time = time.time()

        # action will be of type Discrete Action
        action = solver.select_action()
        step_result, is_legal = self.model.generate_step(state, action)

        self.results.reward.add(step_result.reward)
        self.results.undiscounted_return.running_total += step_result.reward
        self.results.discounted_return.running_total += (
            step_result.reward * discount)
        discount *= self.model.sys_cfg["discount"]
        state = step_result.next_state

        # show the step result
        display_step_result(i, step_result)

        # Only re-plan/prune the belief tree while the episode continues.
        if not step_result.is_terminal:
            solver.update(step_result)

        # Extend the history sequence
        new_hist_entry = solver.history.add_entry()
        new_hist_entry.reward = step_result.reward
        new_hist_entry.action = step_result.action
        new_hist_entry.observation = step_result.observation
        # NOTE(review): the entry is passed to its own register_entry with a
        # None second argument — looks intentional per the History API, but
        # verify the expected (entry, ?, state) signature.
        new_hist_entry.register_entry(new_hist_entry, None,
                                      step_result.next_state)

        if step_result.is_terminal:
            console(2, module, "Terminated after episode step " + str(i))
            break

        console(2, module, "MCTS step forward took " +
                str(time.time() - start_time) + " seconds")

    self.results.time.add(time.time() - run_start_time)
    self.results.discounted_return.add(
        self.results.discounted_return.running_total)
    self.results.undiscounted_return.add(
        self.results.undiscounted_return.running_total)

    # Pretty Print results
    print_divider("large")
    solver.history.show()
    self.results.show()
    console(2, module, "Max possible total Un-discounted Return: " +
            str(self.model.get_max_undiscounted_return()))
    print_divider("medium")
def run_pomcp(self, epoch, eps): epoch_start = time.time() # Create a new solver solver = self.solver_factory(self) # Monte-Carlo start state state = solver.belief_tree_index.sample_particle() reward = 0 discounted_reward = 0 discount = 1.0 for i in range(self.model.max_steps): start_time = time.time() # action will be of type Discrete Action action = solver.select_eps_greedy_action(eps, start_time) # update epsilon if eps > self.model.epsilon_minimum: eps *= self.model.epsilon_decay step_result, is_legal = self.model.generate_step(state, action) reward += step_result.reward discounted_reward += discount * step_result.reward discount *= self.model.discount state = step_result.next_state # show the step result self.display_step_result(i, step_result) if not step_result.is_terminal or not is_legal: solver.update(step_result) # Extend the history sequence new_hist_entry = solver.history.add_entry() HistoryEntry.update_history_entry(new_hist_entry, step_result.reward, step_result.action, step_result.observation, step_result.next_state) if step_result.is_terminal or not is_legal: console(3, module, 'Terminated after episode step ' + str(i + 1)) break self.results.time.add(time.time() - epoch_start) self.results.update_reward_results(reward, discounted_reward) # Pretty Print results # print_divider('large') solver.history.show() self.results.show(epoch) console( 3, module, 'Total possible undiscounted return: ' + str(self.model.get_max_undiscounted_return())) print_divider('medium') self.experiment_results.time.add(self.results.time.running_total) self.experiment_results.undiscounted_return.count += ( self.results.undiscounted_return.count - 1) self.experiment_results.undiscounted_return.add( self.results.undiscounted_return.running_total) self.experiment_results.discounted_return.count += ( self.results.discounted_return.count - 1) self.experiment_results.discounted_return.add( self.results.discounted_return.running_total) return eps
def run_pomcp(self, epoch, eps):
    """Run one POMCP epoch (RockSample-specific variant).

    Same structure as the generic ``run_pomcp``: sample a start state,
    alternate epsilon-greedy planning with model execution, decay epsilon,
    and prune/extend the belief tree and history. This variant additionally
    pushes the sampled state's rock configuration into the model and shows
    the current belief before the run and after each tree update.

    :param epoch: epoch index, used for displaying results
    :param eps: current exploration epsilon (decayed in place)
    :return: the decayed epsilon for the next epoch
    """
    epoch_start = time.time()

    # Create a new solver
    solver = self.solver_factory(self)

    # Monte-Carlo start state
    state = solver.belief_tree_index.sample_particle()

    # NOTE: rock example specific — sync the model's rock states with the
    # sampled start state.
    self.model.set_rock_states(state)

    reward = 0
    discounted_reward = 0
    discount = 1.0  # running discount factor, gamma^t

    solver.show_current_belief()

    for i in range(self.model.max_steps):
        start_time = time.time()

        # action will be of type Discrete Action
        action = solver.select_eps_greedy_action(eps, start_time)
        # print("selected action : " + str(action.bin_number))
        # raw_input("Press Enter to continue...")

        # update epsilon
        if eps > self.model.epsilon_minimum:
            eps *= self.model.epsilon_decay

        step_result, is_legal = self.model.generate_step(state, action)

        reward += step_result.reward
        discounted_reward += discount * step_result.reward
        discount *= self.model.discount
        state = step_result.next_state

        # show the step result
        self.display_step_result(i, step_result)

        if not step_result.is_terminal or not is_legal:
            # prune the tree and augment the child belief node to proceed with enough particles that match the current (a,o)
            solver.update(step_result)
            solver.show_current_belief()

        # Extend the history sequence
        new_hist_entry = solver.history.add_entry()
        HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                          step_result.action, step_result.observation, step_result.next_state)

        if step_result.is_terminal or not is_legal:
            console(3, module, 'Terminated after episode step ' + str(i + 1))
            break

    self.results.time.add(time.time() - epoch_start)
    self.results.update_reward_results(reward, discounted_reward)

    # Pretty Print results
    # print_divider('large')
    solver.history.show()
    self.results.show(epoch)
    console(3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
    print_divider('medium')

    # Fold this epoch's statistics into the experiment-wide aggregates.
    # NOTE(review): the extra `count - 1` presumably compensates for the
    # +1 contributed by the following add() — confirm against Results.
    self.experiment_results.time.add(self.results.time.running_total)
    self.experiment_results.undiscounted_return.count += (self.results.undiscounted_return.count - 1)
    self.experiment_results.undiscounted_return.add(self.results.undiscounted_return.running_total)
    self.experiment_results.discounted_return.count += (self.results.discounted_return.count - 1)
    self.experiment_results.discounted_return.add(self.results.discounted_return.running_total)

    return eps
def run_pomcp(self, epoch, eps):
    """Run one POMCP epoch: repeated plan / act / update cycles.

    Builds a solver (belief tree + UCB machinery), samples a start state,
    then for each step plans with epsilon-greedy MCTS, executes the chosen
    action on the model, and re-roots/prunes the belief tree with the
    resulting (action, observation) pair. Prints a progress line per step.

    :param epoch: epoch index, used for displaying results
    :param eps: current exploration epsilon (decayed in place)
    :return: the decayed epsilon for the next epoch
    """
    epoch_start = time.time()

    # Create a new solver, includes UCB, belief tree stuff
    solver = self.solver_factory(
        self
    )  # first build a belief tree, belief tree has many state particles

    # Monte-Carlo start state, random sample a start state
    state = solver.belief_tree_index.sample_particle()

    reward = 0
    discounted_reward = 0
    discount = 1.0

    # the process of changing the root of the belief tree; can think of it
    # as building history (build tree by choosing actions)
    for i in range(
            self.model.max_steps
    ):  # max_steps: max num of steps per trial/episode/trajectory/epoch, 200
        start_time = time.time()

        # action will be of type Discrete Action, chosen after planning.
        # This is the POMCP planner (plan): for one belief state, simulate
        # many times. NOTE(review): the original comment says eps is "tree
        # depth", but it is passed as the exploration epsilon — confirm.
        action = solver.select_eps_greedy_action(
            eps, start_time)

        ########################################################################

        # update epsilon
        if eps > self.model.epsilon_minimum:
            eps *= self.model.epsilon_decay

        # this is the execution stage (act and sense):
        # actually execute the chosen action, get next state and observation
        step_result, is_legal = self.model.generate_step(state, action)

        reward += step_result.reward
        discounted_reward += discount * step_result.reward
        discount *= self.model.discount
        state = step_result.next_state

        print("inside step loop: {}".format(i))

        # show the step result
        self.display_step_result(i, step_result)

        if not step_result.is_terminal or not is_legal:
            solver.update(
                step_result
            )  # update belief state and prune; the belief tree gets changed

        # Extend the history sequence
        new_hist_entry = solver.history.add_entry()
        HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                          step_result.action, step_result.observation, step_result.next_state)

        if step_result.is_terminal or not is_legal:
            console(3, module, 'Terminated after episode step ' + str(i + 1))
            break

    self.results.time.add(time.time() - epoch_start)
    self.results.update_reward_results(reward,
                                       discounted_reward)

    # Pretty Print results
    # print_divider('large')
    solver.history.show()
    self.results.show(epoch)
    console(3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
    print_divider('medium')

    # Fold this epoch's statistics into the experiment-wide aggregates.
    # NOTE(review): the extra `count - 1` presumably compensates for the
    # +1 contributed by the following add() — confirm against Results.
    self.experiment_results.time.add(self.results.time.running_total)
    self.experiment_results.undiscounted_return.count += (
        self.results.undiscounted_return.count - 1)
    self.experiment_results.undiscounted_return.add(
        self.results.undiscounted_return.running_total)
    self.experiment_results.discounted_return.count += (
        self.results.discounted_return.count - 1)
    self.experiment_results.discounted_return.add(
        self.results.discounted_return.running_total)

    return eps