# Assumes the usual module-level imports for this file: time, collections.defaultdict,
# tqdm, pandas as pd, plus the project's HistoryEntry, ModelChecker, and EXP_LOG.
def dicounted_return(self):
    eps = self.model.epsilon_start
    self.model.reset_for_epoch()
    start_new_timer = True
    LOGFILE = defaultdict(list)

    for i in tqdm(range(self.model.n_epochs)):
        solver = self.solver_factory(self)
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        safety_property = 0
        discounted_reward = 0
        discount = 1.0

        if start_new_timer:
            begin_time = time.time()
            start_new_timer = False

        # shuffle_obstacles(self, i)
        traj = []
        for step in range(self.model.max_steps):
            start_time = time.time()
            # state_visualization(self, state.position)
            action = solver.select_eps_greedy_action(eps, start_time)
            step_result, is_legal = self.model.generate_step(state, action)
            # print(f"reward {step_result.reward}, is_legal {is_legal}, obs {step_result.observation}")

            # STEP: update safety property and reward
            safety_property += 1 if step_result.observation.is_obstacle or not is_legal else 0
            reward += step_result.reward
            discounted_reward += discount * step_result.reward
            discount *= self.model.discount
            state = step_result.next_state
            traj.append(state.position)

            # STEP: model update
            if not step_result.is_terminal or not is_legal or not self.model.is_obstacle(state.position):
                solver.update(step_result)

            # STEP: extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                              step_result.action, step_result.observation,
                                              step_result.next_state)

            # STEP: termination
            if step_result.is_terminal or not is_legal:
                print('Terminated after episode step ' + str(step + 1))
                break

        # STEP: find out the illegal actions given the current state?
        elapsed_time = time.time() - begin_time
        # print(f"time {elapsed_time:3.3} state {state.position} reward {reward}, "
        #       f"prob {safety_property / self.model.max_steps:.3} count {safety_property}")
        self.model.reset_for_epoch()

        # STEP: perform model checking on the executed trajectory; the SAT result
        # also serves as the flag that restarts the timer on the next epoch.
        y_size = self.model.n_rows
        map_index = lambda pos: pos.i * y_size + pos.j
        start_new_timer = ModelChecker.check_safe_reachability(
            map(map_index, traj),
            map(map_index, self.model.obstacle_positions),
            map(map_index, self.model.goal_positions))

        # STEP: log writing
        LOGFILE["reward"].append(reward)
        LOGFILE["time"].append(elapsed_time)
        LOGFILE["sat"].append(start_new_timer)
        dtf = pd.DataFrame(LOGFILE)
        dtf.to_csv(EXP_LOG + 'map-14-4-6_3000' + '.csv')
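The routine above appends one row per epoch to a CSV. The following is only a minimal post-processing sketch, not part of the original code: it assumes pandas and the LOGFILE column names used above ("reward", "time", "sat"); the function name and path argument are illustrative.

import pandas as pd

def summarize_log(csv_path):
    # Load the per-epoch log written by dicounted_return and report the average
    # return, the mean wall-clock time per epoch, and the fraction of epochs
    # whose trajectory satisfied the safe-reachability property.
    df = pd.read_csv(csv_path)
    return {
        "mean_reward": df["reward"].mean(),
        "mean_time": df["time"].mean(),
        "sat_rate": df["sat"].mean(),
    }

# Example (path matches the hard-coded name above):
# summarize_log(EXP_LOG + 'map-14-4-6_3000.csv')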
def run_pomcp(self, epoch, eps):
    epoch_start = time.time()

    # Create a new solver
    solver = self.solver_factory(self)

    # Monte-Carlo start state
    state = solver.belief_tree_index.sample_particle()

    reward = 0
    discounted_reward = 0
    discount = 1.0

    for i in range(self.model.max_steps):
        start_time = time.time()

        # action will be of type DiscreteAction
        action = solver.select_eps_greedy_action(eps, start_time)

        # update epsilon
        if eps > self.model.epsilon_minimum:
            eps *= self.model.epsilon_decay

        step_result, is_legal = self.model.generate_step(state, action)

        reward += step_result.reward
        discounted_reward += discount * step_result.reward
        discount *= self.model.discount
        state = step_result.next_state

        # show the step result
        self.display_step_result(i, step_result)

        if not step_result.is_terminal or not is_legal:
            solver.update(step_result)

        # Extend the history sequence
        new_hist_entry = solver.history.add_entry()
        HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                          step_result.action, step_result.observation,
                                          step_result.next_state)

        if step_result.is_terminal or not is_legal:
            console(3, module, 'Terminated after episode step ' + str(i + 1))
            break

    self.results.time.add(time.time() - epoch_start)
    self.results.update_reward_results(reward, discounted_reward)

    # Pretty print results
    # print_divider('large')
    solver.history.show()
    self.results.show(epoch)
    console(3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
    print_divider('medium')

    self.experiment_results.time.add(self.results.time.running_total)
    self.experiment_results.undiscounted_return.count += (self.results.undiscounted_return.count - 1)
    self.experiment_results.undiscounted_return.add(self.results.undiscounted_return.running_total)
    self.experiment_results.discounted_return.count += (self.results.discounted_return.count - 1)
    self.experiment_results.discounted_return.add(self.results.discounted_return.running_total)

    return eps
def run_pomcp(self, epoch, eps):
    epoch_start = time.time()

    # Create a new solver; this includes the UCB machinery and the belief tree.
    # The factory first builds a belief tree, which holds many state particles.
    solver = self.solver_factory(self)

    # Monte-Carlo start state: randomly sample a start state from the root belief
    state = solver.belief_tree_index.sample_particle()

    reward = 0
    discounted_reward = 0
    discount = 1.0

    # Each iteration advances the root of the belief tree; think of it as building
    # the history (the tree is grown by choosing actions). max_steps is the maximum
    # number of steps per trial/episode/trajectory/epoch (here 200).
    for i in range(self.model.max_steps):
        start_time = time.time()

        # Planning stage: the POMCP planner runs its simulations (here 500) from the
        # current belief state, then an action of type DiscreteAction is chosen
        # eps-greedily (eps is the exploration rate, not the tree depth).
        action = solver.select_eps_greedy_action(eps, start_time)
        ########################################################################

        # update epsilon
        if eps > self.model.epsilon_minimum:
            eps *= self.model.epsilon_decay

        # Execution stage (act and sense): actually execute the chosen action and
        # obtain the next state and observation.
        step_result, is_legal = self.model.generate_step(state, action)

        reward += step_result.reward
        discounted_reward += discount * step_result.reward
        discount *= self.model.discount
        state = step_result.next_state

        print("inside step loop: {}".format(i))

        # show the step result
        self.display_step_result(i, step_result)

        if not step_result.is_terminal or not is_legal:
            # Update the belief state and prune: the belief tree is re-rooted here.
            solver.update(step_result)

        # Extend the history sequence
        new_hist_entry = solver.history.add_entry()
        HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                          step_result.action, step_result.observation,
                                          step_result.next_state)

        if step_result.is_terminal or not is_legal:
            console(3, module, 'Terminated after episode step ' + str(i + 1))
            break

    self.results.time.add(time.time() - epoch_start)
    self.results.update_reward_results(reward, discounted_reward)

    # Pretty print results
    # print_divider('large')
    solver.history.show()
    self.results.show(epoch)
    console(3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
    print_divider('medium')

    self.experiment_results.time.add(self.results.time.running_total)
    self.experiment_results.undiscounted_return.count += (self.results.undiscounted_return.count - 1)
    self.experiment_results.undiscounted_return.add(self.results.undiscounted_return.running_total)
    self.experiment_results.discounted_return.count += (self.results.discounted_return.count - 1)
    self.experiment_results.discounted_return.add(self.results.discounted_return.running_total)

    return eps
def run_pomcp(self, epoch, eps):
    epoch_start = time.time()

    # Create a new solver
    solver = self.solver_factory(self)

    # Monte-Carlo start state
    state = solver.belief_tree_index.sample_particle()

    # NOTE: rock-example specific
    self.model.set_rock_states(state)

    reward = 0
    discounted_reward = 0
    discount = 1.0

    solver.show_current_belief()

    for i in range(self.model.max_steps):
        start_time = time.time()

        # action will be of type DiscreteAction
        action = solver.select_eps_greedy_action(eps, start_time)
        # print("selected action : " + str(action.bin_number))
        # raw_input("Press Enter to continue...")

        # update epsilon
        if eps > self.model.epsilon_minimum:
            eps *= self.model.epsilon_decay

        step_result, is_legal = self.model.generate_step(state, action)

        reward += step_result.reward
        discounted_reward += discount * step_result.reward
        discount *= self.model.discount
        state = step_result.next_state

        # show the step result
        self.display_step_result(i, step_result)

        if not step_result.is_terminal or not is_legal:
            # Prune the tree and augment the child belief node so that it has
            # enough particles matching the current (a, o) pair before proceeding.
            solver.update(step_result)
            solver.show_current_belief()

        # Extend the history sequence
        new_hist_entry = solver.history.add_entry()
        HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                          step_result.action, step_result.observation,
                                          step_result.next_state)

        if step_result.is_terminal or not is_legal:
            console(3, module, 'Terminated after episode step ' + str(i + 1))
            break

    self.results.time.add(time.time() - epoch_start)
    self.results.update_reward_results(reward, discounted_reward)

    # Pretty print results
    # print_divider('large')
    solver.history.show()
    self.results.show(epoch)
    console(3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
    print_divider('medium')

    self.experiment_results.time.add(self.results.time.running_total)
    self.experiment_results.undiscounted_return.count += (self.results.undiscounted_return.count - 1)
    self.experiment_results.undiscounted_return.add(self.results.undiscounted_return.running_total)
    self.experiment_results.discounted_return.count += (self.results.discounted_return.count - 1)
    self.experiment_results.discounted_return.add(self.results.discounted_return.running_total)

    return eps
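All three run_pomcp variants above return the decayed epsilon, so a caller is expected to thread it through successive epochs. The driver below is only a sketch of that usage; the names run_experiment and agent are illustrative and not taken from the source.

def run_experiment(agent):
    # Hypothetical driver: 'agent' is assumed to be an instance of the class that
    # defines run_pomcp. Epsilon starts at epsilon_start and is threaded through
    # the epochs so that the decay applied inside run_pomcp carries over.
    eps = agent.model.epsilon_start
    for epoch in range(agent.model.n_epochs):
        agent.model.reset_for_epoch()  # reset the simulator between episodes
        eps = agent.run_pomcp(epoch, eps)
    return eps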
# Assumes module-level TIMEOUT, ModelChecker, and HistoryEntry, as in the logging variant above.
def dicounted_return(self, thread_id, target):
    eps = self.model.epsilon_start
    self.model.reset_for_epoch()
    start_new_timer = True
    epochs = self.model.n_epochs

    for i in range(epochs):
        solver = self.solver_factory(self)
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        safety_property = 0
        discounted_reward = 0
        discount = 1.0

        if start_new_timer:
            begin_time = time.time()
            start_new_timer = False

        traj = []
        for step in range(self.model.max_steps):
            start_time = time.time()
            action = solver.select_eps_greedy_action(eps, start_time)
            step_result, is_legal = self.model.generate_step(state, action)

            # STEP: update safety property and reward
            safety_property += 1 if step_result.observation.is_obstacle or not is_legal else 0
            reward += step_result.reward
            discounted_reward += discount * step_result.reward
            discount *= self.model.discount
            state = step_result.next_state
            traj.append(state.position)

            # STEP: model update
            if not step_result.is_terminal or not is_legal or not self.model.is_obstacle(state.position):
                solver.update(step_result)

            # STEP: extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                              step_result.action, step_result.observation,
                                              step_result.next_state)

            # STEP: termination
            if step_result.is_terminal or not is_legal:
                print('Terminated after episode step ' + str(step + 1))
                break

        # STEP: find out the illegal actions given the current state?
        elapsed_time = time.time() - begin_time
        # print(f"time {elapsed_time:3.3} state {state.position} reward {reward}, "
        #       f"prob {safety_property / self.model.max_steps:.3} count {safety_property}")
        self.model.reset_for_epoch()

        # STEP: perform model checking on the executed trajectory
        sat = False
        try:
            y_size = self.model.n_rows
            map_index = lambda pos: pos.i * y_size + pos.j
            sat = ModelChecker.check_safe_reachability(
                map(map_index, traj),
                map(map_index, self.model.obstacle_positions),
                map(map_index, self.model.goal_positions))
        except Exception:
            print("exception for sat solver")
        finally:
            if (reward <= target and sat) or elapsed_time > TIMEOUT:
                # print(f"Target found after {elapsed_time:.3f} sec")
                return {"time": elapsed_time, "sat": sat}
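Given the thread_id and target parameters and the {"time", "sat"} dict returned on success or timeout, this variant appears intended to be run concurrently. The snippet below is only a sketch of one plausible way to drive it with a thread pool; make_agent and the target values are assumptions, not part of the source.

from concurrent.futures import ThreadPoolExecutor

def search_targets(make_agent, targets):
    # Hypothetical driver: make_agent() is assumed to build a fresh agent exposing
    # dicounted_return(thread_id, target); each worker searches one reward target.
    def worker(args):
        thread_id, target = args
        agent = make_agent()
        return target, agent.dicounted_return(thread_id, target)

    with ThreadPoolExecutor(max_workers=len(targets)) as pool:
        results = list(pool.map(worker, enumerate(targets)))
    return dict(results)

# Example: search_targets(make_agent, targets=[-50, -40, -30])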