Example #1
def dicounted_return(self):
    eps = self.model.epsilon_start
    self.model.reset_for_epoch()
    start_new_timer = True
    LOGFILE = defaultdict(list)
    for i in tqdm(range(self.model.n_epochs)):
        solver = self.solver_factory(self)
        state = solver.belief_tree_index.sample_particle()
        reward = 0
        safety_property = 0
        discounted_reward = 0
        discount = 1.0
        if start_new_timer:
            begin_time = time.time()
            start_new_timer = False
        # shuffle_obstacles(self, i)

        traj = []
        for step in range(self.model.max_steps):
            start_time = time.time()
            # state_visualization(self, state.position)
            action = solver.select_eps_greedy_action(eps, start_time)
            step_result, is_legal = self.model.generate_step(state, action)

            # print(f"reward {step_result.reward}, is_legal {is_legal}, obs {step_result.observation} ")
            #STEP update properties and reward
            safety_property += 1 if step_result.observation.is_obstacle or not is_legal else 0
            reward += step_result.reward
            discounted_reward += discount * step_result.reward
            discount *= self.model.discount
            state = step_result.next_state
            traj.append(state.position)

            # STEP model update
            # The solver update is skipped only when the step is terminal, illegal,
            # and the next state lands on an obstacle all at once.
            if not step_result.is_terminal or not is_legal or not self.model.is_obstacle(
                    state.position):
                solver.update(step_result)

            # STEP Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            # STEP termination
            if step_result.is_terminal or not is_legal:
                print('Terminated after episode step ' + str(step + 1))
                break
        # STEP- find out the illegal actions given the current state?
        elapsed_time = time.time() - begin_time
        # print(f" time {elapsed_time:3.3} state {state.position} reward {reward}, prob {safety_property/self.model.max_steps:.3} count {safety_property}")
        self.model.reset_for_epoch()
        # STEP perform model checking
        # Flatten (i, j) grid positions into single indices for the model checker.
        y_size = self.model.n_rows
        map_index = lambda pos: pos.i * y_size + pos.j
        # The verdict is reused as start_new_timer: the wall-clock timer restarts only
        # after an epoch whose trajectory satisfied the safe-reachability property.
        start_new_timer = ModelChecker.check_safe_reachability(
            map(map_index, traj),
            map(map_index, self.model.obstacle_positions),
            map(map_index, self.model.goal_positions))
        # STEP log writing
        LOGFILE["reward"].append(reward)
        LOGFILE["time"].append(elapsed_time)
        LOGFILE["sat"].append(start_new_timer)
    dtf = pd.DataFrame(LOGFILE)
    dtf.to_csv(EXP_LOG + 'map-14-4-6_3000' + '.csv')
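
The example above accumulates per-epoch reward, wall-clock time, and the model-checking verdict in a defaultdict and writes them to a CSV with pandas. A minimal sketch, assuming a CSV produced as above (the file name below is illustrative) and the column names "reward", "time", and "sat":

# Minimal sketch: summarize the per-epoch log written by Example #1.
# The file name is illustrative; the column names are taken from the example above.
import pandas as pd

def summarize_log(csv_path):
    df = pd.read_csv(csv_path)
    return {
        "mean_reward": df["reward"].mean(),
        "mean_time_sec": df["time"].mean(),
        "sat_rate": df["sat"].mean(),  # fraction of epochs with a satisfying trajectory
    }

print(summarize_log("map-14-4-6_3000.csv"))
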
Example #2
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver
        solver = self.solver_factory(self)

        # Monte-Carlo start state
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_eps_greedy_action(eps, start_time)

            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)

        return eps
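
run_pomcp decays epsilon multiplicatively once per step (stopping at epsilon_minimum) and returns the updated value so the caller can carry it into the next epoch. A minimal sketch of that schedule, with assumed illustrative values for epsilon_start, epsilon_minimum, and epsilon_decay:

# Minimal sketch of the multiplicative epsilon decay used above.
# The numeric values are assumptions for illustration only.
def decay_epsilon(eps, epsilon_minimum=0.1, epsilon_decay=0.99):
    if eps > epsilon_minimum:
        eps *= epsilon_decay
    return eps

eps = 1.0  # assumed epsilon_start
for step in range(5):
    eps = decay_epsilon(eps)
    print("step {}: eps = {:.4f}".format(step, eps))
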
Example #3
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver (UCB action selection plus the belief tree).
        # This first builds a belief tree; the belief tree holds many state particles.
        solver = self.solver_factory(self)

        # Monte-Carlo start state: randomly sample a start state.
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        # The loop below repeatedly moves the root of the belief tree; think of it as
        # building the history (growing the tree by choosing actions).
        # max_steps is the max number of steps per trial/episode/trajectory/epoch (200).
        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action, chosen after planning.
            # This is the POMCP planner (plan): for one belief state it runs many
            # simulations (500 here); eps is the epsilon-greedy exploration rate.
            action = solver.select_eps_greedy_action(eps, start_time)

            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            # This is the execution stage (act and sense): actually execute the
            # chosen action and get the next state and observation.
            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            print("inside step loop: {}".format(i))
            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                # Update the belief state and prune; the belief tree gets changed.
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)

        return eps
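
The annotations in this example describe the solver as combining UCB-style tree search with epsilon-greedy exploration. As a generic illustration only (not the solver's actual implementation), UCB1 action selection over per-action value estimates and visit counts looks like this:

# Generic UCB1 action-selection sketch; q_values, visit_counts, and the constant c
# are illustrative stand-ins, not names from the solver above.
import math

def ucb1_select(q_values, visit_counts, total_visits, c=math.sqrt(2)):
    # Pick the action index maximizing Q(a) + c * sqrt(ln(N) / n(a)).
    best_action, best_score = None, float("-inf")
    for action, (q, n) in enumerate(zip(q_values, visit_counts)):
        if n == 0:
            return action  # always try unvisited actions first
        score = q + c * math.sqrt(math.log(total_visits) / n)
        if score > best_score:
            best_action, best_score = action, score
    return best_action

print(ucb1_select([0.2, 0.5, 0.4], [3, 10, 2], total_visits=15))
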
Example #4
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver
        solver = self.solver_factory(self)

        # Monte-Carlo start state
        state = solver.belief_tree_index.sample_particle()

        # NOTE: rock example specific
        self.model.set_rock_states(state)

        reward = 0
        discounted_reward = 0
        discount = 1.0

        solver.show_current_belief()

        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_eps_greedy_action(eps, start_time)

            # print("selected action : " + str(action.bin_number))
            # raw_input("Press Enter to continue...")

            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                # prune the tree and augment the child belief node to proceed with enough particles that match the current (a,o)
                solver.update(step_result)
            
            solver.show_current_belief()

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                              step_result.action, step_result.observation, step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module, 'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(3, module, 'Total possible undiscounted return: ' + str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(self.results.discounted_return.running_total)

        return eps
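
The comment inside the step loop describes solver.update as pruning the tree and augmenting the child belief node with enough particles matching the executed action and received observation. A minimal, generic sketch of that particle-filter style update, assuming a generate_step(state, action) black box shaped like the model's (the function and its arguments are illustrative, not the solver's API):

# Generic sketch of a particle-based belief update: keep particles whose simulated
# observation matches the real one, then resample up to a target particle count.
import random

def update_belief(particles, action, real_observation, generate_step, n_target):
    matching = []
    for state in particles:
        step_result, _ = generate_step(state, action)
        if step_result.observation == real_observation:
            matching.append(step_result.next_state)
    # Augment by resampling from the matching particles until the target size is reached.
    while matching and len(matching) < n_target:
        matching.append(random.choice(matching))
    return matching
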
Example #5
def dicounted_return(self, thread_id, target):
    eps = self.model.epsilon_start
    self.model.reset_for_epoch()
    start_new_timer = True
    epochs = self.model.n_epochs
    for i in range(epochs):
        solver = self.solver_factory(self)
        state = solver.belief_tree_index.sample_particle()
        reward = 0
        safety_property = 0
        discounted_reward = 0
        discount = 1.0
        if start_new_timer:
            begin_time = time.time()
            start_new_timer = False
        traj = []
        for step in range(self.model.max_steps):
            start_time = time.time()
            action = solver.select_eps_greedy_action(eps, start_time)
            step_result, is_legal = self.model.generate_step(state, action)
            #STEP update properties and reward
            safety_property += 1 if step_result.observation.is_obstacle or not is_legal else 0
            reward += step_result.reward
            discounted_reward += discount * step_result.reward
            discount *= self.model.discount
            state = step_result.next_state
            traj.append(state.position)

            # STEP model update
            if not step_result.is_terminal or not is_legal or not self.model.is_obstacle(
                    state.position):
                solver.update(step_result)

            # STEP Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            # STEP termination
            if step_result.is_terminal or not is_legal:
                print('Terminated after episode step ' + str(step + 1))
                break
        # STEP- find out the illegal actions given the current state?
        elapsed_time = time.time() - begin_time
        # print(f" time {elapsed_time:3.3} state {state.position} reward {reward}, prob {safety_property/self.model.max_steps:.3} count {safety_property}")
        self.model.reset_for_epoch()
        # STEP perform model checking
        sat = False
        try:
            # Flatten (i, j) grid positions into single indices for the model checker.
            y_size = self.model.n_rows
            map_index = lambda pos: pos.i * y_size + pos.j
            sat = ModelChecker.check_safe_reachability(
                map(map_index, traj),
                map(map_index, self.model.obstacle_positions),
                map(map_index, self.model.goal_positions))
        except Exception:
            print("exception for sat solver")
        finally:
            # Stop once the reward target is met on a satisfying trajectory,
            # or once the overall time budget is exceeded.
            if (reward <= target and sat) or elapsed_time > TIMEOUT:
                # print(f"Target Found with {elapsed_time:.3f} sec")
                return {"time": elapsed_time, "sat": sat}