def __init__(self, problem, steps):
    self.original_problem = deepcopy(problem)
    start_state, special_things = checker.problem_to_state(problem)
    self.steps = steps
    self.current_state_of_board, self.current_special_things = checker.problem_to_state(problem)
    self.eval = checker.Evaluator(0, problem, steps)
    self.act_list = ACT_LIST

    # Enumerate the state space, transition model and rewards, then build the MDP.
    all_states, trans_dict, rewards = self.compute_states
    print(all_states)
    print(rewards)
    mdp.MDP.__init__(self, init=start_state, actlist=["U", "D", "R", "L"],
                     terminals=[], transitions=trans_dict, states=all_states,
                     gamma=0.01)
    self.reward = rewards  # MDP rewards dictionary

    # Solve the MDP once up front: utilities via value iteration, then a greedy policy.
    self.U = mdp.value_iteration(self)
    self.pi = mdp.best_policy(self, self.U)
    # print(mdp.best_policy(self, self.U))
    print("end of initialization\n\n\n\n")
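# For context: the AIMA-style mdp.MDP base class used above expects the model in roughly
# the shapes sketched below (a transitions dict of (probability, next_state) lists per
# state and action, and a per-state rewards dict).  The example values are illustrative
# only and are not taken from the Pacman project.

example_states = {(0, 0), (0, 1)}
example_transitions = {  # transitions[state][action] -> [(probability, next_state), ...]
    (0, 0): {"R": [(1.0, (0, 1))], "L": [(1.0, (0, 0))]},
    (0, 1): {"R": [(1.0, (0, 1))], "L": [(1.0, (0, 0))]},
}
example_rewards = {(0, 0): 0.0, (0, 1): 1.0}  # reward[state] -> immediate reward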
def __init__(self, player, territories, start_state):
    self.State = make_State(territories)
    self.player = player
    self.territories = {t.name: t for t in territories}
    self.state = start_state
    self.turn = 0
    if not self.player:
        # Computer-controlled: build the MDP over the territories and solve it for a policy.
        init_state = self.state
        self.comp = mdp.QuestMDP(set(self.territories.values()), init_state)
        self.comp.generate_states()
        self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
    print self.state
def take_turn(self):
    if self.player:
        # Human player: list the legal actions and prompt for a choice by index.
        actions = self.actions(self.state)
        print "Actions available:"
        for i, action in enumerate(actions):
            print '%i: %s' % (i, action)
        usr_input = None
        while usr_input not in range(len(actions)):
            usr_input = input("Action: ")
        self.do_action(actions[usr_input])
        print actions[usr_input]
        self.print_state()
    else:
        # time.sleep(3)
        # Computer player: re-solve the MDP if the current state is not covered by the policy.
        if self.state not in self.policy:
            self.comp = mdp.QuestMDP(set(self.territories.values()), self.state)
            self.comp.generate_states()
            self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
        # pdb.set_trace()
        action = self.policy[self.state]
        self.do_action(action)
        print action
def choose_next_action(self, state):
    state_of_board, special_things = checker.problem_to_state(state)
    eval_state = checker.Evaluator(0, state, 1)
    if "pacman" not in special_things:
        # Pacman is no longer in the game, so the only option is to reset.
        return "reset"

    # Pacman is still in the game, so choose the best next step.
    s = self.eval_state_to_ab_state_plus_md(eval_state)
    if s in self.pi:
        new_min_md = 0
        # Check whether we need to update R based on the ghosts' locations:
        min_md = self.find_min_md_from_ghosts(eval_state)
        # If there are ghosts on the board and the nearest one is very close,
        # reward successor states that move Pacman farther away, then re-solve the MDP.
        if min_md != -100 and min_md <= 2:
            print("performing update to R")
            # Scan the four moves for a position that increases the distance to the nearest ghost.
            for action in ["U", "L", "R", "D"]:
                child_eval = deepcopy(eval_state)
                checker.Evaluator.change_state_after_action(child_eval, action)
                temp_new_md = self.find_min_md_from_ghosts(child_eval)
                if temp_new_md != -100 and temp_new_md > new_min_md:
                    new_min_md = temp_new_md
                    next_state_md = self.eval_state_to_ab_state_plus_md(child_eval)
                    self.rewards[next_state_md] = self.rewards[next_state_md] + 10 * new_min_md
                    # TODO: we might be yielding a state that didn't exist before
            self.U = mdp.value_iteration(self)
            self.pi = mdp.best_policy(self, self.U)
        return self.pi[s]
    else:
        # The current state is not covered by the policy, so fall back to a random move.
        # Maybe a simple DFS could finish the board from here instead? @meir
        a = ["U", "D", "L", "R"]
        print("random chosen")
        index = random.randint(0, 3)
        return a[index]
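# The ghost check above relies on a helper, find_min_md_from_ghosts, whose body is not
# shown here.  The sketch below only illustrates the idea it appears to implement: given
# Pacman's position and the ghost positions as (row, col) tuples, return the smallest
# Manhattan distance, with -100 as the sentinel for "no ghosts on the board".  The
# function and argument names are hypothetical, not the original implementation.

def min_manhattan_distance_to_ghosts(pacman_pos, ghost_positions):
    if not ghost_positions:
        return -100  # sentinel value, matching the check `min_md != -100` above
    pr, pc = pacman_pos
    return min(abs(pr - gr) + abs(pc - gc) for gr, gc in ghost_positions)

# Example: with Pacman at (3, 3) and ghosts at (1, 3) and (6, 6), the nearest ghost is
# 2 steps away, which is exactly the range that triggers the reward update above.
print(min_manhattan_distance_to_ghosts((3, 3), [(1, 3), (6, 6)]))  # -> 2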
def partD():
    policy = mdp.best_policy(chatbot_mdp, mdp.value_iteration(chatbot_mdp))
    print('State: choice\n-----------------------')
    for s in chatbot_mdp.states:
        print(str(s) + ': ' + str(policy[s]))
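# Every snippet above follows the same solve pattern: run value iteration to get utilities,
# then extract a greedy ("best") policy.  Below is a minimal, self-contained sketch of that
# pattern on a dict-encoded toy MDP.  All toy_* names are hypothetical and illustrative;
# they are not part of the original projects or of the mdp module used above.

toy_states = ["A", "B", "terminal"]
toy_actions = {"A": ["stay", "go"], "B": ["stay", "go"], "terminal": []}
toy_T = {  # toy_T[state][action] -> [(probability, next_state), ...]
    "A": {"stay": [(1.0, "A")], "go": [(0.8, "B"), (0.2, "A")]},
    "B": {"stay": [(1.0, "B")], "go": [(1.0, "terminal")]},
    "terminal": {},
}
toy_R = {"A": -0.1, "B": -0.1, "terminal": 1.0}  # toy_R[state] -> immediate reward
toy_gamma = 0.9

def toy_value_iteration(epsilon=1e-4):
    """Bellman backups U(s) = R(s) + gamma * max_a sum_s' P(s'|s,a) * U(s') until convergence."""
    U = {s: 0.0 for s in toy_states}
    while True:
        delta = 0.0
        for s in toy_states:
            if toy_actions[s]:
                best = max(sum(p * U[s2] for p, s2 in toy_T[s][a]) for a in toy_actions[s])
            else:
                best = 0.0  # terminal states have no successors
            new_u = toy_R[s] + toy_gamma * best
            delta = max(delta, abs(new_u - U[s]))
            U[s] = new_u
        if delta < epsilon * (1 - toy_gamma) / toy_gamma:
            return U

def toy_best_policy(U):
    """Greedy policy: in every non-terminal state, pick the action with the highest expected utility."""
    return {s: max(toy_actions[s], key=lambda a: sum(p * U[s2] for p, s2 in toy_T[s][a]))
            for s in toy_states if toy_actions[s]}

print(toy_best_policy(toy_value_iteration()))  # -> {'A': 'go', 'B': 'go'}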
def main():
    number_of_iterations = 10

    # expert_mdp = GridMDP([[-10, -5, 0, 0, 10],
    #                       [-5, -3, 0, 0, 0],
    #                       [0, 0, 0, 0, 0],
    #                       [0, 0, 0, 0, 0]],
    #                      terminals=[(4, 3)])

    # expert_mdp = GridMDP([[-10, -5, -3, -1, 0, 0, 0, 0, 0, 10],
    #                       [-8, -5, -3, 0, 0, 0, 0, 0, 0, 0],
    #                       [-5, -2, -1, 0, 0, 0, 0, 0, 0, 0],
    #                       [-3, -1, 0, 0, 0, 0, 0, 0, 0, 0],
    #                       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
    #                      terminals=[(9, 4)])

    # expert_mdp = GridMDP([[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #                       [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                       [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #                       [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                       [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]],
    #                      terminals=[(9, 4)])

    # rewards = [[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]]

    rewards = [[0, 0, 0, 0, -8, -8, 0, 0, 0, 10],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, 0, 0, -8, -8, 0, 0, 0]]

    # rewards = [[-6, -3, -1, 0, 0, 0, 0, 0, 0, 10],
    #            [-3, -3, -1, 0, 0, 0, 0, 0, 0, 0],
    #            [-1, -1, -1, 0, 0, 0, 0, -1, -1, -1],
    #            [0, 0, 0, 0, 0, 0, 0, -1, -3, -3],
    #            [0, 0, 0, 0, 0, 0, 0, -1, -3, -6]]

    # rewards = [[0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0, 10],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0]]

    # Solve the expert's MDP and use its greedy policy as the expert demonstration.
    expert_mdp = mdp.GridMDP(rewards, terminals=[(9, 4)])
    expert_trace = mdp.best_policy(expert_mdp, mdp.value_iteration(expert_mdp, 0.001))
    print "Expert rewards:"
    expert_mdp.print_rewards()
    print "Expert policy:"
    utils.print_table(expert_mdp.to_arrows(expert_trace))
    print "---------------"

    # Hide a few states from the demonstration before running BIRL.
    expert_trace.pop((0, 1))
    expert_trace.pop((0, 2))
    expert_trace.pop((0, 3))

    mybirl = birl.BIRL(expert_trace, expert_mdp.get_grid_size(), expert_mdp.terminals,
                       partial(calculate_error_sum, expert_mdp),
                       birl_iteration=2, step_size=1.0)
    run_multiple_birl(mybirl, expert_mdp, expert_trace, number_of_iterations)
# Tail of a print_table-style helper (header, numfmt, sep, justs and isnumber are defined
# earlier in the original file): format the cells, size each column, and print justified rows.
table = [header] + table
table = [[(numfmt % x if isnumber(x) else x) for x in row] for row in table]
maxlen = lambda seq: max(map(len, seq))
sizes = map(maxlen, zip(*[map(str, row) for row in table]))
for row in table:
    print sep.join(getattr(str(x), j)(size)
                   for (j, size, x) in zip(justs, sizes, row))


# A 3x3 grid world: one prize cell, one trap cell, a blocked cell (None),
# and a small negative living reward everywhere else.
prize = 1
trap = -1
neg = -0.4
mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg, neg, neg]],
               terminals=[(1, 2), (2, 2)], error=.8)

print "GRID"
print
print "Value iteration"
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))

print "Policy iteration"
print_table(mdp1.to_arrows(policy_iteration(mdp1)))

print "Q Learning"
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
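# The qlearn and best_policyQ helpers used above are project-specific and not shown here.
# The sketch below illustrates the underlying technique only: tabular Q-learning with an
# epsilon-greedy behaviour policy, Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)),
# run against a hypothetical environment object exposing actions(s) and step(s, a); it is
# not the original qlearn implementation.

import random
from collections import defaultdict

def tabular_q_learning(env, start, episodes=5000, alpha=0.1, gamma=0.9, epsilon=0.1):
    Q = defaultdict(float)  # Q[(state, action)] -> estimated value
    for _ in range(episodes):
        s = start
        while env.actions(s):  # run the episode until a terminal state (no actions left)
            acts = env.actions(s)
            if random.random() < epsilon:
                a = random.choice(acts)                   # explore
            else:
                a = max(acts, key=lambda a_: Q[(s, a_)])  # exploit
            s2, r = env.step(s, a)  # hypothetical one-step simulator: next state and reward
            next_acts = env.actions(s2)
            best_next = max(Q[(s2, a_)] for a_ in next_acts) if next_acts else 0.0
            Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])
            s = s2
    # Greedy policy extracted from the learned Q-table.
    return {s: max(env.actions(s), key=lambda a_: Q[(s, a_)])
            for (s, _) in Q if env.actions(s)}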