def __init__(self, problem, steps):
    self.original_problem = deepcopy(problem)
    start_state, special_things = checker.problem_to_state(problem)
    self.steps = steps
    self.current_state_of_board, self.current_special_things = checker.problem_to_state(problem)
    self.eval = checker.Evaluator(0, problem, steps)
    self.act_list = ACT_LIST

    # Enumerate the state space, transition model and rewards, then build the MDP.
    all_states, trans_dict, rewards = self.compute_states
    print(all_states)
    print(rewards)
    mdp.MDP.__init__(self, init=start_state, actlist=["U", "D", "R", "L"],
                     terminals=[], transitions=trans_dict, states=all_states,
                     gamma=0.01)
    self.reward = rewards  # MDP rewards dictionary

    # Solve the MDP once up front: utilities via value iteration, then a greedy policy.
    self.U = mdp.value_iteration(self)
    self.pi = mdp.best_policy(self, self.U)
    # print(mdp.best_policy(self, self.U))
    print("end of initialization\n\n\n\n")
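# For context: the AIMA-style mdp.MDP base class used above expects the model in roughly
# the shapes sketched below (a transitions dict of (probability, next_state) lists per
# state and action, and a per-state rewards dict).  The example values are illustrative
# only and are not taken from the Pacman project.

example_states = {(0, 0), (0, 1)}
example_transitions = {  # transitions[state][action] -> [(probability, next_state), ...]
    (0, 0): {"R": [(1.0, (0, 1))], "L": [(1.0, (0, 0))]},
    (0, 1): {"R": [(1.0, (0, 1))], "L": [(1.0, (0, 0))]},
}
example_rewards = {(0, 0): 0.0, (0, 1): 1.0}  # reward[state] -> immediate reward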
def __init__(self, player, territories, start_state):
    self.State = make_State(territories)
    self.player = player
    self.territories = {t.name: t for t in territories}
    self.state = start_state
    self.turn = 0
    if not self.player:
        # Computer-controlled: build the MDP over the territories and solve it for a policy.
        init_state = self.state
        self.comp = mdp.QuestMDP(set(self.territories.values()), init_state)
        self.comp.generate_states()
        self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
    print self.state
def take_turn(self):
    if self.player:
        # Human player: list the legal actions and prompt for a choice by index.
        actions = self.actions(self.state)
        print "Actions available:"
        for i, action in enumerate(actions):
            print '%i: %s' % (i, action)
        usr_input = None
        while usr_input not in range(len(actions)):
            usr_input = input("Action: ")
        self.do_action(actions[usr_input])
        print actions[usr_input]
        self.print_state()
    else:
        # time.sleep(3)
        # Computer player: re-solve the MDP if the current state is not covered by the policy.
        if self.state not in self.policy:
            self.comp = mdp.QuestMDP(set(self.territories.values()), self.state)
            self.comp.generate_states()
            self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
        # pdb.set_trace()
        action = self.policy[self.state]
        self.do_action(action)
        print action
def choose_next_action(self, state):
    state_of_board, special_things = checker.problem_to_state(state)
    eval_state = checker.Evaluator(0, state, 1)
    if "pacman" not in special_things:
        # Pacman is no longer in the game, so the only option is to reset.
        return "reset"

    # Pacman is still in the game, so choose the best next step.
    s = self.eval_state_to_ab_state_plus_md(eval_state)
    if s in self.pi:
        new_min_md = 0
        # Check whether we need to update R based on the ghosts' locations:
        min_md = self.find_min_md_from_ghosts(eval_state)
        # If there are ghosts on the board and the nearest one is very close,
        # reward successor states that move Pacman farther away, then re-solve the MDP.
        if min_md != -100 and min_md <= 2:
            print("performing update to R")
            # Scan the four moves for a position that increases the distance to the nearest ghost.
            for action in ["U", "L", "R", "D"]:
                child_eval = deepcopy(eval_state)
                checker.Evaluator.change_state_after_action(child_eval, action)
                temp_new_md = self.find_min_md_from_ghosts(child_eval)
                if temp_new_md != -100 and temp_new_md > new_min_md:
                    new_min_md = temp_new_md
                    next_state_md = self.eval_state_to_ab_state_plus_md(child_eval)
                    self.rewards[next_state_md] = self.rewards[next_state_md] + 10 * new_min_md
                    # TODO: we might be yielding a state that didn't exist before
            self.U = mdp.value_iteration(self)
            self.pi = mdp.best_policy(self, self.U)
        return self.pi[s]
    else:
        # The current state is not covered by the policy, so fall back to a random move.
        # Maybe a simple DFS could finish the board from here instead? @meir
        a = ["U", "D", "L", "R"]
        print("random chosen")
        index = random.randint(0, 3)
        return a[index]
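# The ghost check above relies on a helper, find_min_md_from_ghosts, whose body is not
# shown here.  The sketch below only illustrates the idea it appears to implement: given
# Pacman's position and the ghost positions as (row, col) tuples, return the smallest
# Manhattan distance, with -100 as the sentinel for "no ghosts on the board".  The
# function and argument names are hypothetical, not the original implementation.

def min_manhattan_distance_to_ghosts(pacman_pos, ghost_positions):
    if not ghost_positions:
        return -100  # sentinel value, matching the check `min_md != -100` above
    pr, pc = pacman_pos
    return min(abs(pr - gr) + abs(pc - gc) for gr, gc in ghost_positions)

# Example: with Pacman at (3, 3) and ghosts at (1, 3) and (6, 6), the nearest ghost is
# 2 steps away, which is exactly the range that triggers the reward update above.
print(min_manhattan_distance_to_ghosts((3, 3), [(1, 3), (6, 6)]))  # -> 2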
def partD():
    policy = mdp.best_policy(chatbot_mdp, mdp.value_iteration(chatbot_mdp))
    print('State: choice\n-----------------------')
    for s in chatbot_mdp.states:
        print(str(s) + ': ' + str(policy[s]))
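# Every snippet above follows the same solve pattern: run value iteration to get utilities,
# then extract a greedy ("best") policy.  Below is a minimal, self-contained sketch of that
# pattern on a dict-encoded toy MDP.  All toy_* names are hypothetical and illustrative;
# they are not part of the original projects or of the mdp module used above.

toy_states = ["A", "B", "terminal"]
toy_actions = {"A": ["stay", "go"], "B": ["stay", "go"], "terminal": []}
toy_T = {  # toy_T[state][action] -> [(probability, next_state), ...]
    "A": {"stay": [(1.0, "A")], "go": [(0.8, "B"), (0.2, "A")]},
    "B": {"stay": [(1.0, "B")], "go": [(1.0, "terminal")]},
    "terminal": {},
}
toy_R = {"A": -0.1, "B": -0.1, "terminal": 1.0}  # toy_R[state] -> immediate reward
toy_gamma = 0.9

def toy_value_iteration(epsilon=1e-4):
    """Bellman backups U(s) = R(s) + gamma * max_a sum_s' P(s'|s,a) * U(s') until convergence."""
    U = {s: 0.0 for s in toy_states}
    while True:
        delta = 0.0
        for s in toy_states:
            if toy_actions[s]:
                best = max(sum(p * U[s2] for p, s2 in toy_T[s][a]) for a in toy_actions[s])
            else:
                best = 0.0  # terminal states have no successors
            new_u = toy_R[s] + toy_gamma * best
            delta = max(delta, abs(new_u - U[s]))
            U[s] = new_u
        if delta < epsilon * (1 - toy_gamma) / toy_gamma:
            return U

def toy_best_policy(U):
    """Greedy policy: in every non-terminal state, pick the action with the highest expected utility."""
    return {s: max(toy_actions[s], key=lambda a: sum(p * U[s2] for p, s2 in toy_T[s][a]))
            for s in toy_states if toy_actions[s]}

print(toy_best_policy(toy_value_iteration()))  # -> {'A': 'go', 'B': 'go'}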
def main():
    number_of_iterations = 10

    # expert_mdp = GridMDP([[-10, -5, 0, 0, 10],
    #                       [-5, -3, 0, 0, 0],
    #                       [0, 0, 0, 0, 0],
    #                       [0, 0, 0, 0, 0]],
    #                      terminals=[(4, 3)])

    # expert_mdp = GridMDP([[-10, -5, -3, -1, 0, 0, 0, 0, 0, 10],
    #                       [-8, -5, -3, 0, 0, 0, 0, 0, 0, 0],
    #                       [-5, -2, -1, 0, 0, 0, 0, 0, 0, 0],
    #                       [-3, -1, 0, 0, 0, 0, 0, 0, 0, 0],
    #                       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
    #                      terminals=[(9, 4)])

    # expert_mdp = GridMDP([[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #                       [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                       [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #                       [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                       [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]],
    #                      terminals=[(9, 4)])

    # rewards = [[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]]

    rewards = [[0, 0, 0, 0, -8, -8, 0, 0, 0, 10],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, 0, 0, -8, -8, 0, 0, 0]]

    # rewards = [[-6, -3, -1, 0, 0, 0, 0, 0, 0, 10],
    #            [-3, -3, -1, 0, 0, 0, 0, 0, 0, 0],
    #            [-1, -1, -1, 0, 0, 0, 0, -1, -1, -1],
    #            [0, 0, 0, 0, 0, 0, 0, -1, -3, -3],
    #            [0, 0, 0, 0, 0, 0, 0, -1, -3, -6]]

    # rewards = [[0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0, 10],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0]]

    # Solve the expert's MDP and use its greedy policy as the expert demonstration.
    expert_mdp = mdp.GridMDP(rewards, terminals=[(9, 4)])
    expert_trace = mdp.best_policy(expert_mdp, mdp.value_iteration(expert_mdp, 0.001))
    print "Expert rewards:"
    expert_mdp.print_rewards()
    print "Expert policy:"
    utils.print_table(expert_mdp.to_arrows(expert_trace))
    print "---------------"

    # Hide a few states from the demonstration before running BIRL.
    expert_trace.pop((0, 1))
    expert_trace.pop((0, 2))
    expert_trace.pop((0, 3))

    mybirl = birl.BIRL(expert_trace, expert_mdp.get_grid_size(), expert_mdp.terminals,
                       partial(calculate_error_sum, expert_mdp),
                       birl_iteration=2, step_size=1.0)
    run_multiple_birl(mybirl, expert_mdp, expert_trace, number_of_iterations)
# Tail of a print_table-style helper (header, numfmt, sep, justs and isnumber are defined
# earlier in the original file): format the cells, size each column, and print justified rows.
table = [header] + table
table = [[(numfmt % x if isnumber(x) else x) for x in row] for row in table]
maxlen = lambda seq: max(map(len, seq))
sizes = map(maxlen, zip(*[map(str, row) for row in table]))
for row in table:
    print sep.join(getattr(str(x), j)(size)
                   for (j, size, x) in zip(justs, sizes, row))


# A 3x3 grid world: one prize cell, one trap cell, a blocked cell (None),
# and a small negative living reward everywhere else.
prize = 1
trap = -1
neg = -0.4
mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg, neg, neg]],
               terminals=[(1, 2), (2, 2)], error=.8)

print "GRID"
print
print "Value iteration"
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))

print "Policy iteration"
print_table(mdp1.to_arrows(policy_iteration(mdp1)))

print "Q Learning"
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
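# The qlearn and best_policyQ helpers used above are project-specific and not shown here.
# The sketch below illustrates the underlying technique only: tabular Q-learning with an
# epsilon-greedy behaviour policy, Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)),
# run against a hypothetical environment object exposing actions(s) and step(s, a); it is
# not the original qlearn implementation.

import random
from collections import defaultdict

def tabular_q_learning(env, start, episodes=5000, alpha=0.1, gamma=0.9, epsilon=0.1):
    Q = defaultdict(float)  # Q[(state, action)] -> estimated value
    for _ in range(episodes):
        s = start
        while env.actions(s):  # run the episode until a terminal state (no actions left)
            acts = env.actions(s)
            if random.random() < epsilon:
                a = random.choice(acts)                   # explore
            else:
                a = max(acts, key=lambda a_: Q[(s, a_)])  # exploit
            s2, r = env.step(s, a)  # hypothetical one-step simulator: next state and reward
            next_acts = env.actions(s2)
            best_next = max(Q[(s2, a_)] for a_ in next_acts) if next_acts else 0.0
            Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])
            s = s2
    # Greedy policy extracted from the learned Q-table.
    return {s: max(env.actions(s), key=lambda a_: Q[(s, a_)])
            for (s, _) in Q if env.actions(s)}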