def demoQLearningAgent():
    print '--------------------'
    print 'DEMO QLearningAgent'
    print '--------------------'
    # Setup values
    policy = {(0, 1): (0, 1), (1, 2): (1, 0), (3, 2): None, (0, 0): (0, 1),
              (3, 0): (-1, 0), (3, 1): None, (2, 1): (0, 1), (2, 0): (0, 1),
              (2, 2): (1, 0), (1, 0): (1, 0), (0, 2): (1, 0)}
    time_start = time()
    trials = 100
    agent = QLearningAgent(Fig[17, 1])
    for i in range(0, trials):
        execute_trial(agent, Fig[17, 1])
    time_end = time()
    print 'Executed %i trials' % trials
    print 'Took %d seconds' % (time_end - time_start)
    print 'Utilities: %s' % {s: max(agent.Q[s].values()) for s in agent.Q}
    print '\nCorrect Utilities (estimated by value iteration):'
    print value_iteration(Fig[17, 1])
def main():
    board = Board()
    board.print()
    robot = Robot(board)
    value_iteration(board, robot)
    determine_policy(board)
def demoPassiveTDAgent():
    print '--------------------'
    print 'DEMO PassiveTDAgent'
    print '--------------------'
    # Setup values
    policy = {(0, 1): (0, 1), (1, 2): (1, 0), (3, 2): None, (0, 0): (0, 1),
              (3, 0): (-1, 0), (3, 1): None, (2, 1): (0, 1), (2, 0): (0, 1),
              (2, 2): (1, 0), (1, 0): (1, 0), (0, 2): (1, 0)}
    time_start = time()
    trials = 100
    agent = PassiveTDAgent(Fig[17, 1], policy)
    for i in range(0, trials):
        execute_trial(agent, Fig[17, 1])
    time_end = time()
    print 'Executed %i trials' % trials
    print 'Took %d seconds' % (time_end - time_start)
    print 'Utilities: %s' % agent.U
    print '\nCorrect Utilities (estimated by value iteration):'
    print value_iteration(Fig[17, 1])
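# Note: the two demos above rely on an `execute_trial` helper that is not shown on
# this page. The sketch below is an assumption of what such a helper might look like
# for AIMA-style agents and MDPs (callable agent, (state, reward) percepts); it is
# not the original implementation.
import random

def execute_trial(agent, mdp):
    """Run one trial: feed (state, reward) percepts to the agent until the
    agent signals the end of the trial by returning None."""
    def sample_next_state(mdp, s, a):
        # Sample a successor according to the transition model T(s, a),
        # assumed to return a list of (probability, next_state) pairs.
        x = random.uniform(0, 1)
        cumulative = 0.0
        for probability, state in mdp.T(s, a):
            cumulative += probability
            if x < cumulative:
                break
        return state

    current_state = mdp.init
    while True:
        percept = (current_state, mdp.R(current_state))
        action = agent(percept)   # assumes the agent object is callable
        if action is None:        # terminal state reached
            break
        current_state = sample_next_state(mdp, current_state, action)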
def main():
    g = Grid(4, 4)
    terminals = [{"x": 3, "y": 0, "reward": 1},
                 {"x": 1, "y": 3, "reward": 1},
                 {"x": 2, "y": 3, "reward": -10},
                 {"x": 3, "y": 3, "reward": 10}]
    blocks = [{"x": 1, "y": 1}]
    g.init_world(terminals, blocks)
    np.random.seed(62)
    mdp.value_iteration(g, -0.02, 0.8, 0.8)
    mdp.policy_iteration(g, -0.02, 0.8, 0.8)
    mdp.q_function(g, "s6", -0.02, 0.8, 0.1, 0.1, 0.8, 1000000)
def policy(mazey, terminal=True):
    # Call value iteration here
    value_iteration(mazey, terminal)
    for i in range(mazey.size):
        for j in range(mazey.size):
            policy = max_of_neighbors(mazey, i, j)
            if terminal and mazey.grid[i][j].is_terminal():
                policy = 't'
            if mazey.grid[i][j].is_wall():
                policy = 'w'
            mazey.grid[i][j].policy = policy
def __init__(self, problem, steps):
    self.original_problem = deepcopy(problem)
    start_state, special_things = checker.problem_to_state(problem)
    self.steps = steps
    self.current_state_of_board, self.current_special_things = checker.problem_to_state(problem)
    self.eval = checker.Evaluator(0, problem, steps)
    self.act_list = ACT_LIST
    all_states, trans_dict, rewards = self.compute_states
    print(all_states)
    print(rewards)
    mdp.MDP.__init__(self, init=start_state, actlist=["U", "D", "R", "L"],
                     terminals=[], transitions=trans_dict,
                     states=all_states, gamma=0.01)
    self.reward = rewards  # MDP rewards dictionary
    self.U = mdp.value_iteration(self)
    self.pi = mdp.best_policy(self, self.U)
    # print(mdp.best_policy(self, self.U))
    print("end of initialization\n\n\n\n")
    return
def solve(self, episodes=200, iterations=200, reset=True, seed=False, gamma=0.95):
    mdp = EnvMDP(self.env, gamma=gamma)
    self.policy = policy_iteration(mdp)
    self.U = value_iteration(mdp, epsilon=0.000000000001)
def __init__(self, player, territories, start_state):
    self.State = make_State(territories)
    self.player = player
    self.territories = {t.name: t for t in territories}
    self.state = start_state
    self.turn = 0
    if not self.player:
        init_state = self.state
        self.comp = mdp.QuestMDP(set(self.territories.values()), init_state)
        self.comp.generate_states()
        self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
    print self.state
def _value_iteration_slow(self):
    old_values = dict(self.mdp.values)
    for i in range(100):
        values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
        policy = policy_extraction(values, self.mdp)
        self.gridworldwindow.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policies(policy)
        self.gridworldwindow.window.update()
        time.sleep(0.25)
        self.gridworldwindow.window.update()
        new_values = dict(values)
        if values_converged(new_values, old_values):
            break
        old_values = new_values
    self.gridworldwindow.show_dialog('Value Iteration has converged in {} steps!'.format(i + 1))
def take_turn(self):
    if self.player:
        actions = self.actions(self.state)
        print "Actions available:"
        for i, action in enumerate(actions):
            print '%i: %s' % (i, action)
        usr_input = None
        while usr_input not in range(len(actions)):
            usr_input = input("Action: ")
        self.do_action(actions[usr_input])
        print action
        self.print_state()
    else:
        # time.sleep(3)
        if self.state not in self.policy:
            self.comp = mdp.QuestMDP(set(self.territories.values()), self.state)
            self.comp.generate_states()
            self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
        # pdb.set_trace()
        action = self.policy[self.state]
        self.do_action(action)
        print action
def choose_next_action(self, state):
    state_of_board, special_things = checker.problem_to_state(state)
    eval_state = checker.Evaluator(0, state, 1)
    if "pacman" not in special_things:  # check if Pacman is still in the game
        return "reset"
    # If Pacman is still in the game, choose the best next step.
    s = self.eval_state_to_ab_state_plus_md(eval_state)
    if s in self.pi:
        new_min_md = 0
        # Check if we need to update R based on ghost locations.
        min_md = self.find_min_md_from_ghosts(eval_state)
        # Check whether there are any ghosts on the board and whether they are very close.
        if min_md != -100 and min_md <= 2:
            print("performing update to R")
            # Start scanning for a better position.
            for action in ["U", "L", "R", "D"]:
                child_eval = deepcopy(eval_state)
                checker.Evaluator.change_state_after_action(child_eval, action)
                temp_new_md = self.find_min_md_from_ghosts(child_eval)
                if temp_new_md != -100 and temp_new_md > new_min_md:
                    new_min_md = temp_new_md
                    next_state_md = self.eval_state_to_ab_state_plus_md(child_eval)
                    self.rewards[next_state_md] = self.rewards[next_state_md] + 10 * new_min_md
                    # TODO: we might be yielding a state that didn't exist before
            self.U = mdp.value_iteration(self)
            self.pi = mdp.best_policy(self, self.U)
        return self.pi[s]
    else:
        a = ["U", "D", "L", "R"]
        print("random chosen")
        # Maybe we should run a simple DFS here to find the rest of the route to finish the board? @meir
        index = random.randint(0, 3)
        return a[index]
# Tail of a print_table-style helper (header, numfmt, sep and justs are defined
# earlier in the original source), followed by a small GridMDP demo.
table = [header] + table
table = [[(numfmt % x if isnumber(x) else x) for x in row] for row in table]
maxlen = lambda seq: max(map(len, seq))
sizes = map(maxlen, zip(*[map(str, row) for row in table]))
for row in table:
    print sep.join(getattr(str(x), j)(size) for (j, size, x) in zip(justs, sizes, row))

prize = 1
trap = -1
neg = -0.4
mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg, neg, neg]],
               terminals=[(1, 2), (2, 2)], error=.8)
print "GRID"
print
print "Value iteration"
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))
print "Policy iteration"
print_table(mdp1.to_arrows(policy_iteration(mdp1)))
print "Q Learning"
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
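# Note: `qlearn` and `best_policyQ` above are project-specific helpers that are not
# shown here. Assuming the learned Q-table is a dict mapping (state, action) pairs to
# estimated values (an assumption, not the actual data layout), a greedy policy can be
# extracted roughly like this:
def greedy_policy_from_q(Q):
    """For each state, pick the action with the highest estimated Q-value."""
    best = {}
    for (state, action), value in Q.items():
        if state not in best or value > best[state][1]:
            best[state] = (action, value)
    return {state: action for state, (action, _) in best.items()}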
def partD():
    policy = mdp.best_policy(chatbot_mdp, mdp.value_iteration(chatbot_mdp))
    print('State: choice\n-----------------------')
    for s in chatbot_mdp.states:
        print(str(s) + ': ' + str(policy[s]))
for mdp_grid, term_grid in unique_mdps:
    print("--" * 10)
    state_features = mdp_grid
    terminals = mdp_gen.get_terminals_from_grid(term_grid)
    # print("state features\n", state_features)
    state_features = mdp_gen.categorical_to_one_hot_features(state_features, num_features)
    print('one hot features', state_features)
    world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
    mdp_family.append(world)

# Plot for visualization
all_opts = []
all_features = []
for i, mdp_env in enumerate(mdp_family):
    V = mdp.value_iteration(mdp_env, epsilon=precision)
    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision)
    opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=precision)
    print(opt_policy)
    print(mdp_env.features)
    all_opts.append(opt_policy)
    all_features.append(mdp_env.features)
    # input()

filename = "./data_analysis/figs/twoXtwo/firstthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3], all_features[:3], 1, 3, filename=filename)
filename = "./data_analysis/figs/twoXtwo/lastthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:], all_features[-3:], 1, 3, filename=filename)
# plt.show()

family_teacher = machine_teaching.MdpFamilyTeacher(mdp_family, precision, debug)
mdp_set_cover = family_teacher.get_machine_teaching_mdps()
def mdpProblem(conversationLength):
    if conversationLength == 'short':
        t = {"D1": {"Respond-Resolved": [(0.30, "U1")],
                    "Respond-notResolved": [(0.70, "D2")],
                    "Redirect-Frustrated": [(0.20, "U2")],
                    "Redirect-notFrustrated": [(0.80, "U3")]},
             "D2": {"Respond-Resolved": [(0.30, "U5")],
                    "Respond-notResolved": [(0.70, "U4")],
                    "Redirect-Frustrated": [(0.20, "U6")],
                    "Redirect-notFrustrated": [(0.80, "U7")]}}
    elif conversationLength == 'medium':
        t = {"D1": {"Respond-Resolved": [(0.50, "U1")],
                    "Respond-notResolved": [(0.50, "D2")],
                    "Redirect-Frustrated": [(0.30, "U2")],
                    "Redirect-notFrustrated": [(0.70, "U3")]},
             "D2": {"Respond-Resolved": [(0.50, "U5")],
                    "Respond-notResolved": [(0.50, "U4")],
                    "Redirect-Frustrated": [(0.30, "U6")],
                    "Redirect-notFrustrated": [(0.70, "U7")]}}
    elif conversationLength == 'long':
        t = {"D1": {"Respond-Resolved": [(0.70, "U1")],
                    "Respond-notResolved": [(0.30, "D2")],
                    "Redirect-Frustrated": [(0.60, "U2")],
                    "Redirect-notFrustrated": [(0.40, "U3")]},
             "D2": {"Respond-Resolved": [(0.70, "U5")],
                    "Respond-notResolved": [(0.30, "U4")],
                    "Redirect-Frustrated": [(0.60, "U6")],
                    "Redirect-notFrustrated": [(0.40, "U7")]}}
    init = "D1"
    terminals = ["U1", "U2", "U3", "U4", "U5", "U6", "U7"]
    rewards = {"U1": 5, "U2": -1, "U3": 5, "U4": -3, "U5": 5, "U6": -1, "U7": 5,
               "D2": 0, "D1": 0}
    markov = createMDP(init, terminals, t, rewards, gamma=.9)
    solution = mdp.value_iteration(markov)
    print(solution)
def main():
    number_of_iterations = 10
    # expert_mdp = GridMDP([[-10, -5, 0, 0, 10],
    #                       [-5, -3, 0, 0, 0],
    #                       [0, 0, 0, 0, 0],
    #                       [0, 0, 0, 0, 0]],
    #                      terminals=[(4, 3)])
    # expert_mdp = GridMDP([[-10, -5, -3, -1, 0, 0, 0, 0, 0, 10],
    #                       [-8, -5, -3, 0, 0, 0, 0, 0, 0, 0],
    #                       [-5, -2, -1, 0, 0, 0, 0, 0, 0, 0],
    #                       [-3, -1, 0, 0, 0, 0, 0, 0, 0, 0],
    #                       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
    #                      terminals=[(9, 4)])
    # expert_mdp = GridMDP([[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #                       [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                       [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #                       [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                       [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]],
    #                      terminals=[(9, 4)])
    # rewards = [[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]]
    rewards = [[0, 0, 0, 0, -8, -8, 0, 0, 0, 10],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, 0, 0, -8, -8, 0, 0, 0]]
    # rewards = [[-6, -3, -1, 0, 0, 0, 0, 0, 0, 10],
    #            [-3, -3, -1, 0, 0, 0, 0, 0, 0, 0],
    #            [-1, -1, -1, 0, 0, 0, 0, -1, -1, -1],
    #            [0, 0, 0, 0, 0, 0, 0, -1, -3, -3],
    #            [0, 0, 0, 0, 0, 0, 0, -1, -3, -6]]
    # rewards = [[0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0, 10],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0]]

    expert_mdp = mdp.GridMDP(rewards, terminals=[(9, 4)])
    expert_trace = mdp.best_policy(expert_mdp, mdp.value_iteration(expert_mdp, 0.001))
    print "Expert rewards:"
    expert_mdp.print_rewards()
    print "Expert policy:"
    utils.print_table(expert_mdp.to_arrows(expert_trace))
    print "---------------"
    expert_trace.pop((0, 1))
    expert_trace.pop((0, 2))
    expert_trace.pop((0, 3))
    mybirl = birl.BIRL(expert_trace, expert_mdp.get_grid_size(), expert_mdp.terminals,
                       partial(calculate_error_sum, expert_mdp),
                       birl_iteration=2, step_size=1.0)
    run_multiple_birl(mybirl, expert_mdp, expert_trace, number_of_iterations)
import mdp
from sputil import *

if __name__ == "__main__":
    from model2 import *

    p = map(obs2prob, [s, v, r, g, i, o])
    A = make_actions(p[0], p[1], p[2], p[3], p[4], p[5])
    R = make_reward(pragmatic_reward)
    # print A
    V, P, ok = mdp.value_iteration(A, R)
    D = {0: [], 1: []}
    for (k, v) in P.items():
        D[v].append(k)
    for k in D.keys():
        print A[k].name
        for x in D[k]:
            print x
            # (s, v, r, g, i, o) = x
            # if r == 0:
            #     print x
def _value_iteration_100_steps(self):
    values = value_iteration(self.mdp.values, self.mdp, num_iter=100)
    policy = policy_extraction(values, self.mdp)
    self.gridworld.update_grid(values, policy)
    self.mdp.update_values(values)
    self.mdp.update_policy(policy)
ax2.plot(S, dp_solution_q)
ax2.legend(('SARSA-LAMBDA', 'Value Iteration'))
plt.xlabel('State Index')
plt.ylabel('Optimal Value Function V*')
plt.title('Comparison of SARSA-LAMBDA and value iteration for gridworld ')
plt.show()
'''

# --------------------------------
# Q-LAMBDA
lambd = 0.9
alpha = 10**(-2)
n_episodes = 30000
eps = 0.1
[Q, E] = q_lambda(gamma, lambd, alpha, eps, n_episodes, S, A, sampler)
best_action_2, dp_solution_q = mdp.value_iteration(S, A, P, R, gamma, pi)
# print(Q)
print(dp_solution_q)

# best_action_extract
best_actions, best_valfn = get_actions(Q, P)
print(best_valfn)

fig2, ax2 = plt.subplots()
ax2.plot(S, best_valfn, marker='x')
ax2.plot(S, dp_solution_q)
ax2.legend(('Q-LAMBDA', 'Value Iteration'))
plt.xlabel('State Index')
plt.ylabel('Optimal Value Function V*')
plt.title('Comparison of Q-LAMBDA and value iteration for gridworld ')
plt.show()
if key2 == 0:
    key3 = key1 - 4
elif key2 == 1:
    key3 = key1 + 1
elif key2 == 2:
    key3 = key1 + 4
elif key2 == 3:
    key3 = key1 - 1
else:
    key3 = key1
P_dict[key1][key2][key3] = 1

# Creates the relevant matrices for the gridworld
[S, A, P, R, gamma, pi] = mdp.create_MDP(S, A, P_dict, R_dict, gamma, pi_dict)

# Evaluates the initial random policy (pg 12 of the DP lecture)
vi = mdp.evaluate_policy(S, A, P, R, gamma, pi)
print(vi)  # Printed values differ slightly from the lecture numbers due to round-off

# Policy iteration to find the best policy; it finds only one best action per state,
# in decreasing priority of the action vector
best_action, vk = mdp.policy_iteration(S, A, P, R, gamma, pi)
print(best_action)

best_action_2, vk = mdp.value_iteration(S, A, P, R, gamma, pi)
print(best_action_2)
print("seed", seed)
np.random.seed(seed)
random.seed(seed)

# First, let's generate a random MDP
state_features = eutils.create_random_features_row_col_m(num_rows, num_cols, num_features)
# print("state features\n", state_features)
true_weights = random_weights(num_features)
print("true weights: ", true_weights)
true_world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
print("rewards")
true_world.print_rewards()
print("value function")
V = mdp.value_iteration(true_world)
true_world.print_map(V)
print("mdp features")
utils.display_onehot_state_features(true_world)

# Find the optimal policy under this MDP
Qopt = mdp.compute_q_values(true_world, V=V)
opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt)
print("optimal policy")
true_world.print_map(true_world.to_arrows(opt_policy))
# input()

# Now find a bunch of other optimal policies for the same MDP but with different weight vectors.
# TODO: I wonder if there is a better way to create these eval policies?
# Can we efficiently solve for all of them, or should they all be close
# (e.g. rewards sampled from a Gaussian centered on the true reward)?
world = copy.deepcopy(true_world)
eval_policies = []
eval_Qvalues = []
def _value_iteration_1_step(self):
    values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
    policy = policy_extraction(values, self.mdp)
    self.gridworldwindow.update_grid(values, policy)
    self.mdp.update_values(values)
    self.mdp.update_policies(policy)
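# Note: several snippets above pair value_iteration with a separate policy-extraction
# step. For reference, here is a minimal, self-contained sketch of that pattern over an
# explicit transition model; the dictionary-based interface is an assumption and does
# not match any particular library used on this page.
def value_iteration_sketch(states, actions, T, R, gamma=0.95, epsilon=1e-6):
    """Bellman backups: T[s][a] is a list of (probability, next_state) pairs,
    R[s] is the immediate reward received in state s."""
    U = {s: 0.0 for s in states}
    while True:
        delta = 0.0
        U_new = {}
        for s in states:
            q = [sum(p * U[s2] for p, s2 in T[s][a]) for a in actions]
            U_new[s] = R[s] + gamma * max(q)
            delta = max(delta, abs(U_new[s] - U[s]))
        U = U_new
        if delta < epsilon * (1 - gamma) / gamma:
            return U

def extract_policy(states, actions, T, U):
    """Greedy one-step lookahead on the converged utilities."""
    return {s: max(actions, key=lambda a: sum(p * U[s2] for p, s2 in T[s][a]))
            for s in states}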