def solve_mdp_policy():
    """Solve the problem as a policy iteration Markov decision process."""
    P, R = get_transition_and_reward_arrays()
    sdp = mdp.PolicyIteration(P, R, 0.96, policy0=None, max_iter=1000)
    sdp.run()
    return sdp
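# Minimal usage sketch (not from the original source): pymdptoolbox's
# PolicyIteration exposes .policy, .V and .iter after run(), so the solved
# object returned above can be inspected like this.
solver = solve_mdp_policy()
print('Converged in {} iterations'.format(solver.iter))
print('Action for state 0: {}'.format(solver.policy[0]))
print('Expected value of state 0: {:.3f}'.format(solver.V[0]))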
def calc_opt_policy(self, expected_horizon=10**5, discount=1, epsilon=1e-5,
                    max_iter=100000, skip_check=True, verbose=False):
    self.build_mdp()
    p_mat, r_mat = self.get_pt_mdp(expected_horizon)
    # Policy-iteration solver (renamed from "vi" to reflect the algorithm used).
    pi = mdptoolbox.PolicyIteration(p_mat, r_mat, discount=discount,
                                    epsilon=epsilon, max_iter=max_iter,
                                    skip_check=skip_check)
    if verbose:
        pi.setVerbose()
    pi.run()
    self.opt_policy = pi.policy
    # Value of the initial state divided by the horizon: a per-step reward rate.
    return self.opt_policy, pi.V[self.initial_state] / expected_horizon, pi.iter
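# Design note: with discount=1 the value of the initial state accumulates reward
# over an episode whose expected length is expected_horizon, so dividing by
# expected_horizon can be read as an approximate average reward per step, which
# is what the function returns next to the policy and the iteration count.
# The epsilon and skip_check keywords are not accepted by PolicyIteration in
# every mdptoolbox release, so this code presumably targets a patched or forked
# build of the library.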
def tictactoe(gamma=0.95):
    outdir = mktmpdir('a4_ttt')
    print('====== Running Tic Tac Toe =======')
    P, R = ttt.getTransitionAndRewardArrays()

    print('\nValue Iteration')
    ttt_vi = mdp.ValueIteration(P, R, gamma)
    ttt_vi.setVerbose()
    vi_time = default_timer()
    ttt_vi.run()
    vi_time = default_timer() - vi_time
    print(f'MDP Toolbox VI finished in {ttt_vi.iter} iterations')
    print(f'Accumulated reward: {sum(ttt_vi.rewards)}')
    print(f'Rewards: {ttt_vi.rewards}')
    save_stats(outdir, 'vi', ttt_vi)

    print('\nPolicy Iteration')
    ttt_pi = mdp.PolicyIteration(P, R, gamma)
    ttt_pi.setVerbose()
    pi_time = default_timer()
    ttt_pi.run()
    pi_time = default_timer() - pi_time
    print(f'MDP Toolbox PI finished in {ttt_pi.iter} iterations')
    print(f'Accumulated reward: {sum(ttt_pi.rewards)}')
    print(f'Rewards: {ttt_pi.rewards}')
    save_stats(outdir, 'pi', ttt_pi)

    print('PI/VI same policy?: {}'.format(np.all(ttt_vi.policy == ttt_pi.policy)))
    save_stats(outdir, 'pi_policy', ttt_pi.policy)
    save_stats(outdir, 'vi_policy', ttt_vi.policy)

    # Q-learning vs a random opponent
    epsilons = [0.4, 0.9]
    rewards = []
    agents = []
    qtimes = []
    for i, epsilon in enumerate(epsilons):
        qtimes.append(default_timer())
        r, agent = ttt.train_agents('random', 500000, epsilon, 0.9, 0.4, 0.9, 0.99, False)
        qtimes[i] = default_timer() - qtimes[i]
        rewards.append(r)
        agents.append(agent)
        qpolicy = agent.policy()
        save_stats(outdir, f'ttt_agents{epsilon}', agent)
        save_stats(outdir, f'ttt_rewards{epsilon}', r)
        save_stats(outdir, f'q_policy_{epsilon}', qpolicy)
        # print(f'{epsilon} policy same as vi?: {np.all(ttt_vi.policy == qpolicy)}')

    timings = {
        'vi': vi_time,
        'pi': pi_time,
        'q_eps4': qtimes[0],
        'q_eps9': qtimes[1],
    }
    print(timings)
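# mktmpdir and save_stats are not defined in this section. Below is a minimal
# sketch of what such helpers might look like (names and behaviour are
# assumptions, not the original implementation): each artifact is pickled into
# a scratch directory.
import os
import pickle
import tempfile


def mktmpdir(prefix):
    # Create (or reuse) a scratch directory for this experiment's artifacts.
    path = os.path.join(tempfile.gettempdir(), prefix)
    os.makedirs(path, exist_ok=True)
    return path


def save_stats(outdir, name, obj):
    # Pickle an arbitrary object (solver, policy, reward list) under a short name.
    with open(os.path.join(outdir, '{}.pkl'.format(name)), 'wb') as fh:
        pickle.dump(obj, fh)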
def run_simulation(self, *args, **kwargs):
    # Solve the MDP and look up the action prescribed for the driver's current cell.
    mdp_planner = mdp.PolicyIteration(P, R, 0.9)
    mdp_planner.run()
    mdp_policy = mdp_planner.policy
    driver_policy = mdp_policy[driver["index"]]
    action = Actions(driver_policy)

    # Intended destination, clamped at the grid borders.
    ideal_dest = {
        "col": driver["col"] + 1 if action is Actions.EAST and driver["col"] != (col_count - 1)
               else driver["col"] - 1 if action is Actions.WEST and driver["col"] != 0
               else driver["col"],
        "row": driver["row"] + 1 if action is Actions.SOUTH and driver["row"] != (row_count - 1)
               else driver["row"] - 1 if action is Actions.NORTH and driver["row"] != 0
               else driver["row"],
    }
    ideal_dest_index = (row_count * ideal_dest["row"]) + ideal_dest["col"]

    # Sample whether the stochastic transition actually lands on the intended cell.
    action_prob = P[action.value, driver["index"], ideal_dest_index]
    policy_succeed = np.random.choice([0, 1], 1, p=[1 - action_prob, action_prob])[0]

    self.simulation_detail = {
        "action": action,
        "source": {
            "col": driver["col"],
            "row": driver["row"],
            "index": driver["index"],
        },
        "dest": {
            "col": ideal_dest["col"] if policy_succeed else driver["col"],
            "row": ideal_dest["row"] if policy_succeed else driver["row"],
            "index": ideal_dest_index if policy_succeed else driver["index"],
        },
    }

    self.steps += 1
    self.stepsCounter.setText(str(self.steps))

    self.isPickedUp = self.isPickedUp or self.simulation_detail["dest"]["index"] == client["index"]
    self.pickedUpStatus.setText(str(self.isPickedUp))
    self.pickedUpStatus.setStyleSheet("color: {}".format("green" if self.isPickedUp else "red"))

    print("dest", dest)
    print("sim", self.simulation_detail["dest"])

    self.isArrivedDest = self.isArrivedDest or self.simulation_detail["dest"]["index"] == dest["index"]
    self.arrivedDestStatus.setText(str(self.isArrivedDest))
    self.arrivedDestStatus.setStyleSheet("color: {}".format("green" if self.isArrivedDest else "red"))

    self.simulationRan.emit(self.simulation_detail)
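# Illustrative helpers (not from the original source) making the grid index
# convention above explicit: the stride is row_count, which matches the usual
# row-major layout (index = row * number_of_columns + col) only on a square grid.
def cell_to_index(row, col, row_count):
    return row_count * row + col


def index_to_cell(index, row_count):
    return divmod(index, row_count)  # (row, col)


assert cell_to_index(*index_to_cell(7, 5), 5) == 7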
def main():
    transitions, reward, discount, lake = get_environement()

    # Policy iteration
    policy_iteration = mdp.PolicyIteration(transitions, reward, discount,
                                           policy0=None, max_iter=1000, eval_type=0)
    policy_iteration.run()
    print_as_grid(policy_iteration.policy, lake, 5)
    print(policy_iteration.time)
    print(policy_iteration.iter)

    # Value iteration
    value_iteration = mdp.ValueIteration(transitions, reward, discount,
                                         epsilon=0.01, max_iter=1000, initial_value=0)
    value_iteration.run()
    print_as_grid(value_iteration.policy, lake, 5)
    print(value_iteration.time)
    print(value_iteration.iter)

    # Q-learning
    q_learning = mdp.QLearning(transitions, reward, discount, n_iter=20000000)
    q_learning.run()
    print_as_grid(q_learning.policy, lake, 5)
    print(q_learning.time)
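# Hypothetical sketch of the print_as_grid helper used above (the real one is
# not shown here). It assumes a FrozenLake-style setup: actions coded 0=left,
# 1=down, 2=right, 3=up, and `lake` given as rows of 'S', 'F', 'H', 'G' chars.
ARROWS = {0: '<', 1: 'v', 2: '>', 3: '^'}


def print_as_grid(policy, lake, width):
    flat_lake = ''.join(lake)
    for start in range(0, len(policy), width):
        cells = []
        for i in range(start, start + width):
            # Show terminal cells (holes and goal) from the map instead of an arrow.
            cells.append(flat_lake[i] if flat_lake[i] in 'HG' else ARROWS.get(policy[i], '?'))
        print(' '.join(cells))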
def fit_policy(st, rm, gamma, num_states):
    iterations = list(range(1, 1000, 10))
    data_policy = {}
    data_policy['convergence'] = {}

    for n_iter in iterations:
        print('Current Iteration: {}'.format(n_iter))
        data_policy[str(n_iter)] = {}
        tot_time_start = time.time()
        pi = mdp.PolicyIteration(st, rm, gamma, max_iter=10000000, eval_type=1)
        # pi.setVerbose()
        time_iter, iter_value, iter_policy, policy_change, policies = pi.run(max_iter=n_iter)
        tot_time_end = time.time()
        tot_time = tot_time_end - tot_time_start
        policy_change = [int(x) for x in policy_change]
        if np.any(np.array(iter_value) > n_iter):
            raise ValueError(
                'Value loop of Policy Iteration not stopping at maximum iterations provided')
        data_policy[str(n_iter)]['tot_time'] = tot_time
        data_policy[str(n_iter)]['time_iter'] = time_iter
        data_policy[str(n_iter)]['policy_iter'] = iter_policy
        data_policy[str(n_iter)]['value_iter'] = iter_value
        data_policy[str(n_iter)]['policy_change'] = policy_change

    print('Convergence')
    tot_time_start = time.time()
    pi = mdp.PolicyIteration(st, rm, gamma, max_iter=10000000, eval_type=1)
    time_iter, iter_value, iter_policy, policy_change, policies = pi.run(max_iter=10000)
    tot_time_end = time.time()

    policy_change = [int(x) for x in policy_change]
    policies = [tuple(int(x) for x in opt_policy) for opt_policy in policies]
    optimal_policy = dict(zip(range(num_states), (int(x) for x in pi.policy)))
    expected_values = [float(x) for x in pi.V]
    policies = [dict(zip(range(num_states), opt_policy)) for opt_policy in policies]

    data_policy['convergence']['tot_time'] = tot_time_end - tot_time_start
    data_policy['convergence']['time_iter'] = time_iter
    data_policy['convergence']['policy_iter'] = iter_policy
    data_policy['convergence']['value_iter'] = iter_value
    data_policy['convergence']['policy_change'] = policy_change
    data_policy['convergence']['optimal_policy'] = optimal_policy
    data_policy['convergence']['expected_values'] = expected_values
    data_policy['convergence']['policies'] = policies
    return data_policy
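# Possible usage sketch (not from the original source): build a small problem
# with pymdptoolbox's forest example and persist the collected statistics as
# JSON. Note that fit_policy relies on a PolicyIteration.run(max_iter=...)
# variant that returns per-iteration traces, i.e. a patched/forked mdptoolbox.
import json

import mdptoolbox.example

P_example, R_example = mdptoolbox.example.forest(S=50)
stats = fit_policy(P_example, R_example, gamma=0.95, num_states=50)
with open('policy_iteration_stats.json', 'w') as fh:
    json.dump(stats, fh, indent=2, default=str)  # default=str guards non-JSON types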
# FiniteHorizon
fh_class = mdp.FiniteHorizon(T, R, discountFactor, N=iterations)
fh_class.run()
# fh_class.policy has shape (S, N); keep each state's decision at stage iterations - 1.
policy = []
policy_iterations = fh_class.policy
for state in policy_iterations:
    policy.append(state[iterations - 1])
all_policies["FiniteHorizon"] = tuple(policy)
print("FiniteHorizon duration:", fh_class.time)
print("FiniteHorizon iterations: N.A.")
print("_________________")

# PolicyIteration
pi_class = mdp.PolicyIteration(T, R, discountFactor, max_iter=iterations)
pi_class.run()
all_policies["PolicyIteration"] = pi_class.policy
print("PolicyIteration duration:", pi_class.time)
print("PolicyIteration iterations:", pi_class.iter)
print("_________________")

# PolicyIterationModified
pim_class = mdp.PolicyIterationModified(T, R, discountFactor, max_iter=iterations)
pim_class.run()
all_policies["PolicyIterationModified"] = pim_class.policy
print("PolicyIterationModified duration:", pim_class.time)
print("PolicyIterationModified iterations:", pim_class.iter)
print("_________________")
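# Quick agreement check between the collected policies (illustrative, not part
# of the original script): counts how many states receive the same action for
# each pair of solvers.
solver_names = list(all_policies)
for i, a in enumerate(solver_names):
    for b in solver_names[i + 1:]:
        matches = sum(x == y for x, y in zip(all_policies[a], all_policies[b]))
        print("{} vs {}: {}/{} states agree".format(a, b, matches, len(all_policies[a])))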
    S2 = sum(1 if x == OPPONENT else 0 for x in state)
    return (S1, S2) in OWNED_CELLS


from time import perf_counter  # time.clock() was removed in Python 3.8

P, R = getTransitionAndRewardArrays()

for discount in np.arange(.1, 1, .2):
    ttt = mdp.ValueIteration(P, R, discount)
    ttt.setVerbose()
    start = perf_counter()
    ttt.run()
    elapsed = perf_counter() - start

for discount in np.arange(.1, 1, .2):
    ttt = mdp.PolicyIteration(P, R, discount)
    ttt.setVerbose()
    start = perf_counter()
    ttt.run()
    elapsed = perf_counter() - start

for discount in np.arange(.1, 1, .2):
    qlearner_stats = collections.defaultdict(list)
    ttt = hmdp.QLearning(P, R, discount)
    ttt.setVerbose()
    start = perf_counter()
    ttt.run()
    elapsed = perf_counter() - start
    for stats in ttt.run_stats:
        qlearner_stats['state'].append(stats['State'])
        qlearner_stats['action'].append(stats['Action'])
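# Illustrative follow-up (not in the original): tally how often the Q-learner
# took each action according to the run_stats collected above.
from collections import Counter

action_counts = Counter(qlearner_stats['action'])
for act, count in sorted(action_counts.items()):
    print('action {}: chosen {} times'.format(act, count))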