Example #1
def solve_mdp_policy():
    """Solve the problem as a policy iteration Markov decision process.
    """
    P, R = get_transition_and_reward_arrays()
    sdp = mdp.PolicyIteration(P, R, 0.96, policy0=None, max_iter=1000)
    sdp.run()
    return sdp
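A minimal usage sketch (assuming get_transition_and_reward_arrays returns pymdptoolbox-compatible (A, S, S) transition and (S, A) reward arrays):

sdp = solve_mdp_policy()
print(sdp.policy)  # one action index per state
print(sdp.V)       # value function under the returned policy
print(sdp.iter)    # number of policy-iteration steps performed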
Example #2
    def calc_opt_policy(self,
                        expected_horizon=10**5,
                        discount=1,
                        epsilon=1e-5,
                        max_iter=100000,
                        skip_check=True,
                        verbose=False):
        self.build_mdp()

        p_mat, r_mat = self.get_pt_mdp(expected_horizon)

        pi = mdptoolbox.PolicyIteration(p_mat,
                                        r_mat,
                                        discount=discount,
                                        epsilon=epsilon,
                                        max_iter=max_iter,
                                        skip_check=skip_check)
        if verbose:
            pi.setVerbose()
        pi.run()

        self.opt_policy = pi.policy

        return (self.opt_policy,
                pi.V[self.initial_state] / expected_horizon,
                pi.iter)
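A hedged usage sketch (model is a hypothetical, already-constructed instance of this class; build_mdp, get_pt_mdp and initial_state are assumed to be defined on it):

policy, value_per_step, n_iterations = model.calc_opt_policy(expected_horizon=10**4, verbose=True)
print(len(policy), value_per_step, n_iterations)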
Example #3
def tictactoe(gamma=0.95):
    outdir = mktmpdir('a4_ttt')
    timings = {}
    print('====== Running Tic Tac Toe =======')
    P, R = ttt.getTransitionAndRewardArrays()

    print('\nValue Iteration')
    ttt_vi = mdp.ValueIteration(P, R, gamma)
    ttt_vi.setVerbose()
    vi_time = default_timer()
    ttt_vi.run()
    vi_time = default_timer() - vi_time
    print(f'MDP Toolbox VI finished in {ttt_vi.iter} iterations')
    print(f'Rewards recorded: {len(ttt_vi.rewards)}')
    print(f'Rewards: {ttt_vi.rewards}')
    save_stats(outdir, 'vi', ttt_vi)

    print('\nPolicy Iteration')
    ttt_pi = mdp.PolicyIteration(P, R, gamma)
    ttt_pi.setVerbose()
    pi_time = default_timer()
    ttt_pi.run()
    pi_time = default_timer() - pi_time
    print(f'MDP Toolbox PI finished in {ttt_pi.iter} iterations')
    print(f'Rewards recorded: {len(ttt_pi.rewards)}')
    print(f'Rewards: {ttt_pi.rewards}')
    save_stats(outdir, 'pi', ttt_pi)

    print('PI/VI same policy?: {}'.format(
        np.all(ttt_vi.policy == ttt_pi.policy)))
    save_stats(outdir, 'pi_policy', ttt_pi.policy)
    save_stats(outdir, 'vi_policy', ttt_vi.policy)

    # Q vs random
    epsilons = [0.4, 0.9]
    rewards = []
    agents = []
    qtimes = []
    for i, epsilon in enumerate(epsilons):
        qtimes.append(default_timer())
        r, agent = ttt.train_agents('random', 500000, epsilon, 0.9, 0.4, 0.9,
                                    0.99, False)
        qtimes[i] = default_timer() - qtimes[i]
        rewards.append(r)
        agents.append(agent)
        qpolicy = agent.policy()

        save_stats(outdir, f'ttt_agents{epsilon}', agent)
        save_stats(outdir, f'ttt_rewards{epsilon}', r)
        save_stats(outdir, f'q_policy_{epsilon}', qpolicy)
        # print(f'{epsilon} policy same as vi?: {np.all(ttt_vi.policy == qpolicy)}')

    timings = {
        'vi': vi_time,
        'pi': pi_time,
        'q_eps4': qtimes[0],
        'q_eps9': qtimes[1]
    }
    print(timings)
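A hedged follow-up to the commented-out policy check above, which could be appended at the end of tictactoe() (assumes agent.policy() returns one action per state in the same state ordering as ttt_vi.policy):

    for epsilon, agent in zip(epsilons, agents):
        qpolicy = np.asarray(agent.policy())
        agreement = np.mean(qpolicy == np.asarray(ttt_vi.policy))
        print(f'Q-learning (epsilon={epsilon}) agrees with VI on {agreement:.1%} of states')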
Example #4
    def run_simulation(self, *args, **kwargs):
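        # Note: P, R, driver, client, dest, col_count, row_count and Actions are
        # assumed to be module-level objects defined elsewhere in the original file.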
        mdp_planner = mdp.PolicyIteration(P, R, 0.9)
        mdp_planner.run()
        mdp_policy = mdp_planner.policy
        driver_policy = mdp_policy[driver["index"]]
        action = Actions(driver_policy)
        # Move one cell in the chosen direction, without stepping past a grid edge.
        dest_col = driver["col"]
        dest_row = driver["row"]
        if action is Actions.EAST and driver["col"] != (col_count - 1):
            dest_col = driver["col"] + 1
        elif action is Actions.WEST and driver["col"] != 0:
            dest_col = driver["col"] - 1
        if action is Actions.SOUTH and driver["row"] != (row_count - 1):
            dest_row = driver["row"] + 1
        elif action is Actions.NORTH and driver["row"] != 0:
            dest_row = driver["row"] - 1
        ideal_dest = {"col": dest_col, "row": dest_row}
        ideal_dest_index = (row_count * ideal_dest["row"]) + ideal_dest["col"]
        action_prob = P[action.value, driver["index"], ideal_dest_index]
        policy_succeed = np.random.choice([0, 1],
                                          1,
                                          p=[1 - action_prob, action_prob])[0]
        self.simulation_detail = {
            "action": action,
            "source": {
                "col": driver["col"],
                "row": driver["row"],
                "index": driver["index"]
            },
            "dest": {
                "col": ideal_dest["col"] if policy_succeed else driver["col"],
                "row": ideal_dest["row"] if policy_succeed else driver["row"],
                "index":
                ideal_dest_index if policy_succeed else driver["index"]
            }
        }

        self.steps = self.steps + 1
        self.stepsCounter.setText(str(self.steps))

        self.isPickedUp = (self.isPickedUp or
                           self.simulation_detail["dest"]["index"] == client["index"])
        self.pickedUpStatus.setText(str(self.isPickedUp))
        self.pickedUpStatus.setStyleSheet(
            "color: {}".format("green" if self.isPickedUp else "red"))

        print("dest", dest)
        print("sim", self.simulation_detail["dest"])
        self.isArrivedDest = (self.isArrivedDest or
                              self.simulation_detail["dest"]["index"] == dest["index"])
        self.arrivedDestStatus.setText(str(self.isArrivedDest))
        self.arrivedDestStatus.setStyleSheet(
            "color: {}".format("green" if self.isArrivedDest else "red"))

        self.simulationRan.emit(self.simulation_detail)
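Re-solving the MDP on every simulation step is expensive. A possible refactor, sketched here under the assumption that self may cache the planner's result in a hypothetical _mdp_policy attribute, would compute the policy once and reuse it on later calls:

        # inside run_simulation, replacing the first three lines of the method body
        if not hasattr(self, "_mdp_policy"):
            planner = mdp.PolicyIteration(P, R, 0.9)
            planner.run()
            self._mdp_policy = planner.policy
        mdp_policy = self._mdp_policy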
Example #5
def main():
	transitions, reward, discount, lake = get_environement()
	
	#Policy iteration
	policy_iteration = mdp.PolicyIteration(transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0)
	policy_iteration.run()
	print_as_grid(policy_iteration.policy, lake, 5)
	print(policy_iteration.time)
	print(policy_iteration.iter)

	#Value iteration
	value_iteration = mdp.ValueIteration(transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0)
	value_iteration.run()
	print_as_grid(value_iteration.policy, lake, 5)
	print(value_iteration.time)
	print(value_iteration.iter)

	#Q-learning
	q_learning = mdp.QLearning(transitions, reward, discount, n_iter=20000000)
	q_learning.run()
	print_as_grid(q_learning.policy, lake, 5)
	print(q_learning.time)
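A hedged cross-check that could be appended at the end of main() (assumes all three solvers were run on the same environment arrays, so their policies share the same state ordering):

	print('PI and VI policies identical:', policy_iteration.policy == value_iteration.policy)
	print('Q-learning matches PI:', tuple(q_learning.policy) == tuple(policy_iteration.policy))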
Example #6
def fit_policy(st, rm, gamma, num_states):

    iterations = list(range(1, 1000, 10))
    data_policy = {}
    data_policy['convergence'] = {}

    for n_iter in iterations:

        print('Current Iteration: {}'.format(n_iter))

        data_policy[str(n_iter)] = {}

        tot_time_start = time.time()
        pi = mdp.PolicyIteration(st, rm, gamma, max_iter=10000000, eval_type=1)
        # pi.setVerbose()
        time_iter, iter_value, iter_policy, policy_change, policies = pi.run(
            max_iter=n_iter)
        tot_time_end = time.time()
        tot_time = tot_time_end - tot_time_start

        policy_change = [int(x) for x in policy_change]
        if np.any(np.array(iter_value) > n_iter):
            raise ValueError(
                'Value loop of Policy Iteration did not stop at the requested '
                'maximum number of iterations')

        data_policy[str(n_iter)]['tot_time'] = tot_time
        data_policy[str(n_iter)]['time_iter'] = time_iter
        data_policy[str(n_iter)]['policy_iter'] = iter_policy
        data_policy[str(n_iter)]['value_iter'] = iter_value
        data_policy[str(n_iter)]['policy_change'] = policy_change

    print('Convergence')
    tot_time_start = time.time()
    pi = mdp.PolicyIteration(st, rm, gamma, max_iter=10000000, eval_type=1)
    time_iter, iter_value, iter_policy, policy_change, policies = pi.run(
        max_iter=10000)
    tot_time_end = time.time()

    policy_change = [int(x) for x in policy_change]
    policies = [tuple(int(x) for x in opt_policy) for opt_policy in policies]
    optimal_policy = tuple(int(x) for x in pi.policy)
    expected_values = tuple(float(x) for x in pi.V)

    optimal_policy = dict(zip(list(range(num_states)), list(optimal_policy)))
    expected_values = list(expected_values)
    policies = [
        dict(zip(list(range(num_states)), list(opt_policy)))
        for opt_policy in policies
    ]

    data_policy['convergence']['tot_time'] = tot_time_end - tot_time_start
    data_policy['convergence']['time_iter'] = time_iter
    data_policy['convergence']['policy_iter'] = iter_policy
    data_policy['convergence']['value_iter'] = iter_value
    data_policy['convergence']['policy_change'] = policy_change
    data_policy['convergence']['optimal_policy'] = optimal_policy
    data_policy['convergence']['expected_values'] = expected_values
    data_policy['convergence']['policies'] = policies

    return data_policy
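A minimal usage sketch (hypothetical caller; assumes st is a NumPy array of shape (A, S, S), rm is the matching reward array, and the values stored in the returned dictionary are JSON-serialisable):

import json

stats = fit_policy(st, rm, gamma=0.9, num_states=st.shape[-1])
with open('policy_iteration_stats.json', 'w') as fh:
    json.dump(stats, fh, indent=2)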
Example #7
#FiniteHorizon
fh_class = mdp.FiniteHorizon(T, R, discountFactor, N = iterations)
fh_class.run()
# FiniteHorizon returns, for every state, one action per stage; keep the last stage's action.
policy = [state[iterations - 1] for state in fh_class.policy]
all_policies["FiniteHorizon"] = tuple(policy)
print("FiniteHorizon duration:", fh_class.time)
print("FiniteHorizon iterations: N.A")

print("_________________")

#PolicyIteration
pi_class = mdp.PolicyIteration(T, R, discountFactor, max_iter=iterations)
pi_class.run()
all_policies["PolicyIteration"] = pi_class.policy
print("PolicyIteraiton duration:", pi_class.time)
print("PolicyIteraiton iterations:", pi_class.iter)

print("_________________")

#PolicyIterationModified
pim_class = mdp.PolicyIterationModified(T, R, discountFactor, max_iter=iterations)
pim_class.run()
all_policies["PolicyIterationModified"] = pim_class.policy
print("PolicyIterationModified duration:", pim_class.time)
print("PolicyIterationModified iterations:", pim_class.iter)

print("_________________")
Example #8
        S2 = sum(1 for x in state if x == OPPONENT)
        return (S1, S2) in OWNED_CELLS

    P, R = getTransitionAndRewardArrays()
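    # Sweep a range of discount factors and time each solver on the same P and R arrays.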
    for discount in np.arange(.1, 1, .2):
        ttt = mdp.ValueIteration(P, R, discount)
        ttt.setVerbose()
        start = clock()
        ttt.run()
        elapsed = clock() - start

    for discount in np.arange(.1, 1, .2):
        ttt = mdp.PolicyIteration(P, R, discount)
        ttt.setVerbose()
        start = clock()
        ttt.run()
        elapsed = clock() - start

    for discount in np.arange(.1, 1, .2):
        qlearner_stats = collections.defaultdict(list)
        ttt = hmdp.QLearning(P, R, discount)
        ttt.setVerbose()
        start = clock()
        ttt.run()
        elapsed = clock() - start
        for stats in ttt.run_stats:
            qlearner_stats['state'].append(stats['State'])
            qlearner_stats['action'].append(stats['Action'])