Example #1
        alpha_min=0.001,
        epsilon=1.0,
        epsilon_min=0.1,
        epsilon_decay=0.99,
        n_iter=10000,
        skip_check=False,
        iter_callback=None,
        run_stat_frequency=None)
    ql.run()
    # print('q learning Q matrix:', ql.Q)
    print('q learning value function:', ql.V)
    # print('q learning mean discrepancy:', ql.mean_discrepancy)
    print('q learning best policy:', ql.policy)
    results.append(ql)

plot_rewards(disc, results, 'Q-Learning Discount/Rewards FrozenLake',
             'q_learning_discount_rewards_frozenlake', 'Discount')

results = []
for e in ep:
    ql = QLearning(
        P,  # transitions
        R,  # rewards
        0.55,  # discount
        alpha=0.1,
        alpha_decay=0.99,
        alpha_min=0.001,
        epsilon=e,
        epsilon_min=0.000001,
        epsilon_decay=0.99,
        n_iter=10000,
        skip_check=False,
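
The Q-Learning snippets above rely on imports and a FrozenLake model that sit outside the excerpt. A minimal setup sketch, assuming the hiive fork of pymdptoolbox (the iter_callback and run_stat_frequency arguments only exist there); the module paths and both parameter grids are assumptions, not values recovered from the original script:

# Assumed setup preceding the Q-Learning sweeps (hypothetical, for context).
from hiive.mdptoolbox.mdp import QLearning
from hiive.mdptoolbox.openai import OpenAI_MDPToolbox  # gym-to-(P, R) wrapper in the hiive fork

disc = [0.1, 0.3, 0.5, 0.7, 0.9]  # discount grid, mirrored from the Forest snippet (assumed here)
ep = [0.1, 0.3, 0.5, 0.7, 0.9]    # exploration-epsilon grid, placeholder values only

env = OpenAI_MDPToolbox('FrozenLake-v0', False)  # needs an older gym that still registers FrozenLake-v0
P = env.P  # transition probability array
R = env.R  # reward array
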
Example #2
results = []
for d in disc:
    pi = PolicyIteration(
        # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        d,  # discount
        # epsilon=0.01,
        max_iter=1000,
    )
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
    print('policy iteration best policy:', pi.policy)
    results.append(pi)

plot_rewards(disc, results, 'Policy Iteration Discount/Rewards FrozenLake',
             'policy_iteration_discount_rewards_frozenlake', 'Discount')
results = []
for e in ep:
    pi = PolicyIteration(
        # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        0.9,  # discount
        # epsilon=e,
        max_iter=1000,
    )
    pi.epsilon = e  # PolicyIteration takes no epsilon constructor argument, so it is set as an attribute
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
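
The commented-out PolicyIterationModified lines hint at the variant that takes the stopping tolerance directly: unlike PolicyIteration, its constructor accepts an epsilon argument. A hedged sketch of the epsilon sweep using that variant, keeping the snippet's 0.9 discount and loop shape; the import path and print labels are assumptions:

from hiive.mdptoolbox.mdp import PolicyIterationModified  # assumed module path (hiive fork)

results = []
for e in ep:
    pim = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        0.9,  # discount
        epsilon=e,  # stopping tolerance is a constructor argument here
        max_iter=1000,
    )
    pim.run()
    print('modified policy iteration value function:', pim.V)
    print('modified policy iteration iterations:', pim.iter)
    print('modified policy iteration time:', pim.time)
    results.append(pim)
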
Example #3
disc = [0.1, 0.3, 0.5, 0.7, 0.9]  # assumed: mirrors the Forest snippet; the FrozenLake definition was cut off in this excerpt
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]

ex = OpenAI_MDPToolbox('FrozenLake-v0', False)
P = ex.P
R = ex.R
results = []
for d in disc:
    vi = ValueIteration(P, R, d, epsilon=0.001, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(disc, results, 'Value Iteration Discount/Rewards FrozenLake',
             'value_iteration_discount_rewards_frozenlake', 'Discount')

results = []
for e in ep:
    vi = ValueIteration(P, R, 0.9, epsilon=e, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(ep, results, 'Value Iteration Epsilon/Rewards FrozenLake',
             'value_iteration_epsilon_rewards_frozenlake', 'Epsilon')

print('----------------Best VI FrozenLake---------------')
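
plot_rewards is called throughout these snippets but never defined in them. A hypothetical stand-in that summarizes each finished solver by the mean of its value function and plots that against the swept parameter; the real helper may aggregate something else entirely (for example per-iteration run stats):

import matplotlib.pyplot as plt
import numpy as np


def plot_rewards(x_values, solvers, title, filename, x_label):
    # Hypothetical helper: one summary number per solver, plotted against the swept parameter.
    summary = [np.mean(s.V) for s in solvers]
    plt.figure()
    plt.plot(x_values, summary, marker='o')
    plt.xlabel(x_label)
    plt.ylabel('Mean value')
    plt.title(title)
    plt.savefig(filename + '.png')
    plt.close()
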
Example #4
results = []
for d in disc:
    pi = PolicyIteration(
        # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        d,  # discount
        max_iter=1000,
    )
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
    print('policy iteration time:', pi.time)
    print('policy iteration best policy:', pi.policy)
    results.append(pi)

plot_rewards(
    disc, results, 'Policy Iteration Discount/Rewards Forest',
    'policy_iteration_discount_rewards_forest', 'Discount'
)
results = []
for e in ep:
    pi = PolicyIteration(
        # pi = PolicyIterationModified(
        P,  # transitions
        R,  # rewards
        0.9,  # discount
        # epsilon=e,
        max_iter=1000,
    )
    pi.epsilon = e  # set as an attribute since the constructor has no epsilon parameter
    pi.run()
    print('policy iteration value function:', pi.V)
    print('policy iteration iterations:', pi.iter)
disc = [0.1, 0.3, 0.5, 0.7, 0.9]
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]

P, R = mdptoolbox.example.forest(S=500, r1=100, r2=2, p=0.1, is_sparse=False)
results = []
for d in disc:
    vi = ValueIteration(P, R, d, epsilon=0.001, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(disc, results, 'Value Iteration Discount/Rewards Forest',
             'value_iteration_discount_rewards_forest', 'Discount')

results = []
for e in ep:
    vi = ValueIteration(P, R, 0.9, epsilon=e, max_iter=1000)
    vi.run()
    print('value iteration value function:', vi.V)
    print('value iteration iterations:', vi.iter)
    print('value iteration time:', vi.time)
    print('value iteration best policy:', vi.policy)
    results.append(vi)

plot_rewards(ep, results, 'Value Iteration Epsilon/Rewards Forest',
             'value_iteration_epsilon_rewards_forest', 'Epsilon')

print('----------------Best VI Forest---------------')
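
The 'Best VI Forest' banner presumably introduces a final run with the hyperparameters chosen from the two sweeps above. A sketch of what that run could look like; the discount and epsilon below are placeholders, not values read off the plots:

best_discount = 0.9   # placeholder, substitute the value chosen from the discount sweep
best_epsilon = 0.001  # placeholder, substitute the value chosen from the epsilon sweep

best_vi = ValueIteration(P, R, best_discount, epsilon=best_epsilon, max_iter=1000)
best_vi.run()
print('best VI forest value function:', best_vi.V)
print('best VI forest iterations:', best_vi.iter)
print('best VI forest time:', best_vi.time)
print('best VI forest policy:', best_vi.policy)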