Example #1
import time

import matplotlib.pyplot as plt
from hiive.mdptoolbox.mdp import QLearning  # assuming the hiive fork of pymdptoolbox

# test_policy is a project-specific helper; a sketch follows this example.


def Q_learner(P, R, id=None):
    alpha_mins = [0.0001, 0.01]
    epsilon_decays = [0.99, 0.999]
    gammas = [0.99, 0.7]
    tracker = ''
    Rs = []
    runtimes = []
    params = []
    policies = []
    for g in gammas:
        print(g)
        for ed in epsilon_decays:
            for am in alpha_mins:
                start = time.time()
                Q = QLearning(P,
                              R,
                              gamma=g,
                              alpha_min=am,
                              epsilon_decay=ed,
                              n_iter=10000000)
                Q.run()
                end = time.time()
                runtimes.append(end - start)
                r = test_policy(P, R, Q.policy)
                Rs.append(r)
                policies.append(Q.policy)
                params.append('gamma={}, a_min={}, e_dec={}'.format(g, am, ed))
                tracker += 'gamma={}, alpha_min={}, epsilon_dec={}: reward was {}, time was {}\n'.format(
                    g, am, ed, r, end - start)

    # write
    with open('figures/Q_variables_forest_{}_mil.txt'.format(id), 'w') as f:
        f.write(tracker)

    with open('figures/Q_policies_forest_{}_mil.txt'.format(id), 'w') as f:
        for i, p in enumerate(params):
            f.write('{}: policy={}\n'.format(p, policies[i]))

    # plot
    plt.plot(params, Rs)
    plt.title('Q learning params avg reward')
    plt.ylabel('Avg rewards')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig('figures/Q_rewards_forest_{}_mil.png'.format(id))
    plt.clf()

    plt.plot(params, runtimes)
    plt.title('Q learning runtimes')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig('figures/Q_runtimes_forest_{}_mil.png'.format(id))
    plt.clf()
    print('done')
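Examples #1 and #4 both call a test_policy helper that neither snippet defines. A minimal sketch of what such a helper might do is given below; it assumes dense P (A x S x S) and R (S x A) arrays of the kind returned by hiive.mdptoolbox.example.forest, and the episode count, step cap, start state, and discount are illustrative choices, not the original author's.

import numpy as np


def test_policy(P, R, policy, episodes=1000, max_steps=100, gamma=0.95):
    """Estimate a policy's average discounted return by simulating the MDP."""
    n_states = P.shape[-1]
    rng = np.random.default_rng(0)
    total = 0.0
    for _ in range(episodes):
        s = 0  # every episode starts in state 0
        discount = 1.0
        for _ in range(max_steps):
            a = policy[s]
            total += discount * R[s][a]
            discount *= gamma
            s = rng.choice(n_states, p=P[a][s])
    return total / episodes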
Example #2
import pandas as pd
from hiive.mdptoolbox.mdp import QLearning  # assuming the hiive fork of pymdptoolbox


class QL:
    """ Class to run QL """
    def __init__(self, name, transition, reward, config, outdir):
        """ Constructor for QL """
        self.name = name
        self.title = "QL"
        self.transition = transition
        self.reward = reward
        self.outdir = outdir
        self.config = config
        self.policy = None
        self.results = None
        self.dataframe = None
        self.instance = QLearning(transition,
                                  reward,
                                  gamma=config['gamma'],
                                  n_iter=config['n_iter'],
                                  alpha=config['alpha'],
                                  alpha_decay=config['alpha_decay'],
                                  alpha_min=config['alpha_min'],
                                  epsilon=config['epsilon'],
                                  epsilon_decay=config['epsilon_decay'],
                                  epsilon_min=config['epsilon_min'])

    def run(self):
        """ Run QL """
        self.results = self.instance.run()
        self.dataframe = pd.DataFrame(self.results)
        self.policy = self.instance.policy
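A short usage sketch for the QL wrapper above; the forest problem, the config values, and the output directory are illustrative assumptions rather than values from the original source.

from hiive.mdptoolbox.example import forest

P, R = forest(S=20)
config = {
    'gamma': 0.95,
    'n_iter': 100000,
    'alpha': 0.1,
    'alpha_decay': 0.99,
    'alpha_min': 0.001,
    'epsilon': 1.0,
    'epsilon_decay': 0.99,
    'epsilon_min': 0.1,
}
ql = QL('forest', P, R, config, outdir='figures')
ql.run()
print(ql.policy)
print(ql.dataframe.tail())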
Example #3
from hiive.mdptoolbox.example import forest
from hiive.mdptoolbox.mdp import QLearning  # assuming the hiive fork of pymdptoolbox

import forest_plot  # project-local plotting module; a sketch follows this example


def qlearning():
    deltas = {}
    rewards = {}
    for size in [10, 20, 40, 80]:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        ql = QLearning(P, R, 0.90, epsilon_decay=.998)
        ql.run()
        delta = [ql.run_stats[i]['Error'] for i in range(len(ql.run_stats))]
        reward = [ql.run_stats[i]['Reward'] for i in range(len(ql.run_stats))]
        epsilon = [
            ql.run_stats[i]['Epsilon'] for i in range(len(ql.run_stats))
        ]
        deltas[size] = delta
        rewards[size] = reward
        print(ql.policy)

    forest_plot.plot_ql_forest_convergence_size(deltas)
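forest_plot is a project-local module that is not shown. A minimal matplotlib stand-in consistent with how it is called might look like the following; the axis labels, figure title, and output filename are assumptions.

import matplotlib.pyplot as plt


def plot_ql_forest_convergence_size(deltas):
    """Plot the Q-learning error trace for each forest size on shared axes."""
    plt.figure()
    for size, delta in sorted(deltas.items()):
        plt.plot(range(len(delta)), delta, label='S={}'.format(size))
    plt.xlabel('Recorded iteration')
    plt.ylabel('Error')
    plt.title('Q-learning convergence by forest size')
    plt.legend()
    plt.savefig('ql_forest_convergence_size.png')
    plt.clf()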
Example #4
import pandas as pd
from hiive.mdptoolbox.mdp import QLearning  # assuming the hiive fork of pymdptoolbox

# test_policy is the same project-specific helper sketched under Example #1.


def q_learning(P, R, epsilon, discount=[0.9], n_iter=[1000000]):
    df_ql = pd.DataFrame(columns=["Iterations", "Discount", "Reward", "Time", "Policy",
                                  "Value Function", "Training Rewards"])
    
    count = 0
    for i in n_iter:
        for disc in discount:
            q = QLearning(P, R, disc, epsilon=epsilon, n_iter=i)
            q.run()
            reward = test_policy(P, R, q.policy)
            count += 1
            print("{}: {}".format(count, reward))
            st = q.run_stats
            t_rewards = [s['Reward'] for s in st]
            info = [i, disc, reward, q.time, q.policy, q.V, t_rewards]

            df_length = len(df_ql)
            df_ql.loc[df_length] = info
    return df_ql
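A possible call pattern for the q_learning function above, assuming a forest MDP and the test_policy helper sketched under Example #1; the forest parameters and sweep values are illustrative.

from hiive.mdptoolbox.example import forest

P, R = forest(S=25, r1=4, r2=2, p=0.1)
df = q_learning(P, R, epsilon=1.0,
                discount=[0.7, 0.9, 0.99],
                n_iter=[100000, 1000000])
print(df[["Iterations", "Discount", "Reward", "Time"]])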
Example #5
# OpenAI_MDPToolbox converts a Gym environment into the P (transitions) and
# R (rewards) arrays mdptoolbox expects; it is assumed to come from the hiive
# fork (hiive.mdptoolbox.openai) or an equivalent user-defined wrapper.
ex = OpenAI_MDPToolbox('FrozenLake-v0', False)
P = ex.P
R = ex.R
disc = [0.1, 0.3, 0.5, 0.7, 0.9]
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]
alpha = [1.0, 0.9, 0.5, 0.3, 0.1, 0.01]

results = []
for d in disc:
    ql = QLearning(
        P,  # transitions
        R,  # rewards
        d,  # discount
        alpha=0.1,
        alpha_decay=0.99,
        alpha_min=0.001,
        epsilon=1.0,
        epsilon_min=0.1,
        epsilon_decay=0.99,
        n_iter=10000,
        skip_check=False,
        iter_callback=None,
        run_stat_frequency=None)
    ql.run()
    # print('q learning Q matrix:', ql.Q)
    print('q learning value function:', ql.V)
    # print('q learning mean discrepancy:', ql.mean_discrepancy)
    print('q learning best policy:', ql.policy)
    results.append(ql)

plot_rewards(disc, results, 'Q-Learning Discount/Rewards FrozenLake',
             'q_learning_discount_rewards_frozenlake', 'Discount')
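plot_rewards is not defined in this snippet. A minimal stand-in consistent with how it is called might plot a summary of each run's learned value function against the swept discount values; the choice of statistic (mean of V) and the PNG filename handling are assumptions.

import numpy as np
import matplotlib.pyplot as plt


def plot_rewards(xs, results, title, filename, xlabel):
    """Plot a per-run summary statistic (here, mean of the value function) against xs."""
    ys = [float(np.mean(run.V)) for run in results]
    plt.figure()
    plt.plot(xs, ys, marker='o')
    plt.xlabel(xlabel)
    plt.ylabel('Mean value')
    plt.title(title)
    plt.savefig('{}.png'.format(filename))
    plt.clf()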
Example #6
    fig, ax = plt.subplots()
    ax.plot(iterations, reward)

    ax.set(xlabel="Iterations", ylabel="Reward", title="Frozen Lake Policy Iteration")
    ax.grid()

    fig.savefig("frozen-lake.pi.png")

    print("== Q Learning ==")
    values = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.99]
    resultRewards = [None] * len(values)
    resultIterations = [None] * len(values)
    i = 0
    for v in values:
        QL = QLearning(
            P, R, Gamma, n_iter=10000000, epsilon=0.1, epsilon_decay=v, epsilon_min=0.1
        )
        # run QL
        QL.setVerbose()
        QL.run()
        print("QL")
        print(QL.time)
        print(QL.run_stats[-1:])

        resultIterations[i] = np.zeros(len(QL.run_stats))
        resultRewards[i] = np.zeros(len(QL.run_stats))
        j = 0
        reward_sum = 0
        for stat in QL.run_stats:
            reward_sum += stat["Reward"]
            resultIterations[i][j] = stat["Iteration"]
            resultRewards[i][j] = reward_sum
            j += 1
        i += 1
Example #7
            R[s][a] += decision[2]


value_f = []
policy = []
iters = []
time_array = []
Q_table = []
rew_array = []

# Plots for variable iterations

niters = [10000, 25000, 50000, 100000, 250000, 500000]
for niter in niters:
    print("doing iteration ", niter)
    ql = QLearning(P, R, 0.95, n_iter=niter)
    ql.run()
    time = ql.time
    maxV = np.amax(ql.V)
    rew_array.append(maxV)
    Q_table.append(ql.Q)
    policy.append(ql.policy)
    time_array.append(time)

plt.figure()
plt.plot(niters, rew_array, label='gamma=0.95')
plt.title('FrozenLake Q-Learning: iterations vs max state value')
plt.legend()
plt.savefig(plot_path + 'qlearning_iteration_rewards_analysis.png')

plt.figure()
plt.plot(niters, time_array, label='gamma=0.95')
Example #8
    P, R = forest(num_states, r1, r2, p_fire)
    vi = ValueIteration(P, R, 0.96, 1e-20)
    vi.run()

    P2, R2 = forest(num_states, r1, r2, 0.8)
    vi2 = ValueIteration(P2, R2, 0.96, 1e-20)
    vi2.run()

    # calculate and plot the v_mean
    # iter_score(vi, vi2)
    # gamma_iter_value()

    pi = PolicyIteration(P, R, 0.96)
    pi.run()

    pi2 = PolicyIteration(P2, R2, 0.96)
    pi2.run()
    # iter_score(pi, pi2)
    # iter_policy(pi, pi2)
    # gamma_iter_value_p()

    q = QLearning(P, R, 0.4, alpha=0.9, n_iter=100000)
    q.run()

    q2 = QLearning(P2, R2, 0.4, alpha=0.9, n_iter=100000)
    q2.run()
    iter_score(q, q2)
Example #9
iterations = np.zeros(len(PI.run_stats))
reward = np.zeros(len(PI.run_stats))
i = 0
for stat in PI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']
    i += 1

fig, ax = plt.subplots()
ax.plot(iterations, reward)

ax.set(xlabel='Iterations', ylabel='Reward', title='Forest Policy Iteration')
ax.grid()

fig.savefig("forest.pi.png")

QL = QLearning(P, R, Gamma, n_iter=1000000, alpha_decay=0.1)
# run QL
QL.setVerbose()
QL.run()
print('QL')
print(QL.time)
print(QL.run_stats[-1:])

iterations = np.zeros(len(QL.run_stats))
reward = np.zeros(len(QL.run_stats))
i = 0
reward_sum = 0
for stat in QL.run_stats:
    reward_sum += stat['Reward']
    iterations[i] = stat['Iteration']
    reward[i] = reward_sum
    i += 1