示例#1
0
def run_experiment(problem, prefix, gamma, shape=None):
    """Run a policy iteration experiment.

    Args:
        problem (str): Gym problem name.
        prefix (str): Prefix for CSV and plot outputs.
        gamma (float): Gamma value.
        shape (tuple(int)): Shape of state space array.

    """
    problem = gym.make(problem)
    policy, rewards, iters, value_fn = policy_iteration(problem, gamma=gamma)
    idxs = [i for i in range(0, iters)]
    print('{}: {} iterations to converge'.format(prefix, iters))

    # save results as CSV
    resdir = 'results/PI'
    q = get_abspath('{}_policy.csv'.format(prefix), resdir)
    r = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    v = get_abspath('{}_value_fn.csv'.format(prefix), resdir)
    pdf = pd.DataFrame(policy)
    rdf = pd.DataFrame(np.column_stack([idxs, rewards]), columns=['k', 'r'])
    vdf = pd.DataFrame(value_fn)
    pdf.to_csv(q, index=False)
    rdf.to_csv(r, index=False)
    vdf.to_csv(v, index=False)

    # plot results
    tdir = 'plots/PI'
    polgrid = pdf.as_matrix().reshape(shape)
    heatmap = vdf.as_matrix().reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)

    return iters
示例#2
0
def run_experiment(problem, prefix, alpha, gamma, d, shape=None):
    """Run Q-Learning experiment for specified Gym problem and write results
    to CSV files.

    Args:
        problem (str): Gym problem name.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        d : Epsilon decay rate.
        shape (tuple(int)): Shape of state space matrix.
        prefix (str): Prefix for CSV and plot outputs.

    """
    episodes = 5000
    size = episodes // 100

    # instantiate environment and run Q-learner
    start = time.time()
    env = gym.make(problem)
    Q, rewards, visits = q_learning(env, alpha, d, gamma)
    env.close()
    end = time.time()
    elapsed = end - start

    # average rewards
    k = [i for i in range(0, episodes, size)]
    chunks = list(chunk_list(rewards, size))
    rewards = [sum(chunk) / len(chunk) for chunk in chunks]

    # save results as CSV
    resdir = 'results/QL'
    qf = get_abspath('{}_policy.csv'.format(prefix), resdir)
    rf = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    vf = get_abspath('{}_visits.csv'.format(prefix), resdir)
    qdf = pd.DataFrame(Q)
    vdf = pd.DataFrame(visits)
    rdf = pd.DataFrame(np.column_stack([k, rewards]), columns=['k', 'r'])
    qdf.to_csv(qf, index=False)
    vdf.to_csv(vf, index=False)
    rdf.to_csv(rf, index=False)

    # write timing results and average reward in last iteration
    combined = get_abspath('summary.csv', 'results/QL')
    with open(combined, 'a') as f:
        f.write('{},{},{}\n'.format(prefix, elapsed, rdf.iloc[-1, 1]))

    # plot results
    tdir = 'plots/QL'
    polgrid = qdf.as_matrix().argmax(axis=1).reshape(shape)
    heatmap = vdf.as_matrix().reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)