def run_experiment(problem, prefix, gamma, shape=None):
    """Run a policy iteration experiment and persist its results.

    Builds the Gym environment, runs ``policy_iteration`` on it, writes the
    resulting policy, per-iteration rewards and value function to CSV files
    under ``results/PI``, and renders a grid plot under ``plots/PI``.

    Args:
        problem (str): Gym problem name, passed to ``gym.make``.
        prefix (str): Prefix for CSV and plot outputs.
        gamma (float): Gamma value forwarded to ``policy_iteration``.
        shape (tuple(int)): Shape of state space array; the policy and
            value function are reshaped to it before plotting.
            NOTE(review): the ``None`` default is forwarded to ``reshape``
            unchanged, so callers should pass an explicit shape.

    Returns:
        The iteration count reported by ``policy_iteration``.

    """
    env = gym.make(problem)
    policy, rewards, iters, value_fn = policy_iteration(env, gamma=gamma)
    # assumes `rewards` holds one entry per iteration so it lines up with
    # the index column below -- TODO confirm against policy_iteration
    idxs = list(range(iters))
    print('{}: {} iterations to converge'.format(prefix, iters))

    # save results as CSV
    resdir = 'results/PI'
    q = get_abspath('{}_policy.csv'.format(prefix), resdir)
    r = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    v = get_abspath('{}_value_fn.csv'.format(prefix), resdir)
    pdf = pd.DataFrame(policy)
    rdf = pd.DataFrame(np.column_stack([idxs, rewards]), columns=['k', 'r'])
    vdf = pd.DataFrame(value_fn)
    pdf.to_csv(q, index=False)
    rdf.to_csv(r, index=False)
    vdf.to_csv(v, index=False)

    # plot results
    # `.values` replaces DataFrame.as_matrix(), which was removed in
    # pandas 1.0; it returns the same underlying ndarray.
    tdir = 'plots/PI'
    polgrid = pdf.values.reshape(shape)
    heatmap = vdf.values.reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)

    return iters
def run_experiment(problem, prefix, alpha, gamma, d, shape=None):
    """Run Q-Learning experiment for specified Gym problem and write results
    to CSV files.

    Runs ``q_learning`` on the environment, averages the returned rewards
    in fixed-size chunks, writes the Q-table, visit counts and averaged
    rewards to CSV files under ``results/QL``, appends a timing line to
    ``results/QL/summary.csv``, and renders a grid plot under ``plots/QL``.

    Args:
        problem (str): Gym problem name, passed to ``gym.make``.
        prefix (str): Prefix for CSV and plot outputs.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        d: Epsilon decay rate.
        shape (tuple(int)): Shape of state space matrix; the greedy policy
            and the visit counts are reshaped to it before plotting.
            NOTE(review): the ``None`` default is forwarded to ``reshape``
            unchanged, so callers should pass an explicit shape.

    """
    # NOTE(review): `episodes` is only used to label the averaged rewards;
    # it presumably matches the number of episodes q_learning runs --
    # confirm against q_learning's default.
    episodes = 5000
    size = episodes // 100

    # instantiate environment and run Q-learner
    start = time.time()
    env = gym.make(problem)
    try:
        # positional order (alpha, d, gamma) is kept exactly as the
        # original call passed it
        Q, rewards, visits = q_learning(env, alpha, d, gamma)
    finally:
        # release the environment even if the learner raises
        env.close()
    end = time.time()
    elapsed = end - start

    # average rewards
    k = list(range(0, episodes, size))
    chunks = list(chunk_list(rewards, size))
    rewards = [sum(chunk) / len(chunk) for chunk in chunks]

    # save results as CSV
    resdir = 'results/QL'
    qf = get_abspath('{}_policy.csv'.format(prefix), resdir)
    rf = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    vf = get_abspath('{}_visits.csv'.format(prefix), resdir)
    qdf = pd.DataFrame(Q)
    vdf = pd.DataFrame(visits)
    rdf = pd.DataFrame(np.column_stack([k, rewards]), columns=['k', 'r'])
    qdf.to_csv(qf, index=False)
    vdf.to_csv(vf, index=False)
    rdf.to_csv(rf, index=False)

    # write timing results and average reward in last iteration
    combined = get_abspath('summary.csv', 'results/QL')
    with open(combined, 'a') as f:
        f.write('{},{},{}\n'.format(prefix, elapsed, rdf.iloc[-1, 1]))

    # plot results
    # `.values` replaces DataFrame.as_matrix(), which was removed in
    # pandas 1.0; it returns the same underlying ndarray.
    tdir = 'plots/QL'
    polgrid = qdf.values.argmax(axis=1).reshape(shape)
    heatmap = vdf.values.reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)