def __init__(self, img, scale, speed, learning):
    Animat.__init__(self, img, scale, speed)
    self.id = 1
    self.energy_level = 2000
    self.senseRange = self.scale * 2
    self.learning = learning
    # Pick the learning "brain" for this animat; epsilon=0 means purely
    # greedy action selection (no exploration).
    if learning == RL.QLearn:
        self.brain = qlearning(actions=Action.actions, epsilon=0)
    elif learning == RL.SARSA:
        self.brain = sarsa(actions=Action.actions, epsilon=0)
    else:
        self.brain = None
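# A minimal sketch of the "brain" interface the constructor above relies on
# (an object exposing action selection plus a learning update). Class and
# method names here are illustrative assumptions, not this project's actual
# qlearning/sarsa API.
import random
from collections import defaultdict

class TabularQBrain:
    """Epsilon-greedy tabular Q-learner shaped like the brain the Animat expects."""

    def __init__(self, actions, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.actions = actions
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.Q = defaultdict(float)  # (state, action) -> estimated value

    def chooseAction(self, state):
        # Explore with probability epsilon, otherwise act greedily.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return max(self.actions, key=lambda a: self.Q[(state, a)])

    def learn(self, state, action, reward, next_state):
        # Q-learning backup: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        best_next = max(self.Q[(next_state, a)] for a in self.actions)
        td_error = reward + self.gamma * best_next - self.Q[(state, action)]
        self.Q[(state, action)] += self.alpha * td_error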
n = 10000
number_of_runs = 5
for r in range(number_of_runs):
    for epsilon in epsilon_values:
        print(epsilon)
        Q, average_reward, max_reward, all_rewards, _ = double_sarsa(gw, n, epsilon=epsilon)
        average_reward_double_sarsa.append(average_reward)
        all_rewards_per_episode_double_sarsa.append(all_rewards)

        Q, average_reward, max_reward, all_rewards, _ = expected_sarsa(gw, n, epsilon=epsilon)
        average_reward_expected_sarsa.append(average_reward)
        all_rewards_per_episode_expected_sarsa.append(all_rewards)

        Q, average_reward, max_reward, all_rewards, _ = double_expected_sarsa(gw, n, epsilon=epsilon)
        average_reward_double_expected_sarsa.append(average_reward)
        all_rewards_per_episode_double_expected_sarsa.append(all_rewards)

        Q, average_reward, max_reward, all_rewards, _ = sarsa(gw, n, epsilon=epsilon)
        average_reward_sarsa.append(average_reward)
        all_rewards_per_episode_sarsa.append(all_rewards)

# Average each algorithm's per-epsilon rewards over the independent runs.
average_reward_double_sarsa = np.mean(np.split(np.array(average_reward_double_sarsa), number_of_runs), axis=0)
average_reward_expected_sarsa = np.mean(np.split(np.array(average_reward_expected_sarsa), number_of_runs), axis=0)
average_reward_double_expected_sarsa = np.mean(np.split(np.array(average_reward_double_expected_sarsa), number_of_runs), axis=0)
average_reward_sarsa = np.mean(np.split(np.array(average_reward_sarsa), number_of_runs), axis=0)

# Plot average reward against the epsilon values that produced it.
plt.plot(epsilon_values, average_reward_double_sarsa, label="Double Sarsa")
plt.plot(epsilon_values, average_reward_expected_sarsa, label="Expected Sarsa")
plt.plot(epsilon_values, average_reward_double_expected_sarsa, label="Double Expected Sarsa")
plt.plot(epsilon_values, average_reward_sarsa, label="Sarsa")
plt.ylabel('Average reward')
plt.xlabel('epsilon')
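# All four learners above are called with the same signature and unpacked into
# the same 5-tuple. A minimal tabular SARSA consistent with that shape, for
# reference; the environment interface assumed here (reset() -> state,
# step(state, action) -> (next_state, reward, done), n_states, n_actions) is
# a guess at this repo's GridWorld, not its confirmed API.
import numpy as np

def sarsa_sketch(env, n_episodes, epsilon=0.1, alpha=0.1, gamma=0.99):
    Q = np.zeros((env.n_states, env.n_actions))
    all_rewards = []

    def policy(s):
        # Epsilon-greedy action selection over the current estimates.
        if np.random.rand() < epsilon:
            return np.random.randint(env.n_actions)
        return int(np.argmax(Q[s]))

    for _ in range(n_episodes):
        s = env.reset()
        a = policy(s)
        episode_reward, done = 0.0, False
        while not done:
            s2, r, done = env.step(s, a)
            a2 = policy(s2)
            # On-policy backup: bootstrap from the action actually taken next.
            Q[s, a] += alpha * (r + gamma * Q[s2, a2] * (not done) - Q[s, a])
            s, a = s2, a2
            episode_reward += r
        all_rewards.append(episode_reward)

    Q_variances = np.var(Q, axis=1)  # per-state spread of action values
    return Q, np.mean(all_rewards), np.max(all_rewards), all_rewards, Q_variances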
Q, average_reward, max_reward, all_rewards, Q_variances = double_sarsa(gw, n, epsilon=epsilon, alpha=alpha)
average_reward_double_sarsa.append(average_reward)
all_rewards_per_episode_double_sarsa.append(all_rewards)
q_var_double_sarsa.append(Q_variances)
print("Done double sarsa")

Q, average_reward, max_reward, all_rewards, Q_variances = expected_sarsa(gw, n, epsilon=epsilon, alpha=alpha)
average_reward_expected_sarsa.append(average_reward)
q_var_expected_sarsa.append(Q_variances)
all_rewards_per_episode_expected_sarsa.append(all_rewards)
print("Done expected sarsa")

Q, average_reward, max_reward, all_rewards, Q_variances = double_expected_sarsa(gw, n, epsilon=epsilon, alpha=alpha)
average_reward_double_expected_sarsa.append(average_reward)
q_var_double_expected_sarsa.append(Q_variances)
all_rewards_per_episode_double_expected_sarsa.append(all_rewards)
print("Done double expected sarsa")

Q, average_reward, max_reward, all_rewards, Q_variances = sarsa(gw, n, epsilon=epsilon, alpha=alpha)
q_var_sarsa.append(Q_variances)
average_reward_sarsa.append(average_reward)
all_rewards_per_episode_sarsa.append(all_rewards)
print("Done sarsa")

# Sweep alpha in parallel with joblib. Note the naming collision: the first
# positional `n` is the episode count, while the keyword n=4 is the n-step
# lookahead parameter.
n_step_sarsa_results = Parallel(n_jobs=-2, verbose=10)(
    delayed(n_step_sarsa)(gw, n, alpha, gamma, epsilon, n=4) for alpha in alphas)
for result in n_step_sarsa_results:
    average_reward_n_step_sarsa.append(result[1])
    q_var_n_step_sarsa.append(result[4])
    all_rewards_per_episode_n_step_sarsa.append(result[3])
print("Done nstep sarsa")

n_step_expected_sarsa_results = Parallel(n_jobs=-2, verbose=10)(
    delayed(n_step_expected_sarsa)(gw, n, alpha, gamma, epsilon, n=4) for alpha in alphas)
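# Expected SARSA differs from SARSA only in the bootstrap term: instead of the
# sampled Q(s', a') it uses the expectation of Q(s', .) under the behavior
# policy. A sketch of that expectation for an epsilon-greedy policy (function
# name illustrative):
import numpy as np

def expected_q(q_row, epsilon):
    """E_pi[Q(s', A')] over one state's action values under epsilon-greedy pi."""
    n_actions = len(q_row)
    probs = np.full(n_actions, epsilon / n_actions)
    probs[np.argmax(q_row)] += 1.0 - epsilon
    return float(np.dot(probs, q_row))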
# MC on-policy control
steps = np.zeros((runCount, epiCount))
for i in range(runCount):
    steps[i, :] = np.array(MCon(epiCount, alpha=0.5, epsilon=epsilon, initValue=5))
aveSteps = np.mean(steps, axis=0)
t = range(1, aveSteps.shape[0] + 1)
labels += ["MC on-policy"]
plot(t, aveSteps)
draw()

# Sarsa
for di, epsilon in enumerate([0.1]):
    steps = np.zeros((runCount, epiCount))
    for i in range(runCount):
        steps[i, :] = np.array(sarsa(epiCount, epsGreedyPolicy, policyParam=epsilon, alpha=0.5, initValue=5))
    aveSteps = np.mean(steps, axis=0)
    t = range(1, aveSteps.shape[0] + 1)
    labels += ["Sarsa"]
    plot(t, aveSteps)
    draw()

# Q-learning
for di, epsilon in enumerate([0.1]):
    steps = np.zeros((runCount, epiCount))
    for i in range(runCount):
        steps[i, :] = np.array(Qlearning(epiCount, epsGreedyPolicy, policyParam=epsilon, alpha=0.5, initValue=5))
    aveSteps = np.mean(steps, axis=0)
    t = range(1, aveSteps.shape[0] + 1)
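# initValue=5 in the runs above suggests optimistic initialization: seeding Q
# above the true value range makes untried actions look attractive, so the
# agent explores early even with a small epsilon. A one-line illustration
# (shapes and names here are assumptions, not this script's actual setup):
import numpy as np

n_states, n_actions, init_value = 48, 4, 5.0
Q_init = np.full((n_states, n_actions), init_value)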
test_rewards[2] = [0, 2, 1]
test_rewards[23] = [4, 3, 1]

def transform_to_actions(f):
    """Map a flat array of action indices to compass letters."""
    actions = ["S", "E", "N", "W"]
    return np.array([actions[x] for x in f])

rewards = np.ones((5, 5))
for x, y, r in test_rewards:
    rewards[x][y] = r

gw = GridWorld(5, test_rewards, terminal_states=[2, 23])
# print(gw.T)

# Q, ave_reward, max_reward, rewards_per_episode, Q_variances = n_step_sarsa(gw, 20000, alpha=.1, n=10)
Q, ave_reward, max_reward, rewards_per_episode, Q_variances = n_step_q_sigma(
    gw, 20000, alpha=.7, n=4)
sarsa_Q, ave_reward, max_reward, rewards_per_episode, Q_variances = sarsa(
    gw, 20000, alpha=.1)

print("REWARDS")
print(np.reshape(np.array(rewards), (5, 5)))
print(np.reshape(transform_to_actions(np.argmax(Q, 1)), (5, 5)))
print("SARSA")
print(np.reshape(transform_to_actions(np.argmax(sarsa_Q, 1)), (5, 5)))
import numpy as np
import matplotlib.pyplot as plt
from sarsa import sarsa
from windy_setup import *

# initialization
initial_Q = np.zeros([70, 4])  # 70 states x 4 actions
initial_state = stateSpace.index([0, 3])
gamma = 1
alpha = 0.5
epsilon = 0.01
num_episodes = 170
action_str = ['left', 'up', 'right', 'down']

# using sarsa
Q, steps, rewards = sarsa(initial_Q, initial_state, transition,
                          num_episodes, gamma, alpha, epsilon)

# plot episodes vs. time steps, i.e., figure 6.3: each episode index is
# repeated once per step it took, so the x-axis counts total time steps
episodes = []
for ep in range(num_episodes):
    episodes.extend([ep] * steps[ep])

fig = plt.figure()
plt.plot(episodes)
plt.xlabel('Time steps')
plt.ylabel('Episodes')
plt.show()
# fig.savefig('windy.jpg')

# print the optimal route
actions = np.argmax(Q, axis=1)
state = initial_state
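# The script above imports stateSpace and transition from windy_setup without
# showing them. A hedged sketch of what they could look like, based on the
# classic windy gridworld (7 rows x 10 columns, upward wind per column); the
# repo's actual implementation may differ.
WIND = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]                      # wind strength per column
stateSpace_sketch = [[c, r] for c in range(10) for r in range(7)]
GOAL = stateSpace_sketch.index([7, 3])

def transition_sketch(state_idx, action):
    """Return (next_state_idx, reward): -1 per step, wind pushes the agent up."""
    c, r = stateSpace_sketch[state_idx]
    dc, dr = [(-1, 0), (0, -1), (1, 0), (0, 1)][action]    # left, up, right, down
    c2 = min(max(c + dc, 0), 9)
    r2 = min(max(r + dr - WIND[c], 0), 6)                  # wind shifts the row up
    s2 = stateSpace_sketch.index([c2, r2])
    return s2, (0 if s2 == GOAL else -1)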
        expand_row.append(element)
    expand_track.append(expand_row)
grid_map = np.array(expand_track).astype(object)

# Extract start locations
start_locs = list()
indices = np.where(grid_map == 'S')
for row, col in zip(indices[0], indices[1]):
    start_locs.append((row, col))

# Extract goal locations
goal_locs = list()
indices = np.where(grid_map == 'F')
for row, col in zip(indices[0], indices[1]):
    goal_locs.append((row, col))

# Unpickle passed-in q_value_map
with open(q_value_map_init_filename, 'rb') as afile:
    q_value_map_init = pickle.load(afile)

# Run SARSA
q_value_map = sarsa(grid_map, racetrack_height, racetrack_width, start_locs,
                    learning_rate, discount_factor, epsilon, crash_type,
                    racetrack_file, q_value_map_init)

filename = "q_value_map_racetrack_%s_learning_rate_%f_discount_factor_%f_epsilon_%f_crash_type_%s.pkl" % (
    racetrack_file, learning_rate, discount_factor, epsilon, crash_type)
with open(filename, 'wb') as afile:
    pickle.dump(q_value_map, afile)
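# The pickled Q-value map can be reloaded later for evaluation or to warm-start
# another run (filename as built above):
import pickle

with open(filename, 'rb') as f:
    q_value_map_reloaded = pickle.load(f)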
                reward_ = self._update_states()
                # print('New State', self.old_state)
                stats.episode_rewards[i_episode] += reward_
                stats.episode_lengths[i_episode] = i
                if self.old_state == 15 or self.old_state == 0:
                    print('i-th episode, episode length:', i_episode, i)
                    break
                free_energy_2 = self.update_action(update=True)
                # SARSA-style TD error, with the RBM's free energy in place of Q.
                diff = reward_ + self.discount_factor * free_energy_2 - free_energy_1
                self._update_action_weights(diff)
                self._update_state_weights(diff)
        return stats

'''
rbm = RBM(nagent=3, nstate=16, nhid=20, naction=4)
stats1 = rbm.gibbs_sampling(100)
rbm = RBM(nagent=3, nstate=16, nhid=50, naction=4)
stats2 = rbm.gibbs_sampling(100)
'''
rbm = RBM(nagent=1, nstate=16, nhid=100, naction=4)
stats3 = rbm.gibbs_sampling(100)
Q, stats4 = sarsa(rbm.env, 100)
plotting.plot_episode_stats(stats3, stats4)
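# The diff above is the usual on-policy TD error, with the RBM's (negative)
# free energy standing in for Q(s, a) as in free-energy reinforcement learning
# (Sallans & Hinton, 2004). Stated on its own (function name illustrative):
def free_energy_td_error(r, gamma, f_next, f_curr):
    """delta = r + gamma * F(s', a') - F(s, a); scales both weight updates."""
    return r + gamma * f_next - f_curr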
import gridWorld as gw
import mountainCar as mc
import actor_critic
import sarsa
import q_learning
import reinforce
import matplotlib.pyplot as plt
import numpy as np

# lambd in [0, 0.7]; beyond 0.7 the variance gets large
gw_sarsa_returns = np.array([])
for ii in range(100):
    print("Step " + str(ii) + " in progress...")
    d = gw.gridWorld(randomSeed=ii)
    q = sarsa.sarsa(0.2, 0.75, 0.0)
    a = q.run_tabular(d, initScale=4)
    gw_sarsa_returns = np.append(gw_sarsa_returns, a)

# Reshape the flat returns into (runs, episodes) and summarize across runs.
dataArr = gw_sarsa_returns
plotValues = np.zeros((100, 100))
for ii in range(100):
    plotValues[ii] = dataArr[100 * ii:100 * (ii + 1)]
meanValues1 = np.mean(plotValues, axis=0)
stdValues1 = np.std(plotValues, axis=0)
print("The mean is " + str(np.mean(meanValues1[50:])))

plt.errorbar(np.arange(1, 101), meanValues1, yerr=stdValues1,
             color='red', ecolor='green', alpha=0.5)
# plt.plot(np.arange(1, 101), 3.5*np.ones((100,)), color='blue', alpha=1.0)
plt.xlabel("Number of episodes")
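# Equivalent collection without the repeated-copy cost of np.append on every
# run: gather each run's returns in a list and stack once (shapes assumed
# from the loop above).
runs = []
for ii in range(100):
    d = gw.gridWorld(randomSeed=ii)
    q = sarsa.sarsa(0.2, 0.75, 0.0)
    runs.append(q.run_tabular(d, initScale=4))
plotValues = np.vstack(runs)  # shape (100 runs, 100 episodes)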