Example #1
    def __init__(self, img, scale, speed, learning):
        Animat.__init__(self, img, scale, speed)
        self.id = 1
        self.energy_level = 2000
        self.senseRange = self.scale * 2
        self.learning = learning
        # Choose the brain according to the requested learning algorithm
        if learning == RL.QLearn:
            self.brain = qlearning(actions=Action.actions, epsilon=0)
        elif learning == RL.SARSA:
            self.brain = sarsa(actions=Action.actions, epsilon=0)
        else:
            self.brain = None
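# Minimal usage sketch; Animat, RL, Action, qlearning and sarsa are assumed to come
# from the surrounding project, and the constructor arguments are placeholders:
# agent = Animat(img, scale=32, speed=4, learning=RL.SARSA)  # agent.brain -> sarsa instance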
Example #2
n = 10000
number_of_runs = 5

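# Result containers that the loop below appends to; the epsilon grid itself is an
# assumption (not shown in the original snippet).
epsilon_values = [0.01, 0.05, 0.1, 0.2, 0.5]
average_reward_double_sarsa, all_rewards_per_episode_double_sarsa = [], []
average_reward_expected_sarsa, all_rewards_per_episode_expected_sarsa = [], []
average_reward_double_expected_sarsa, all_rewards_per_episode_double_expected_sarsa = [], []
average_reward_sarsa, all_rewards_per_episode_sarsa = [], []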
for r in range(number_of_runs):
    for epsilon in epsilon_values:
        print(epsilon)
        Q, average_reward, max_reward, all_rewards, _ = double_sarsa(gw, n, epsilon=epsilon)
        average_reward_double_sarsa.append(average_reward)
        all_rewards_per_episode_double_sarsa.append(all_rewards)
        Q, average_reward, max_reward, all_rewards, _ = expected_sarsa(gw, n, epsilon=epsilon)
        average_reward_expected_sarsa.append(average_reward)
        all_rewards_per_episode_expected_sarsa.append(all_rewards)
        Q, average_reward, max_reward, all_rewards, _ = double_expected_sarsa(gw, n, epsilon=epsilon)
        average_reward_double_expected_sarsa.append(average_reward)
        all_rewards_per_episode_double_expected_sarsa.append(all_rewards)
        Q, average_reward, max_reward, all_rewards, _ = sarsa(gw, n, epsilon=epsilon)
        average_reward_sarsa.append(average_reward)
        all_rewards_per_episode_sarsa.append(all_rewards)

# Average each curve over the runs, then plot sarsa, expected_sarsa, double_sarsa and double_expected_sarsa
average_reward_double_sarsa = np.mean(np.split(np.array(average_reward_double_sarsa), number_of_runs), axis=0)
average_reward_expected_sarsa = np.mean(np.split(np.array(average_reward_expected_sarsa), number_of_runs), axis=0)
average_reward_double_expected_sarsa = np.mean(np.split(np.array(average_reward_double_expected_sarsa), number_of_runs), axis=0)
average_reward_sarsa = np.mean(np.split(np.array(average_reward_sarsa), number_of_runs), axis=0)

# Plot the run-averaged reward of each variant against the epsilon values used above.
plt.plot(epsilon_values, average_reward_double_sarsa, label="Double Sarsa")
plt.plot(epsilon_values, average_reward_expected_sarsa, label="Expected Sarsa")
plt.plot(epsilon_values, average_reward_double_expected_sarsa, label="Double Expected Sarsa")
plt.plot(epsilon_values, average_reward_sarsa, label="Sarsa")
plt.xlabel('epsilon')
plt.ylabel('Average reward')
plt.legend()
plt.show()
Example #3
        Q, average_reward, max_reward, all_rewards, Q_variances = double_sarsa(gw, n, epsilon=epsilon, alpha=alpha)
        average_reward_double_sarsa.append(average_reward)
        all_rewards_per_episode_double_sarsa.append(all_rewards)
        q_var_double_sarsa.append(Q_variances)
        print("Done double sarsa")
        Q, average_reward, max_reward, all_rewards, Q_variances = expected_sarsa(gw, n, epsilon=epsilon, alpha=alpha)
        average_reward_expected_sarsa.append(average_reward)
        q_var_expected_sarsa.append(Q_variances)
        print("Done expected sarsa")
        all_rewards_per_episode_expected_sarsa.append(all_rewards)
        Q, average_reward, max_reward, all_rewards, Q_variances = double_expected_sarsa(gw, n, epsilon=epsilon, alpha=alpha)
        average_reward_double_expected_sarsa.append(average_reward)
        q_var_double_expected_sarsa.append(Q_variances)
        print("Done double expected sarsa")
        all_rewards_per_episode_double_expected_sarsa.append(all_rewards)
        Q, average_reward, max_reward, all_rewards, Q_variances = sarsa(gw, n, epsilon=epsilon, alpha=alpha)
        q_var_sarsa.append(Q_variances)
        average_reward_sarsa.append(average_reward)
        all_rewards_per_episode_sarsa.append(all_rewards)
        print("Done  sarsa")

    n_step_sarsa_results = Parallel(n_jobs=-2, verbose=10)(delayed(n_step_sarsa)(gw, n, alpha, gamma, epsilon, n=4) for alpha in alphas)
    # Q, average_reward, max_reward, all_rewards, Q_variances = n_step_sarsa(gw, max_episode, epsilon=epsilon, alpha=alpha, n = n_step)
    for result in n_step_sarsa_results:
        average_reward_n_step_sarsa.append(result[1])
        q_var_n_step_sarsa.append(result[4])
        all_rewards_per_episode_n_step_sarsa.append(result[3])
    print("Done nstep sarsa")

    n_step_expected_sarsa_results = Parallel(n_jobs=-2, verbose=10)(delayed(n_step_expected_sarsa)(gw, n, alpha, gamma, epsilon, n=4) for alpha in alphas)
    # Q, average_reward, max_reward, all_rewards, Q_variances = n_step_sarsa(gw, max_episode, epsilon=epsilon, alpha=alpha, n = n_step)
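    # The snippet is cut off here; presumably the results are unpacked like the
    # n_step_sarsa block above (sketch; the *_n_step_expected_sarsa list names are assumptions):
    for result in n_step_expected_sarsa_results:
        average_reward_n_step_expected_sarsa.append(result[1])
        q_var_n_step_expected_sarsa.append(result[4])
        all_rewards_per_episode_n_step_expected_sarsa.append(result[3])
    print("Done nstep expected sarsa")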
Example #4
	steps = np.zeros((runCount, epiCount))
	for i in range(runCount):
		steps[i, :] = np.array(MCon(epiCount, alpha=0.5, epsilon=epsilon, initValue=5))
	aveSteps = np.mean(steps, axis=0)

	t = range(1, aveSteps.shape[0]+1)
	labels += ["MC on-policy"]
	plot(t, aveSteps)
	draw()


for di, epsilon in enumerate([0.1]):
	
	steps = np.zeros((runCount, epiCount))
	for i in range(runCount):
		steps[i, :] = np.array(sarsa(epiCount, epsGreedyPolicy, policyParam=epsilon, alpha=0.5, initValue=5))
	aveSteps = np.mean(steps, axis=0)

	t = range(1, aveSteps.shape[0]+1)
	labels += ["Sarsa"]
	plot(t, aveSteps)
	draw()

for di, epsilon in enumerate([0.1]):
	
	steps = np.zeros((runCount, epiCount))
	for i in range(runCount):
		steps[i, :] = np.array(Qlearning(epiCount, epsGreedyPolicy, policyParam=epsilon, alpha=0.5, initValue=5))
	aveSteps = np.mean(steps, axis=0)

	t = range(1, aveSteps.shape[0]+1)
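	# Cut off here; presumably finishes like the MC and Sarsa blocks above (sketch):
	labels += ["Q-learning"]
	plot(t, aveSteps)
	draw()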
Example #5
test_rewards[2] = [0, 2, 1]
test_rewards[23] = [4, 3, 1]


def transform_to_actions(f):
    actions = ["S", "E", "N", "W"]
    return np.array([actions[x] for x in f])

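# Quick check of transform_to_actions: action indices map onto ["S", "E", "N", "W"],
# e.g. transform_to_actions([0, 3, 1]) -> array(['S', 'W', 'E'], dtype='<U1').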

rewards = np.ones((5, 5))
for x, y, r in test_rewards:
    rewards[x][y] = r

gw = GridWorld(5, test_rewards, terminal_states=[2, 23])

# print gw.T

# Q, ave_reward, max_reward, rewards_per_episode, Q_variances = n_step_sarsa(gw, 20000, alpha=.1, n=10)
Q, ave_reward, max_reward, rewards_per_episode, Q_variances = n_step_q_sigma(
    gw, 20000, alpha=.7, n=4)
sarsa_Q, ave_reward, max_reward, rewards_per_episode, Q_variances = sarsa(
    gw, 20000, alpha=.1)

print "REWARDS"
print np.reshape(np.array(rewards), (5, 5))

print np.reshape(transform_to_actions(np.argmax(Q, 1)), (5, 5))

print "SARSA"
print np.reshape(transform_to_actions(np.argmax(sarsa_Q, 1)), (5, 5))
Example #6
import numpy as np
import matplotlib.pyplot as plt
from sarsa import sarsa
from windy_setup import *

# initialization
initial_Q = np.zeros([70, 4])
initial_state = stateSpace.index([0, 3])
gamma = 1
alpha = 0.5
epsilon = 0.01
num_episodes = 170
action_str = ['left', 'up', 'right', 'down']

# using sarsa
Q, steps, rewards = sarsa(initial_Q, initial_state, transition, num_episodes,
                          gamma, alpha, epsilon)

# plot episodes vs time steps, i.e., figure 6.3
episodes = []
for ep in range(num_episodes):
    episodes.extend([ep] * steps[ep])
fig = plt.figure()
plt.plot(episodes)
plt.xlabel('Time steps')
plt.ylabel('Episodes')
plt.show()
#fig.savefig('windy.jpg')

# print the optimal route
actions = np.argmax(Q, axis=1)
state = initial_state
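# Sketch (assumption): print the greedy action per state under the learned Q; walking
# the actual route needs windy_setup's transition, whose signature is not shown here.
print([action_str[a] for a in actions])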
Example #7
        expand_row.append(element)
    expand_track.append(expand_row)
grid_map = np.array(expand_track).astype(object)

# Extract start locations
start_locs = list()
indices = np.where(grid_map == 'S')
for row, col in zip(indices[0], indices[1]):
    start_locs.append((row, col))

# Extract goal locations
goal_locs = list()
indices = np.where(grid_map == 'F')
for row, col in zip(indices[0], indices[1]):
    goal_locs.append((row, col))

# Unpickle passed in q_value_map
afile = open(q_value_map_init_filename, 'rb')
q_value_map_init = pickle.load(afile)
afile.close()

# Run SARSA
q_value_map = sarsa(grid_map, racetrack_height, racetrack_width, start_locs,
                    learning_rate, discount_factor, epsilon, crash_type,
                    racetrack_file, q_value_map_init)
filename = "q_value_map_racetrack_%s_learning_rate_%f_discount_factor_%f_epsilon_%f_crash_type_%s.pkl" % (
    racetrack_file, learning_rate, discount_factor, epsilon, crash_type)
afile = open(filename, 'wb')
pickle.dump(q_value_map, afile)
afile.close()
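# Reloading the saved table later would mirror the unpickling pattern above (sketch):
# with open(filename, 'rb') as f:
#     q_value_map = pickle.load(f)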
Example #8
                reward_ = self._update_states()
                # print('New State', self.old_state)
                stats.episode_rewards[i_episode] += reward_
                stats.episode_lengths[i_episode] = i

                if self.old_state == 15 or self.old_state == 0:
                    print('Episode', i_episode, 'finished after', i, 'steps')
                    break
                free_energy_2 = self.update_action(update=True)
                diff = reward_ + self.discount_factor * free_energy_2 - free_energy_1
                self._update_action_weights(diff)
                self._update_state_weights(diff)
        return stats


'''
rbm = RBM(nagent=3, nstate=16, nhid=20, naction=4)
stats1 = rbm.gibbs_sampling(100)

rbm = RBM(nagent=3, nstate=16, nhid=50, naction=4)
stats2 = rbm.gibbs_sampling(100)
'''
rbm = RBM(nagent=1, nstate=16, nhid=100, naction=4)
stats3 = rbm.gibbs_sampling(100)

Q, stats4 = sarsa(rbm.env, 100)
plotting.plot_episode_stats(stats3, stats4)
import ipdb

ipdb.set_trace()
Example #9
import gridWorld as gw
import mountainCar as mc
import actor_critic
import sarsa
import q_learning
import reinforce
import matplotlib.pyplot as plt
import numpy as np

# lambda in [0, 0.7]; beyond 0.7 the variance becomes large

gw_sarsa_returns = np.array([])
for ii in range(100):
	print("Step " + str(ii) + " in progress...")
	d = gw.gridWorld(randomSeed = ii)
	q = sarsa.sarsa(0.2, 0.75, 0.0)
	a = q.run_tabular(d, initScale=4)
	gw_sarsa_returns = np.append(gw_sarsa_returns, a)


dataArr = gw_sarsa_returns
plotValues = np.zeros((100,100))
for ii in range(100):
	plotValues[ii] = dataArr[100*ii:100*(ii+1)]

meanValues1 = np.mean(plotValues, axis=0)
stdValues1 = np.std(plotValues, axis=0)
print("The mean is " + str(np.mean(meanValues1[50:])))
plt.errorbar(np.arange(1,101), meanValues1, yerr=stdValues1, color='red', ecolor='green', alpha=0.5)
# plt.plot(np.arange(1,101), 3.5*np.ones((100, )), color='blue', alpha=1.0)
plt.xlabel("Number of episodes")
Example #10
import numpy as np
import matplotlib.pyplot as plt
from sarsa import sarsa
from windy_setup import *

# initialization
initial_Q = np.zeros([70, 4])
initial_state = stateSpace.index([0, 3])
gamma = 1
alpha = 0.5
epsilon = 0.01
num_episodes = 170
action_str = ['left', 'up', 'right', 'down']

# using sarsa
sarsa(initial_Q, initial_state, transition, num_episodes, gamma, alpha,
      epsilon)
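# Same windy-gridworld setup as the earlier example; as there, the returns can be
# captured for inspection, e.g.:
# Q, steps, rewards = sarsa(initial_Q, initial_state, transition, num_episodes,
#                           gamma, alpha, epsilon)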