def getMazeGrid():
    grid = [[' ', ' ', ' ', +1],
            ['#', '#', ' ', '#'],
            [' ', '#', ' ', ' '],
            [' ', '#', '#', ' '],
            ['S', ' ', ' ', ' ']]
    return Gridworld(grid)
        return (old_val
                + self.learning_rate * (reward + self.discount_factor * next_value - old_val))

    def learn(self, old_state, new_state, action, reward):
        old_val = self.q_table[old_state][action]
        next_value = np.max(self.q_table[new_state])
        # print(old_state, action, reward, new_state)
        new_q_value = self.compute_new_q_value(old_val, reward, next_value)
        self.q_table[old_state][action] = new_q_value

    def print_values(self):
        states_x, states_y, _ = self.q_table.shape
        for state_x in range(states_x):
            for state_y in range(states_y):
                for a_id, a in enumerate(self.actions):
                    print("q(s_({},{}), {}) = {:.3f}".format(
                        state_x, state_y, a, self.q_table[state_x, state_y, a_id]))
                print()


if __name__ == '__main__':
    world = Gridworld(3, 3, goal_position=(3, 3), traps=[(2, 2)])
    agent = QLearning(world)
    agent.train(episodes=10000)
    agent.print_values()
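# A quick numeric check of the update rule returned above: with learning_rate = 0.5,
# discount_factor = 0.9, reward = 1, old Q = 0.0 and best next-state Q = 2.0, the
# new value should be 0.0 + 0.5 * (1 + 0.9 * 2.0 - 0.0) = 1.4. The constants are
# illustrative only.
learning_rate, discount_factor = 0.5, 0.9
old_val, reward, next_value = 0.0, 1.0, 2.0
new_q = old_val + learning_rate * (reward + discount_factor * next_value - old_val)
assert abs(new_q - 1.4) < 1e-9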
                value_increment += self.alpha * 20   # Reward 20 for reaching the goal
                reward += 20 * self.gamma**count     # Add to discounted sum of rewards
            else:
                value_increment -= self.alpha        # Reward -1 everywhere else
                reward -= self.gamma**count          # Add to discounted sum of rewards
            self.V[pos.coordinates()] += value_increment
            count += 1
            # Make the selected move
            pos = self.board.move(pos, chosen_move)
        return reward


if __name__ == "__main__":
    # Initialize the grid world with appropriate goal location, size, and obstacles
    gw = Gridworld(10, 5, [(7, 0), (7, 1), (7, 2)], (9, 1))
    # Initialize the TDLearning object
    g = TDLearning(gw)
    # Initialize a reward vector to see how reward evolves per episode
    reward = np.zeros(1000)
    # Visualize the policy before training occurs
    gw.visualize_world(Position(9, 1), g.V)
    for i in range(0, len(reward)):
        if i == 10 or i == 100:
            # Visualize policy at 10 training steps and 100 training steps
            gw.visualize_world(Position(9, 1), g.V)
        reward[i] = g.run_episode(gw.random_position(), Position(9, 1))
    # Visualize policy after full set of training steps
    gw.visualize_world(Position(9, 1), g.V)
    # Plot reward over time
    plt.scatter(np.arange(len(reward)), reward)
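# For reference, the textbook tabular TD(0) state-value update that a TDLearning
# class of this kind is built around. The fragment above shows only the reward
# bookkeeping; whether run_episode handles the bootstrap term exactly this way is
# an assumption, and the function below is a standalone sketch, not the class API.
def td0_update(V, s, r, s_next, alpha, gamma):
    # V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
    return V[s] + alpha * (r + gamma * V[s_next] - V[s])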
import numpy as np
import sys
import os

sys.path.append(os.path.join(sys.path[0], '..'))
from plotting import plotPiV
from gridworld import Gridworld

# V = ...
# Q = ...
V_converged = False
w = Gridworld()
gamma = .8

# init V at random
V = np.random.rand(Gridworld.states.shape[0])
V_old = 0
while not V_converged:
    # compute Q function
    # ...
    Q = Gridworld.reward + gamma * V
    # compute V function
    # ...
    V_diff = V - V_old
    V_diff = np.sum(np.absolute(V_diff).flat)
    if V_diff < 0.01:
        V_converged = True
    V = V_old

# convert policy for plot
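# A minimal, self-contained sketch of the value-iteration loop the skeleton above
# asks for, assuming a tabular MDP given by a transition tensor P (S x A x S) and
# a reward matrix R (S x A). These names and shapes are illustrative assumptions;
# the course Gridworld class may expose its model differently.
import numpy as np

def value_iteration(P, R, gamma=0.8, tol=0.01):
    n_states, n_actions, _ = P.shape
    V = np.random.rand(n_states)             # init V at random, as in the skeleton
    while True:
        # Q(s, a) = R(s, a) + gamma * sum_s' P(s, a, s') * V(s')
        Q = R + gamma * P.dot(V)
        V_new = Q.max(axis=1)                 # greedy backup over actions
        if np.sum(np.abs(V_new - V)) < tol:   # same L1 stopping rule as the skeleton
            return V_new, Q.argmax(axis=1)    # value function and greedy policy
        V = V_new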
    for w in range(0, len(content), 2):
        if int(content[w].split(' ')[1]) == nextautostate:
            ind = w
            break
    agentstate_parent = copy.deepcopy(agentstate)


if __name__ == '__main__':
    from gridworld import Gridworld

    nrows = 10
    ncols = 10
    nagents = 1
    initial = [88]
    targets = [[ncols + 1]]
    obstacles = [34, 44, 45, 54, 55, 64, 47]
    moveobstacles = [68]

    regionkeys = {'pavement', 'gravel', 'grass', 'sand', 'deterministic'}
    regions = dict.fromkeys(regionkeys, {-1})
    regions['deterministic'] = range(nrows * ncols)

    gwg = Gridworld(initial, nrows, ncols, nagents, targets, obstacles,
                    moveobstacles, regions)
    gwg.render()
    gwg.draw_state_labels()

    beliefparts = 4
    beliefcons = 10
    fname = 'counterexample.txt'
    run_counterexample(fname, gwg, beliefparts)
def getCliffGrid2():
    grid = [[' ', ' ', ' ', ' ', ' '],
            [8, 'S', ' ', ' ', 10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(grid)
        for p in pos:
            state[p[0], p[1]] = 4
        state[self.env.end[0], self.env.end[1]] = 8
        plt.imshow(state, cmap='hot')
        plt.show()


world = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 9, 9, 9, 2, 2, 1, 0],
                  [7, 0, 0, 9, 9, 9, 2, 8, 1, 0],
                  [0, 0, 0, 9, 9, 9, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]])

world_simple = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [7, 0, 0, 1, 1, 1, 2, 8, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]])

env = Gridworld(world_simple, (1, 0))
bot = Agent(env)
bot.train(100)
bot.play(1)
def lspi():
    gridworld = Gridworld()
    gamma = .8
    beta = 1
    numEpisodes = 10
    beta_factor = 1 - 5 * numEpisodes / 10000

    # Sample the state space with different grid sizes
    X_1, X_2 = np.meshgrid(np.arange(.5, 7, 1), np.arange(.5, 7, 1),
                           indexing='ij')
    X_1m, X_2m = np.meshgrid(np.arange(.5, 7, 0.5), np.arange(.5, 7, 0.5),
                             indexing='ij')

    # initialize the policy randomly. should give for each state in s (nx2) a
    # random action of the form (r,a), where r ~ Unif[0,1] and a ~ Unif[0,2pi].
    # pi = lambda s: ...

    # samples from initial distribution n starting positions (you can start
    # with a random initialization in the entire gridworld)
    # initialDistribution = lambda n: ...

    converged = False

    # generate an ndgrid over the state space for the centers of the basis
    # functions
    X1, X2, A1, A2 = np.meshgrid(np.arange(.5, 7, 1), np.arange(.5, 7, 1),
                                 np.arange(-1, 2), np.arange(-1, 2))

    # NOTE: the policy returns the action in polar coordinates while the basis
    # functions use cartesian coordinates!!! You have to convert between these
    # representations.

    # matrix of the centers
    c = np.column_stack((np.transpose(X1.flatten()),
                         np.transpose(X2.flatten()),
                         np.transpose(A1.flatten()),
                         np.transpose(A2.flatten())))

    # number of basis functions
    # k = ...

    # initialize weights
    # w = ...

    # compute bandwidths with median trick
    bw = np.zeros(4)
    for i in range(4):
        dist = pdist(c[:, [i]])
        bw[i] = np.sqrt(np.median(dist**2)) * .4

    # feature function (making use of rbf)
    # feature = lambda x_: ...

    # time step
    t = 0

    # initialize A and b
    # A = ...
    # b = ...

    while not converged:
        # Policy evaluation
        # sample data
        s1, a, r, s2 = sampleData(gridworld, pi, initialDistribution,
                                  numEpisodes, 50)

        # compute actions in cartesian space
        ac1, ac2 = pol2cart(a[:, 0, np.newaxis], a[:, 1, np.newaxis])

        # compute PHI
        # PHI = ...

        # compute PPI
        # PPI = ...

        # update A and b
        # A = ...
        # b = ...

        # compute new w
        w_old = w
        # w = ...

        # Policy improvement
        # pi = ...

        beta = beta_factor * beta
        t = t + 1

        # Check for convergence
        if np.abs(w - w_old).sum() / len(w) < 0.05:
            converged = True
        print(t, ' - ', beta, ' - ', np.abs(w - w_old).sum() / len(w))

    ### plotting
    a = policy(np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1))),
               feature, w, 0)
    ax1, ax2 = pol2cart(a[:, 0].reshape(-1, 1), a[:, 1].reshape(-1, 1))
    phi = rbf(np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1), ax1, ax2)),
              c, bw)
    Q = phi.dot(w)

    n_plot = len(X_1m)
    plot_a = np.hstack((ax1, ax2)).reshape((n_plot, n_plot, 2))
    plot_V = Q.reshape((n_plot, n_plot))
    plotPiV(plot_a, plot_V, vmin=-5, vmax=5, block=False)
    plotPiV(plot_a, plot_V, vmin=-5, vmax=5)
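# A minimal sketch of the Gaussian RBF feature map that the skeleton's
# `feature = lambda x_: ...` placeholder and the later `rbf(x, c, bw)` call appear
# to assume: one feature per center row of c, with the per-dimension bandwidths bw
# from the median trick. The exact normalization inside the exponent is an
# assumption, not the course implementation.
import numpy as np

def rbf(x, c, bw):
    # x: (n, d) query points, c: (k, d) centers, bw: (d,) bandwidths
    diff = x[:, np.newaxis, :] - c[np.newaxis, :, :]   # (n, k, d) pairwise offsets
    sq = (diff / bw) ** 2                              # scale each dimension by its bandwidth
    return np.exp(-0.5 * sq.sum(axis=-1))              # (n, k) feature matrix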
from gridworld import Gridworld
from dqn_helpers import GoalQWrapper
from replay_buffer import ReplayBuffer
import numpy as np
import itertools
from visualization import visualize_all_values

env = Gridworld(10)
gpu_num = 1
dqn = GoalQWrapper(env, 'dqn', 0)
buffer = ReplayBuffer(100000)
steps_before_train = 1000
viz_freq = 1000
batch_size = 32

s = env.reset()
for time in itertools.count():
    a = np.random.randint(0, 4)
    sp, r, t, info = env.step(a)
    buffer.append(s, a, r, sp, t)
    s = sp
    if time < steps_before_train:
        continue
    s_batch, a_batch, r_batch, sp_batch, t_batch = buffer.sample(batch_size)
    g_batch, _, _, _, _ = buffer.sample(batch_size)
    loss = dqn.train_batch_goals(time, s_batch, a_batch, sp_batch, g_batch)
    print(time, loss)
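# A minimal sketch of the uniform replay buffer interface used above: a capacity-
# bounded append(s, a, r, sp, t) and a sample(batch_size) that returns five arrays.
# Only that interface is taken from the script; the actual replay_buffer module may
# be implemented differently, so this class is illustrative.
import random
import numpy as np

class SimpleReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = []
        self.next_idx = 0

    def append(self, s, a, r, sp, t):
        data = (s, a, r, sp, t)
        if len(self.storage) < self.capacity:
            self.storage.append(data)
        else:
            self.storage[self.next_idx] = data   # overwrite the oldest entry once full
        self.next_idx = (self.next_idx + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        s, a, r, sp, t = map(np.array, zip(*batch))
        return s, a, r, sp, t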
plt.show()

world = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 9, 9, 1, 2, 2, 1, 0],
                  [7, 0, 0, 9, 9, 1, 2, 8, 1, 0],
                  [0, 0, 0, 9, 9, 1, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]])

env = Gridworld(world, (1, 0))
bot = Agent(env)
bot.train(2000)
bot.play(1)
from gridworld import Gridworld

grid = Gridworld("../gw/3.txt")
for i in range(0, 4):
    for j in range(0, 3):
        print(str(i) + "," + str(j))
        print(grid.successors((i, j)))
    def learn(self, old_state, new_state, action, reward):
        old_val = self.q_table[old_state][action]
        next_value = np.max(self.q_table[new_state])
        # print(old_state, action, reward, new_state)
        new_q_value = self.compute_new_q_value(old_val, reward, next_value)
        self.q_table[old_state][action] = new_q_value

    def print_values(self):
        height, width, _ = self.q_table.shape
        for r in range(1, height - 1):
            for c in range(1, width - 1):
                for a_id, a in enumerate(self.actions):
                    print("q(s{}{}, {}) = {:.3f}".format(
                        r, c, a, self.q_table[r, c, a_id]))
                print()


if __name__ == '__main__':
    from gridworld import Gridworld

    env = Gridworld(5, 5, goal_position=(1, 3), traps=[(2, 1)])
    env.render()
    agent = QLearning(env)
    agent.train(episodes=1000)
    agent.print_values()
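# The train() method called above is not shown in this snippet. A minimal
# epsilon-greedy training loop consistent with learn() and q_table would look
# roughly like the sketch below; the env.reset()/env.step() return values and the
# epsilon parameter are assumptions about the Gridworld API, not its actual interface.
import numpy as np

def train(self, episodes, epsilon=0.1):
    for _ in range(episodes):
        state = self.env.reset()
        done = False
        while not done:
            if np.random.rand() < epsilon:
                action = np.random.randint(len(self.actions))   # explore
            else:
                action = np.argmax(self.q_table[state])          # exploit current estimate
            next_state, reward, done = self.env.step(action)
            self.learn(state, next_state, action, reward)
            state = next_state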
import numpy as np
import random

from gridworld import Gridworld
from agents import RandomAgent, VAgent, QAgent, softmax

if __name__ == "__main__":
    gridworld = Gridworld(4, 4, 4, 8, {7}, 13)
    #agent = RandomAgent(gridworld)
    #agent = VAgent(gridworld)
    agent = QAgent(gridworld)
    agent.train(1000, 0.01, 1, 0.98, softmax=True)
    #agent.train(10, 0.1, 0.9, 0.98, softmax=True)
    print("Q after some learning:")
    print(sorted(agent.Q.items()))
    agent.plot_evaluation_data()
    #agent.env = Gridworld(4, 4, 4, 8, {6}, 13)
    #agent.evaluate(100)
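# A minimal sketch of softmax (Boltzmann) action selection, which the softmax=True
# flag above presumably toggles inside QAgent.train(). The temperature parameter
# and the flat q_values layout are illustrative assumptions, not the agents module API.
import numpy as np

def softmax_action(q_values, temperature=1.0):
    prefs = np.asarray(q_values, dtype=float) / temperature
    prefs -= prefs.max()                      # stabilise the exponentials
    probs = np.exp(prefs) / np.exp(prefs).sum()
    return np.random.choice(len(q_values), p=probs)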
from gridworld import Gridworld
import pygame as pg
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from vicero.algorithms.reinforce import Reinforce

scale = 32
env = Gridworld(scale, width=4, height=4)

pg.init()
screen = pg.display.set_mode((scale * len(env.board[0]), scale * len(env.board)))
env.screen = screen
clock = pg.time.Clock()

def plot(history):
    plt.figure(2)
    plt.clf()
    durations_t = torch.FloatTensor(history)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy(), c='lightgray', linewidth=1)
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
def getMyGrid():
    grid = [[2, ' ', ' ', ' ', 3],
            [' ', -2, ' ', -3, ' '],
            [' ', ' ', 'S', ' ', ' '],
            [' ', -1, ' ', -4, ' '],
            [1, ' ', ' ', ' ', 4]]
    return Gridworld(grid)
gridworld.colorRectangle(4, 1)
gridworld.colorRectangle(4, 2)
gridworld.colorRectangle(4, 3)
gridworld.colorRectangle(4, 4)
gridworld.colorRectangle(4, 5)
gridworld.colorRectangle(4, 6)
gridworld.colorRectangle(4, 7)
gridworld.colorRectangle(4, 8)
gridworld.plotLetter(4, 9, 'G')

sarsaEpisodeRewards = np.empty(maxEpisodes)
qLearningEpisodeRewards = np.empty(maxEpisodes)

for i in range(runs):
    gridworld = Gridworld(rows, columns, startX, startY, greedification, True,
                          False, epsGreedy, getReward, isTerminalState,
                          PolicyType.SARSA)
    currentEpisodeRewards = gridworld.runSimulation(maxEpisodes=maxEpisodes)
    # plotSpecialStates(gridworld)
    # gridworld.plotOptimalPath(title="SARSA - Optimal path, $\\epsilon$={}".format(epsGreedy))
    if i > 0:
        for j in range(maxEpisodes):
            sarsaEpisodeRewards[j] = (sarsaEpisodeRewards[j] + currentEpisodeRewards[j]) / 2
    else:
        sarsaEpisodeRewards = currentEpisodeRewards

for i in range(runs):
    gridworld = Gridworld(rows, columns, startX, startY, greedification, True,
                          False, epsGreedy, getReward, isTerminalState,
                          PolicyType.QLEARNING)
    currentEpisodeRewards = gridworld.runSimulation(maxEpisodes=maxEpisodes)
    # plotSpecialStates(gridworld)
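# The two runs above differ only in PolicyType: SARSA bootstraps from the action the
# behaviour policy actually takes next, Q-learning from the greedy action. A compact
# tabular sketch of the two update rules for comparison; Q is assumed to be a 2D
# numpy array indexed by (state, action), and alpha/gamma are illustrative names,
# not the Gridworld class internals.
def sarsa_update(Q, s, a, r, s_next, a_next, alpha, gamma):
    # on-policy target: uses the action actually chosen in the next state
    return Q[s, a] + alpha * (r + gamma * Q[s_next, a_next] - Q[s, a])

def q_learning_update(Q, s, a, r, s_next, alpha, gamma):
    # off-policy target: uses the greedy (max) action in the next state
    return Q[s, a] + alpha * (r + gamma * Q[s_next].max() - Q[s, a])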
def getCliffGrid():
    grid = [[' ', ' ', ' ', ' ', ' '],
            ['S', ' ', ' ', ' ', 10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(makeGrid(grid))
def getDiscountGrid():
    grid = [[' ', ' ', ' ', ' ', ' '],
            [' ', '#', ' ', ' ', ' '],
            [' ', '#', 1, '#', 10],
            ['S', ' ', ' ', ' ', ' '],
            [-10, -10, -10, -10, -10]]
    return Gridworld(grid)
from gridworld import Gridworld
from utils.visualize_grid import draw_gridworld

gw = Gridworld(10, 10, 0, 19)
gw.grid[1][1] = 1
gw.grid[1][2] = 1
gw.grid[1][3] = 1
gw.grid[1][4] = 1
print(gw.grid)
draw_gridworld(gw, gw.start, gw.goal, 0)