Example #1
def getMazeGrid():
    grid = [[' ', ' ', ' ', +1], ['#', '#', ' ', '#'], [' ', '#', ' ', ' '],
            [' ', '#', '#', ' '], ['S', ' ', ' ', ' ']]
    return Gridworld(grid)
Example #2
    def compute_new_q_value(self, old_val, reward, next_value):
        # One-step Q-learning update:
        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        return (old_val + self.learning_rate *
                (reward + self.discount_factor * next_value - old_val))

    def learn(self, old_state, new_state, action, reward):
        old_val = self.q_table[old_state][action]
        next_value = np.max(self.q_table[new_state])
        # print(old_state, action, reward, new_state)
        new_q_value = self.compute_new_q_value(old_val, reward, next_value)

        self.q_table[old_state][action] = new_q_value

    def print_values(self):
        states_x, states_y, _ = self.q_table.shape

        for state_x in range(states_x):
            for state_y in range(states_y):
                for a_id, a in enumerate(self.actions):
                    print("q(s_({},{}), {}) = {:.3f}".format(
                        state_x, state_y, a, self.q_table[state_x, state_y,
                                                          a_id]))
                print()


if __name__ == '__main__':
    world = Gridworld(3, 3, goal_position=(3, 3), traps=[(2, 2)])

    agent = QLearning(world)
    agent.train(episodes=10000)

    agent.print_values()
Example #3
				value_increment += self.alpha*20 # Reward 20 for reaching the goal
				reward += 20*self.gamma**count # Add to discounted sum of rewards
			else:
				value_increment -= self.alpha # Reward -1 everywhere else
				reward -= self.gamma**count # Add to discounted sum of rewards
			self.V[pos.coordinates()] += value_increment
			count += 1
			
			# Make the selected move
			pos = self.board.move(pos, chosen_move)
			
		return reward
	
if __name__ == "__main__":
	# Initialize the grid world with appropriate goal location, size, and obstacles
	gw = Gridworld(10,5,[(7,0),(7,1),(7,2)],(9,1))
	# Initialize the TDLearning object
	g = TDLearning(gw)
	# Initialize a reward vector to see how reward evolves per episode
	reward = np.zeros(1000)
	# Visualize the policy before training occurs
	gw.visualize_world(Position(9,1),g.V)
	for i in range(0,len(reward)):
		if i == 10 or i == 100:
			# Visualize policy at 10 training steps and 100 training steps
			gw.visualize_world(Position(9,1),g.V)
		reward[i] = g.run_episode(gw.random_position(),Position(9,1))
	# Visualize policy  after full set of training steps
	gw.visualize_world(Position(9,1),g.V)
	# Plot reward over time
	plt.scatter(np.arange(len(reward)), reward)
	plt.show()
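Example #4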
import numpy as np

import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))

from plotting import plotPiV
from gridworld import Gridworld

# V = ...
# Q = ...
V_converged = False
w = Gridworld()
gamma = .8
# init V at random
V = np.random.rand(w.states.shape[0])
V_old = np.zeros_like(V)
while not V_converged:

    # compute Q function
    # ...
    Q = w.reward + gamma * V
    # compute V function
    # ...

    V_diff = np.sum(np.abs(V - V_old))
    if V_diff < 0.01:
        V_converged = True
    # keep the current estimate for the next iteration's convergence check
    V_old = V
# convert policy for plot
Example #5
        for w in range(0,len(content),2):
            if int(content[w].split(' ')[1]) == nextautostate:
                ind = w
                break

        agentstate_parent = copy.deepcopy(agentstate)        

        
if __name__ == '__main__':
    from gridworld import Gridworld

    nrows = 10
    ncols = 10
    nagents = 1
    initial = [88]
    targets = [[ncols+1]]
    obstacles = [34,44,45,54,55,64,47]
    moveobstacles = [68]

    regionkeys = {'pavement','gravel','grass','sand','deterministic'}
    regions = dict.fromkeys(regionkeys,{-1})
    regions['deterministic']= range(nrows*ncols)

    gwg = Gridworld(initial, nrows, ncols, nagents, targets, obstacles, moveobstacles, regions)
    gwg.render()
    gwg.draw_state_labels()
    beliefparts = 4
    beliefcons = 10
    fname = 'counterexample.txt'
    run_counterexample(fname, gwg, beliefparts)
Example #6
def getCliffGrid2():
    grid = [[' ', ' ', ' ', ' ', ' '], [8, 'S', ' ', ' ', 10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(grid)
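Example #7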
                for p in pos:
                    state[p[0], p[1]] = 4
                state[self.env.end[0], self.env.end[1]] = 8

                plt.imshow(state, cmap='hot')
                plt.show()


world = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 9, 9, 9, 2, 2, 1, 0],
                  [7, 0, 0, 9, 9, 9, 2, 8, 1, 0],
                  [0, 0, 0, 9, 9, 9, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                  [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]])

world_simple = np.array([[0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [7, 0, 0, 1, 1, 1, 2, 8, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
                         [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]])

env = Gridworld(world_simple, (1, 0))
bot = Agent(env)

bot.train(100)

bot.play(1)
Example #8
def lspi():
    gridworld = Gridworld()

    gamma = .8
    beta = 1
    numEpisodes = 10
    beta_factor = 1 - 5 * numEpisodes / 10000

    # Sample the state space with different grid sizes
    X_1, X_2 = np.meshgrid(np.arange(.5, 7, 1),
                           np.arange(.5, 7, 1),
                           indexing='ij')
    X_1m, X_2m = np.meshgrid(np.arange(.5, 7, 0.5),
                             np.arange(.5, 7, 0.5),
                             indexing='ij')

    # initialize the policy randomly. should give for each state in s (nx2) a
    # random action of the form (r,a), where r ~ Unif[0,1] and a ~ Unif[0,2pi].
    # pi = lambda s: ...
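    # A possible sketch of such a policy (an assumption, not part of the
    # original template; `s` is taken to be an (n, 2) array of states):
    # pi = lambda s: np.column_stack((np.random.rand(s.shape[0]),
    #                                 np.random.rand(s.shape[0]) * 2 * np.pi))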

    # samples from initial distribution n starting positions (you can start
    # with a random initialization in the entire gridworld)
    # initialDistribution = lambda n: ...
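    # A possible sketch (assumes start positions uniform over a 7 x 7 world,
    # matching the meshgrids above):
    # initialDistribution = lambda n: np.random.rand(n, 2) * 7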

    converged = False

    # generate an ndgrid over the state space for the centers of the basis
    # functions
    X1, X2, A1, A2 = np.meshgrid(np.arange(.5, 7, 1), np.arange(.5, 7, 1),
                                 np.arange(-1, 2), np.arange(-1, 2))
    # NOTE: the policy returns the action in polar coordinates while the basis
    # functions use cartesian coordinates!!! You have to convert between these
    # representations.

    # matrix of the centers
    c = np.column_stack(
        (np.transpose(X1.flatten()), np.transpose(X2.flatten()),
         np.transpose(A1.flatten()), np.transpose(A2.flatten())))

    # number of basis functions
    # k = ...
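    # Presumably one basis function per center, i.e.:
    # k = c.shape[0]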

    # initialize weights
    # w = ...

    # compute bandwiths with median trick
    bw = np.zeros(4)
    for i in range(4):
        dist = pdist(c[:, [i]])
        bw[i] = np.sqrt(np.median(dist**2)) * .4

    # feature function (making use of rbf)
    # feature = lambda x_: ...
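    # A possible sketch, reusing the same rbf(...) call that the plotting code
    # below applies with centers `c` and bandwidths `bw`:
    # feature = lambda x_: rbf(x_, c, bw)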

    # time step
    t = 0

    # initialize A and b
    # A = ...
    # b = ...
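    # A common LSTD-Q style initialization (assuming k basis functions) would be:
    # A = np.zeros((k, k))
    # b = np.zeros(k)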

    while not converged:
        # Policy evaluation
        # sample data
        s1, a, r, s2 = sampleData(gridworld, pi, initialDistribution,
                                  numEpisodes, 50)

        # compute actions in cartesian space
        ac1, ac2 = pol2cart(a[:, 0, np.newaxis], a[:, 1, np.newaxis])

        # compute PHI
        # PHI = ...

        # compute PPI
        # PPI = ...

        # update A and b
        # A = ...
        # b = ...

        # compute new w
        w_old = w
        # w = ...

        # Policy improvement
        # pi = ...

        beta = beta_factor * beta
        t = t + 1

        # Check for convergence
        if np.abs(w - w_old).sum() / len(w) < 0.05:
            converged = True

        print(t, ' - ', beta, ' - ', np.abs(w - w_old).sum() / len(w))

        ### plotting
        a = policy(np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1))),
                   feature, w, 0)

        ax1, ax2 = pol2cart(a[:, 0].reshape(-1, 1), a[:, 1].reshape(-1, 1))
        phi = rbf(
            np.hstack((X_1m.reshape(-1, 1), X_2m.reshape(-1, 1), ax1, ax2)), c,
            bw)
        Q = phi.dot(w)
        n_plot = len(X_1m)

        plot_a = np.hstack((ax1, ax2)).reshape((n_plot, n_plot, 2))
        plot_V = Q.reshape((n_plot, n_plot))

        plotPiV(plot_a, plot_V, vmin=-5, vmax=5, block=False)

    plotPiV(plot_a, plot_V, vmin=-5, vmax=5)
Example #9
from gridworld import Gridworld
from dqn_helpers import GoalQWrapper
from replay_buffer import ReplayBuffer
import numpy as np
import itertools
from visualization import visualize_all_values

env = Gridworld(10)
gpu_num = 1

dqn = GoalQWrapper(env, 'dqn', 0)
buffer = ReplayBuffer(100000)

steps_before_train = 1000
viz_freq = 1000
batch_size = 32

s = env.reset()
for time in itertools.count():

    a = np.random.randint(0, 4)
    sp, r, t, info = env.step(a)
    buffer.append(s, a, r, sp, t)
    s = sp
    if time < steps_before_train:
        continue

    s_batch, a_batch, r_batch, sp_batch, t_batch = buffer.sample(batch_size)
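    # draw a second batch and reuse its states as randomly sampled goals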
    g_batch, _, _, _, _ = buffer.sample(batch_size)
    loss = dqn.train_batch_goals(time, s_batch, a_batch, sp_batch, g_batch)
    print(time, loss)
Example #10
                plt.show()


world = np.array([
    [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
    [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
    [0, 0, 0, 9, 9, 1, 2, 2, 1, 0],
    [7, 0, 0, 9, 9, 1, 2, 8, 1, 0],
    [0, 0, 0, 9, 9, 1, 2, 2, 1, 0],
    [0, 0, 0, 1, 1, 1, 2, 2, 1, 0],
    [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]
])


env = Gridworld(world, (1, 0))
bot = Agent(env)

bot.train(2000)

bot.play(1)
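Example #11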
from gridworld import Gridworld

grid = Gridworld("../gw/3.txt")

for i in range(0, 4):
    for j in range(0, 3):
        print(str(i) + "," + str(j))
        print(grid.successors((i, j)))
Example #12
    def learn(self, old_state, new_state, action, reward):
        old_val = self.q_table[old_state][action]
        next_value = np.max(self.q_table[new_state])
        # print(old_state, action, reward, new_state)
        new_q_value = self.compute_new_q_value(old_val, reward, next_value)

        self.q_table[old_state][action] = new_q_value

    def print_values(self):
        height, width, _ = self.q_table.shape

        for r in range(1, height - 1):
            for c in range(1, width - 1):
                for a_id, a in enumerate(self.actions):
                    print("q(s{}{}, {}) = {:.3f}".format(
                        r, c, a, self.q_table[r, c, a_id]))
                print()


if __name__ == '__main__':
    from gridworld import Gridworld

    env = Gridworld(5, 5, goal_position=(1, 3), traps=[(2, 1)])
    env.render()

    agent = QLearning(env)
    agent.train(episodes=1000)

    agent.print_values()
Example #13
import numpy as np
import random
from gridworld import Gridworld
from agents import RandomAgent, VAgent, QAgent, softmax

if __name__ == "__main__":
    gridworld = Gridworld(4, 4, 4, 8, {7}, 13)
    #agent = RandomAgent(gridworld)
    #agent = VAgent(gridworld)
    agent = QAgent(gridworld)

    agent.train(1000, 0.01, 1, 0.98, softmax=True)
    #agent.train(10, 0.1, 0.9, 0.98, softmax = True)
    #agent.train(10, 0.1, 0.9, 0.98, softmax = True)
    print("Q after some learning:")
    print(sorted(agent.Q.items()))
    agent.plot_evaluation_data()
    #agent.env = Gridworld(4, 4, 4, 8, {6}, 13)
    #agent.evaluate(100)
Example #14
from gridworld import Gridworld
import pygame as pg
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from vicero.algorithms.reinforce import Reinforce

scale = 32
env = Gridworld(scale, width=4, height=4)

pg.init()
screen = pg.display.set_mode((scale * len(env.board[0]), scale * len(env.board)))
env.screen = screen
clock = pg.time.Clock()

def plot(history):
    plt.figure(2)
    plt.clf()
    durations_t = torch.FloatTensor(history)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy(), c='lightgray', linewidth=1)

    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
Example #15
def getMyGrid():
    grid = [[2, ' ', ' ', ' ', 3], [' ', -2, ' ', -3, ' '],
            [' ', ' ', 'S', ' ', ' '], [' ', -1, ' ', -4, ' '],
            [1, ' ', ' ', ' ', 4]]
    return Gridworld(grid)
Example #16
    for col in range(1, 9):
        gridworld.colorRectangle(4, col)

    gridworld.plotLetter(4, 9, 'G')

sarsaEpisodeRewards = np.empty(maxEpisodes)
qLearningEpisodeRewards = np.empty(maxEpisodes)

for i in range(runs):
    gridworld = Gridworld(rows, columns, startX, startY, greedification, True, False, epsGreedy, getReward, isTerminalState, PolicyType.SARSA)
    currentEpisodeRewards = gridworld.runSimulation(maxEpisodes=maxEpisodes)

    # plotSpecialStates(gridworld)
    # gridworld.plotOptimalPath(title="SARSA - Optimal path, $\\epsilon$={}".format(epsGreedy))
    if i > 0:
        for j in range(maxEpisodes):
            sarsaEpisodeRewards[j] = (sarsaEpisodeRewards[j] + currentEpisodeRewards[j]) / 2
    else:
        sarsaEpisodeRewards = currentEpisodeRewards

for i in range(runs):
    gridworld = Gridworld(rows, columns, startX, startY, greedification, True, False, epsGreedy, getReward, isTerminalState, PolicyType.QLEARNING)
    currentEpisodeRewards = gridworld.runSimulation(maxEpisodes=maxEpisodes)

    # plotSpecialStates(gridworld)
Example #17
def getCliffGrid():
    grid = [[' ', ' ', ' ', ' ', ' '], ['S', ' ', ' ', ' ', 10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(makeGrid(grid))
Example #19
def getDiscountGrid():
    grid = [[' ', ' ', ' ', ' ', ' '], [' ', '#', ' ', ' ', ' '],
            [' ', '#', 1, '#', 10], ['S', ' ', ' ', ' ', ' '],
            [-10, -10, -10, -10, -10]]
    return Gridworld(grid)
Example #20
from gridworld import Gridworld
from utils.visualize_grid import draw_gridworld

gw = Gridworld(10, 10, 0, 19)
gw.grid[1][1] = 1
gw.grid[1][2] = 1
gw.grid[1][3] = 1
gw.grid[1][4] = 1
print(gw.grid)
draw_gridworld(gw, gw.start, gw.goal, 0)