Example #1
def solve(world, goalInWorld):
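    """Flood-fill the grid from the goal: every reachable cell ends up with the
    cost of the cheapest path found so far and the direction of its next step
    towards the goal."""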
    solution = GridWorld(world.width, world.height)
    # print(solution.cells)
    solution.cells = [Cell(cell) for cell in world.cells]
    # print(solution.cells)
    goal = solution.get(goalInWorld.col, goalInWorld.line)
    goal.cost = 0
    closed = []
    opened = [goal]

    # reopen = 0

    while len(opened):
        # print(enigmaAsStr(solution, goal))
        # print("opened:", [(c.col, c.line) for c in opened])
        cell = opened.pop()
        closed.append(cell)

        # for adj in solution.getAdjacentCells(cell):
        for adj in solution.getAccessibleCells(cell):
            # print("cell", cell, "has got a adj", adj)
            if adj.reachable:  # we ignore obstacles
                direction = Direction.fromTo(adj, cell)
                cost = cell.cost + direction.cost()
                if adj.cost == -1:  # cell not costed yet
                    adj.direction = direction
                    adj.cost = cost
                    opened.append(adj)
                elif adj.cost > cost:  # a cheaper path was found: re-open the cell
                    # reopen += 1
                    # print("reopen", reopen)
                    adj.direction = direction
                    adj.cost = cost
                    opened.append(adj)
    return solution
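
A minimal usage sketch (not part of the original example): once solve() has filled in the cost map, a concrete path can be read back by walking downhill on the cost values. Only get(), getAccessibleCells(), reachable and cost from the example above are assumed, with the semantics shown there.

def extract_path(solution, startCol, startLine):
    # Walk from the start cell to the goal (cost == 0) by always stepping to
    # the accessible neighbour with the lowest cost.
    cell = solution.get(startCol, startLine)
    if not cell.reachable or cell.cost == -1:
        return []  # start is an obstacle or was never reached by solve()
    path = [cell]
    while cell.cost > 0:
        candidates = [adj for adj in solution.getAccessibleCells(cell)
                      if adj.reachable and adj.cost != -1]
        cell = min(candidates, key=lambda adj: adj.cost)
        path.append(cell)
    return path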
Example #2
def main_for_a_star():
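    """Build a 40x40 grid with obstacles, run A*, report the path length and the
    visited/opened cell counts, then animate the resulting route."""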
    grid_world = GridWorld(40, 40)

    Functions.create_obstacles_from_hex(grid_world)
    # Functions.create_random_obstacles(grid_world, 0.205)
    # Functions.create_fixed_obstacles(grid_world, 6)
    grid_world.scan_grid_and_generate_graph()
    grid_world.print_graph()
    graph_hex = grid_world.save_graph()
    grid_world.create_grid_ui(grid_world.m, grid_world.n,
                              (grid_world.start_x, grid_world.start_y),
                              (grid_world.end_x, grid_world.end_y),
                              grid_world.obstacles)
    best_path_length = run_a_star(grid_world)
    for r in grid_world.a_star_route:
        color = r[1]
        if color == grid_world.color_visited:
            grid_world.a_star_visited_count += 1
        if color == grid_world.color_final_path2:
            grid_world.a_star_opened_count += 1
    print(best_path_length, grid_world.a_star_visited_count,
          grid_world.a_star_opened_count)
    grid_world.dfs_route = []
    grid_world.move_on_given_route_a_star()
    tk.mainloop()
Example #3
    def openMDPGUI(self):
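        """Validate the settings form, close it, and launch the MDP GUI on the
        3x4 grid world with the chosen discount factor, rewards and probabilities."""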
        global w, g
        if self.checkSettingValues():
            self.master.destroy()

            df = float(self.discFactor.get())
            rews = list(map(lambda x: float(x.get()), self.rewValue))
            probs = list(map(lambda x: float(x.get()), self.probValue))

            w = GridWorld([
                [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT],
                [GridWorld.CELL_VOID, GridWorld.CELL_WALL, GridWorld.CELL_VOID, GridWorld.CELL_PIT],
                [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
            ])
            w.setDiscountFactor(df)
            w.setRewards(rews[0], rews[1], rews[2])
            w.setProbabilities(probs[0], probs[1], probs[2], probs[3])

            g = MDPGUI(w)
Example #4
def objectiveFunction(args):
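    """Hyper-parameter objective: train a Q-learning agent on GridWorld and
    return the negative mean test reward (negated so a minimiser maximises reward)."""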

    learning_rate, min_epsilon, max_epsilon, epsilon_decay, discount_factor = args

    num_of_episodes = 500
    max_steps = 1000

    environment = GridWorld()

    agentQ = Q_Agent(environment,
                     epsilon=max_epsilon,
                     learning_rate=learning_rate,
                     discount_factor=discount_factor)

    train(environment,
          agentQ,
          episodes=num_of_episodes,
          max_steps_per_episode=max_steps,
          min_epsilon=min_epsilon,
          max_epsilon=max_epsilon,
          epsilon_decay=epsilon_decay)
    mean_reward = test(environment, agentQ, episodes=1000)

    value_map = np.zeros((environment.height, environment.width))
    for x in range(environment.height):
        for y in range(environment.width):
            q_values_of_state = agentQ.q_table[(x, y)]
            maxValue = max(q_values_of_state.values())
            value_map[x, y] = maxValue

    if save:
        utils.plotValueFunction(value_map,
                                os.path.join(save_path, 'heatmap.jpg'))

    return -mean_reward
Example #5
def baseTester():
    ''' runs a somewhat comprehensive test'''
    try:
        import QLearner as ql
    except ImportError:
        print('QLearner module not found, aborting test')
        return

    #it is worth noting here that num_states can be 100 for any grid < 10x10 using the tuckerHash
    #we need a new hash algo if we are to use a grid outside those parameters
    baseKwargs = {'num_states':100, 'alpha':1.0, 'gamma':0.9, 'rar':0.5, 'radr':0.99, 'dyna':0, 'verbose':False}
    '''
    If you want to add your own test, add it here. Each test is a tuple:
    (csv file, expected convergence iterations, expected policy length, kwarg modifier, test name)
    '''
    myTestList = [('testEasyWorld.csv', 800, 13,{}, 'easy test'),
                  ('world01.csv', 7000, 16, {}, 'Tucker Test 1'),
                  ('world02.csv', 7000, 17, {}, 'Tucker Test 2'),
                  ('testGridWorld.csv', 5000, 20, {}, 'Leo Base Test'),
                  ('testGridWorld.csv', 18000, 20, {'alpha':.2}, 'Test Learning Rate'),
                  ('testEasyWorld.csv', 700, 13, {'rar': 0.05}, 'Test Exploration'),
                  ('testEasyWorld.csv', 700, 13, {'radr': 0.8}, 'Test Exploration Decay'),
                  ('testGridWorld.csv', 3000, 20, {'gamma':0.8}, 'Test Discount Rate'),
                  ('testGridWorld.csv', 1100, 20, {'dyna':100}, 'Test Dyna'),
                  ]
    
    fdtest = myTestList[7:9]
                  
    #for test in myTestList:
    for test in fdtest:
        print('-------------------------------')
        print(test[4])
        world = GridWorld(test[0])
        testKwargs = copy(baseKwargs)
        for k in test[3].keys():
            testKwargs[k] = test[3][k]
        print('parameters %s' % str(testKwargs))
        learner = ql.QLearner(**testKwargs)
        print(world.grid)
        myTester = QTester(world, learner)
        nIter = test[1]
        totalIter = nIter
        lastPolicyLength = 0
        #someone let me know if there's a better way to check for convergence time
        while totalIter < (test[1] * 1.4):
            myTester.nIter(nIter)
            nIter = int(.05 * test[1])
            myPolicy = myTester.getPolicy()
            policyLength = len(myPolicy)
            totalIter += nIter
            if (lastPolicyLength == policyLength) and (policyLength < 100):
                print('converged in approx %i iterations' % totalIter)
                print(policyLength, myPolicy, test[2])
                break
            lastPolicyLength = policyLength
        if (test[1] * 1.2 >= totalIter) and (policyLength == test[2]):
            print('*** TEST PASSED ***')
        else:
            print('xxx TEST FAILED xxx')
Example #6
 def setUp(self):
     self.world = GridWorld(10, 10)
     self.obstaclesProb = 0.2
     self.world.addRandomObstacles(
         math.floor(self.world.getLength() * self.obstaclesProb))
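     # use the first reachable (non-obstacle) cell as the goal for the tests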
     for cell in self.world.cells:
         if cell.reachable:
             self.goal = cell
             break
Example #7
def start_grid_mdp():
    """
    starts the program, restarts if the user wants to
    """
    grid = load_grid(get_file_path())
    world = GridWorld(grid)
    move_costs = get_move_cost()
    gamma = get_gamma()
    eval_steps = get_evaluation_steps()
    MDP(world, eval_steps, gamma, move_costs)
    if start_again():
        start_grid_mdp()
Example #8
 def createSmallMaze(self):
     #should be GridWorldSmall()
     self.GridWorldGame = GridWorld((5, 5))
     cols = self.GridWorldGame.size[0]
     rows = self.GridWorldGame.size[1]
     self.MAZE_X = cols * 32
     self.MAZE_Y = rows * 32
     FRAME = 8
     self.START_X = (self.MAX_X - cols * 32) / 2 + FRAME  # TODO: handle grids whose pixel size is not a multiple of 32
     self.START_Y = (self.MAX_Y - rows * 32) / 2 + FRAME
     self.smileyPos = (self.START_X, self.START_Y)
Example #9
def main():
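    """Compare independent and shared-state Q-learning on the predator-prey
    GridWorld, plot cumulative timesteps per episode, then replay a trajectory."""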
    env = GridWorld()
    _, es1, ts1 = independentQLearning(env, lambda x: x < 100, 0)
    qList, es2, ts2 = shareStateQLearning(env, lambda x: x < 100, 0)
    iQL = plt.scatter(es1, ts1, c='red')
    ssQL = plt.scatter(es2, ts2, c='blue')
    iQL.set_label("Independent")
    ssQL.set_label("5 Predators, 2 Prey, Share State")
    plt.xlabel("Episodes")
    plt.ylabel("Cumulative TimeSteps")
    plt.legend()
    plt.show()
    env.simulateTrajectory(qList)
Example #10
def evaluate(goals, EQ):
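    """Roll out the policy derived from EQ for at most 100 steps and return the
    accumulated (undiscounted) return G."""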
    env = GridWorld(goals=goals, T_states=T_states)
    policy = EQ_P(EQ)
    state = env.reset()
    done = False
    t = 0
    G = 0
    while not done and t < 100:
        action = policy[state]
        state_, reward, done, _ = env.step(action)
        state = state_
        G += reward
        t += 1
    return G
Example #11
    def buildBiasEngine(self):
        """Simulates MDPs with varying bias to build a bias inference engine."""

        print("Loading MDPs...\n")

        # Unnecessary progress bar for terminal
        bar = pyprind.ProgBar(len(self.test))
        for i in self.test:
            self.sims.append(
                GridWorld(self.grid, i, self.discount, self.tau, self.epsilon))
            bar.update()

        print("\nDone loading MDPs...")
Example #12
 def setUp(self):
     self.n = 5
     self.p = 1
     self.gridworld = GridWorld(self.n, self.p)
     self.go_right_policy = np.ones(self.n * self.n, dtype=int)
     self.discount = 0.9
     self.large_discount = 0.2
     self.policy = np.array(
             [['TERMINAL', 'RIGHT', 'RIGHT', 'RIGHT', 'TERMINAL'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP'],
              ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'UP']])
     self.policy_large_discount = np.array(
             [['TERMINAL', 'LEFT', 'RIGHT', 'RIGHT', 'TERMINAL'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP'],
              ['UP', 'LEFT', 'RIGHT', 'RIGHT', 'UP']])
Example #13
 def __init__(self,
              epsilon=0.01,
              greedy=False,
              alpha=0.1,
              gamma=0.95,
              visual=True,
              goal=(10, 8),
              agentPose=(1, 1, 'up'),
              showTrial=True,
              randomReset=False,
              epsilonStrat=1,
              epsilonFactor=500):
     """
     gridWorld: GridWorld object
     epsilon: value used for epsilon greedy search
     alpha: step size
     gamma: discount favtor
     """
     self.actionValues = Counter()
     self.epsilonFactor = epsilonFactor
     self.randomReset = randomReset
     self.epsilon = epsilon
     self.greedy = greedy
     self.epsilonStrat = epsilonStrat
     self.goal = goal
     self.Q = dict()
     self.gridWorld = GridWorld(goal,
                                agentPose,
                                visual=visual,
                                showTrial=showTrial,
                                randomReset=randomReset)
     self.actions = self.gridWorld.getActions()
     self.Model = dict()
     self.alpha = alpha
     self.PriorityQueue = PriorityQueue()
     self.gamma = gamma
     self.exp = []
     self.rewards = dict()
     self.rewardNums = dict()
     self.predecessors = defaultdict(set)
     self.initQValues()
Example #14
from GridWorld import GridWorld
from GridWorld import GridWorldAdditive
from ValueIteration import ValueIteration

# Run Value Iteration in different Grid World environments
if __name__ == "__main__":
    gamma = 0.9
    print("Grid world Value Iteration with discounted rewards gamma = %.2f\n" % gamma)
    terminals = {(0, 3): +1, (1, 3): -1}
    gw = GridWorld((3, 4), 0.8, [(1, 1)], terminals)
    vi = ValueIteration()
    values = vi.valueIteration(gw, gamma)
    gw.printValues(values)
    qvalues = vi.getQValues(gw, values, gamma)
    gw.printQValues(qvalues)
    policy = vi.getPolicy(gw, values, gamma)
    gw.printPolicy(policy)

    reward = -0.01
    print("Grid world Value Iteration with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
    values = vi.valueIteration(gwa, 1, 100)
    gwa.printValues(values)
    qvalues = vi.getQValues(gwa, values, 1)
    gwa.printQValues(qvalues)
    policy = vi.getPolicy(gwa, values, 1)
    gwa.printPolicy(policy)
 
    reward = -0.04
    print("Grid World with additive rewards = %.2f\n" % reward)
    gwa = GridWorldAdditive((3, 4), 0.8, [(1, 1)], terminals, reward)
Example #15
from GridWorld import GridWorld


g = GridWorld(3, 4)
policy={
    (0, 0):'R',
    (0, 1):'R',
    (0, 2):'R',
    (1, 0):'U',
    (1, 1):'U',
    (1, 2):'U',
    (1, 3):'U',
    (2, 0):'R',
    (2, 1):'R',
    (2, 2):'U',
    (2, 3):'L'
}

def print_policy(p, g):
    for r in range(g.row):
        print('------------------')
        for c in range(g.col):
            a = p.get((r,c),' ')
            print(' %s |'%a, end="")
        print("")

def print_value(V, g):
    for r in range(g.row):
        print('------------------')
        for c in range(g.col):
            v = V.get((r, c), 0)
            print(' %.2f|' % v, end="")
        print("")
Example #16
 def __init__(self):
     self.game = GridWorld((5, 5))
     self.squareCountGrid = self.game.createSquareCount()
     self.alpha = 0.1
     self.gamma = 0.9
Example #17
    vehState = start
    env_file = open("Environment.txt", "w")
    gridWorld = CreateEnvironment()
    gridWorld.create(env_file,
                     size_row='10',
                     size_col='10',
                     agent_row=str(vehState[0]),
                     agent_col=str(vehState[1]),
                     goal_row=str(goal[0]),
                     goal_col=str(goal[1]),
                     static_number='2',
                     static_list=[0, 3, 2, 4])
    env_file = open("Environment.txt", "r")
    text_in_file = env_file.readline()
    print(text_in_file)
    grid = GridWorld(text_in_file)
    gw = grid.gridDefine()
    #-------------------------------------------------------

    # initialize agent class and uav class
    Agent = agent(vehState)
    # define a model dictionary, which maps user inputs of learning model names to learning model function
    modelType = {
        "random": Agent.predict_Random,
        "standard": Agent.predict_Standard,
        "NN": Agent.predict_NN
    }
    UAV = uav(vehState)

    # initialize decision model (options = "random", "standard", or "NN")
    model = "random"  # will be a user input
Example #18
        self.drawUtilities(canvas)
        self.drawQValues(canvas)
        self.drawPolicy(canvas)


# ===========================================================================
# TEST
# ===========================================================================
if __name__ == '__main__':
    w = GridWorld([
        [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_EXIT],
        [GridWorld.CELL_VOID, GridWorld.CELL_WALL, GridWorld.CELL_VOID, GridWorld.CELL_PIT],
        [GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID, GridWorld.CELL_VOID],
    ], discountFactor=1)
    w.setRewards(-0.04, -1, 1)
    w.setProbabilities(0.8, 0.1, 0.1, 0)
    print("GridWorld-----------")
    print(w)
    print("----------------")

    print("\nPolicy----------")
    p = Policy(w)
Example #19
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

from GridWorld import GridWorld
from Agents import RandomAgent
from TOMnet import CharacterNet, TOMNet
''' initializing the gridworld and the agent'''
Height = 11
Width = 11
Num_Goals = 4

sample_world = GridWorld(Height, Width, Num_Goals)
sample_world.generateWalls()
sample_world.generateGoals()

random_agent = RandomAgent(sample_world)
sample_world.addAgent(random_agent)
# sample_world.showImage()
''' initializing ToM network '''
batch_size = 7
input_channel = 11  # 1 wall + 4 goals + 1 agent + 5 action channels
height = 11
width = 11
hidden_channel_cn = 8
hidden_channel_tn = 32
output_size = 2
input_size = (batch_size, input_channel, height, width)
Example #20
        return -1
    grid_world.is_visited[x][y] = 1
    grid_world.dfs_route.append((x, y))
    random.shuffle(adjacent_nodes)
    for l in adjacent_nodes:
        if grid_world.is_visited[l[0]][l[1]] == 0:
            ret_val = random_dfs(grid_world, str(l[0]) + "," + str(l[1]))
            if ret_val == -1:
                grid_world.dfs_best_route.append((l[0], l[1]))
                return -1


def run_dfs(grid_world):
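    """Run a randomized DFS from the start cell and reverse the recovered best
    route so that it runs from start to goal."""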
    # dfs(grid_world, grid_world.start_key)
    random_dfs(grid_world, grid_world.start_key)
    grid_world.dfs_best_route.append((grid_world.start_x, grid_world.start_y))
    grid_world.dfs_best_route = grid_world.dfs_best_route[::-1]


grid_world = GridWorld()
Functions.create_obstacles_from_hex(grid_world)
# Functions.create_random_obstacles(grid_world, 0.205)
# Functions.create_fixed_obstacles(grid_world, 6)
grid_world.scan_grid_and_generate_graph()
grid_world.print_graph()
grid_world.create_grid_ui(grid_world.m, grid_world.n, (grid_world.start_x, grid_world.start_y),
                          (grid_world.end_x, grid_world.end_y), grid_world.obstacles)
run_dfs(grid_world)
grid_world.move_on_given_route()
tk.mainloop()
Example #21
            actual_best = np.argmax(rewards)

            if curr_best != actual_best:
                is_stable = False

            self.policy[i] = np.eye(self.env.nA)[actual_best]

        return is_stable

    def get_policy(self):
        return np.argmax(self.policy, axis=1)

    def update(self):
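        # One policy-iteration sweep: evaluate the current policy, then improve it.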
        self.sweeps += 1
        self.valueFunction = self.evaluate_policy(self.policy)
        return self.sweeps, self.update_policy()

    def get_action(self, state):
        return np.random.choice(np.arange(len(self.policy[state])),
                                p=self.policy[state])


if __name__ == "__main__":
    PIA = PolicyIteration(GridWorld())
    x = False
    while not x:
        y, x = PIA.update()

    print(PIA.valueFunction.reshape(8, 8))
Example #22
            # RL take action and get next state and reward
            _, next_state_index, reward, done = env.step(action)

            # RL choose action based on next state
            next_action = RL.choose_action(str(next_state_index))

            # RL learn from this transition (s, a, r, s', a') ==> Sarsa
            RL.learn(str(state), action, reward, str(next_state_index), next_action)

            # swap state and action
            state = next_state_index
            action = next_action

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = GridWorld()
    RL = Sarsa(actions=list(range(env.n_actions)))

    env.after(10000, update)
    env.mainloop()
    print(RL.q_table)
Example #23
            grid_world.aco_current_route = grid_world.aco_current_route[:-1]
            all_paths.append(grid_world.aco_current_route)
            grid_world.aco_current_route = []
            grid_world.is_visited = [[0] * grid_world.n
                                     for _ in range(grid_world.m)]
        current_best_path = get_current_best_path(all_paths)
        update_pheromone(all_paths)
        evaporation()
        best_path = get_best_path(best_path, current_best_path)
        print(i, len(best_path), len(current_best_path))
        if len(best_path) == 0:
            return
        grid_world.aco_best_route = best_path


grid_world = GridWorld(40, 40)
# Functions.create_grid_from_hex(grid_world)
Functions.create_random_obstacles(grid_world, 0.105)
# Functions.create_fixed_obstacles(grid_world, 6)
grid_world.scan_grid_and_generate_graph()
grid_world.print_graph()
grid_world.save_graph()

pheromone_table = dict()
init_pheromone(grid_world)

alpha = 2
beta = 5

run_aco(grid_world)
Example #24
from GridWorld import GridWorld
from Robot import Robot

env = GridWorld("grid-small.txt")
env.print_map()
gamma = 0.9

start = [0, 0]
agent = Robot(env, gamma)
epochs = 500
decay = 0.99
rvm_max_iter = 500
max_step = 1000
epsilon = 1
epsilon_threshold = 0.001
verbose = True
verbose_iteration = 1
steps, rewards = agent.learn(epochs, decay, rvm_max_iter, max_step, epsilon, start, verbose, verbose_iteration)
path = agent.get_path(start)
print(path)
Example #25
from Evaluation import Evaluation
from GridWorld import GridWorld
from Learning import Learning

# Size of the grid world
row = 5
column = 5

LearningAgentSpan = 10  # lifespan of the learning agent
LearningTimes = 100  # number of learning runs
P = 5  # reward
T = 10  # number of steps to trace back

EvaluationAgentSpan = 10  # lifespan of the evaluation agent
EvaluationTimes = 100  # number of evaluation trials

grid_world = GridWorld(row, column)
grid_world.make_grid_world()

learning = Learning(grid_world.get_grid_world(), row, column)
learning.do_learning(LearningAgentSpan, LearningTimes, P, T)

evaluation = Evaluation(learning.get_grid_world(), row, column)
evaluation.evaluation(EvaluationAgentSpan, EvaluationTimes)
Example #26
def test():
    """
    Testing function : Training for a given game and 2 agents, and Battle between both agents
    """

    #############################################
    ################ Choose Game ################
    #############################################
    #game = Soccer()
    #game = RockPaperScissors()
    game = GridWorld()

    playerA, playerB = game.players()[0], game.players()[1]  #player ID

    #############################################
    ### Choose Player 1 and training opponent ###
    #############################################
    player1 = WoLF_PHC_Agent(game, playerA)
    opponent1 = WoLF_PHC_Agent(game, playerB)

    #############################################
    ### Choose Player 2 and training opponent ###
    #############################################
    player2 = Minimax_Q_Agent(game, playerB)
    opponent2 = Random_Agent(game, playerA)

    #############################################
    ############## Train Policies ###############
    #############################################
    nb_iterations = 500000
    timestamp = 1000

    start_time = time.time()
    policy1, policyb = Scheduler(game, nb_iterations, timestamp, player1,
                                 opponent1)
    print("Learning Time Player 1: ", time.time() - start_time)

    start_time = time.time()
    policyc, policy2 = Scheduler(game, nb_iterations, timestamp, opponent2,
                                 player2)
    print("Learning Time Player 2: ", time.time() - start_time)

    #policy 1 : distances between Nash 1 and 2
    optimal_Nash1_player0 = GridWorld_Nash1_Player0_Agent(game)
    optimal_Nash1_player0.compute_policy()
    optimal_Nash2_player0 = GridWorld_Nash2_Player0_Agent(game)
    optimal_Nash2_player0.compute_policy()
    d10 = []
    d20 = []
    for i in range(len(policy1)):
        d10.append(distance(policy1[i], optimal_Nash1_player0.pi))
        d20.append(distance(policy1[i], optimal_Nash2_player0.pi))

    #policy 2 : distances between Nash 1 and 2
    optimal_Nash1_player1 = GridWorld_Nash1_Player1_Agent(game)
    optimal_Nash1_player1.compute_policy()
    optimal_Nash2_player1 = GridWorld_Nash2_Player1_Agent(game)
    optimal_Nash2_player1.compute_policy()
    d11 = []
    d21 = []
    for i in range(len(policy2)):
        d11.append(distance(policy2[i], optimal_Nash1_player1.pi))
        d21.append(distance(policy2[i], optimal_Nash2_player1.pi))

    #plot :
    plt.plot(d10, 'b')
    plt.plot(d20, 'b--')
    plt.plot(d11, 'r')
    plt.plot(d21, 'r--')
    plt.show()

    #Battle :
    print("Battle")
    nbplay = 1000
    affrontement(game, policy1[-1], policy2[-1], nbplay)
    return (policy1, policy2)


#test()
Example #27
import numpy as np
from matplotlib import pyplot as plt
import deepdish as dd
from GridWorld import GridWorld
from library import *

env = GridWorld()
T_states = [(3, 3), (3, 9), (9, 3), (9, 9), (1, 1), (1, 2), (1, 3), (1, 4),
            (1, 5), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (11, 1), (11, 2),
            (11, 3), (11, 4), (11, 5), (11, 7), (11, 8), (11, 9), (11, 10),
            (2, 1), (3, 1), (4, 1), (5, 1), (7, 1), (8, 1), (9, 1), (10, 1),
            (2, 11), (3, 11), (4, 11), (5, 11), (6, 11), (8, 11), (9, 11),
            (10, 11), (11, 11)]

###################################### Qs
BTasksQ = [[t] for t in T_states]
###################################### EQs
Bases = []
n = int(np.ceil(np.log2(len(T_states))))
m = (2**n) // 2
for i in range(n):
    Bases.append([])
    b = False
    for j in range(0, 2**n):
        if j >= len(T_states):
            break
        if b:
            Bases[i].append(1)  #1=True=rmax
        else:
            Bases[i].append(0)  #0=False=rmin
        if (j + 1) % m == 0:
Example #28
def init_gridworld(random_player=False, random_mines=False, maze=False):
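    """(Re)initialise the module-level grid_world with the requested layout options."""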
    global grid_world
    grid_world = GridWorld(random_player, random_mines, maze)
Example #29
import numpy as np
import tensorflow as tf

from GridWorld import GridWorld

np.random.seed(20)
tf.set_random_seed(20)

MAX_EPISODE = 1000
MAX_EP_STEPS = 1000  # maximum time step in one episode
GAMMA = 0.9  # reward discount in TD error
lr_actor = 0.001
lr_critic = 0.01

grid_world_h = 5
grid_world_w = 5
env = GridWorld(grid_world_h, grid_world_w)

n_features = 2
n_actions = 4


class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess
        self.state = tf.placeholder(tf.float32, [1, n_features], "state")
        self.action = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")

        with tf.variable_scope('Actor'):
            state_layer = tf.layers.dense(
                inputs=self.state,
Example #30
    t = 0
    G = 0
    while not done and t < 100:
        action = policy[state]
        state_, reward, done, _ = env.step(action)
        state = state_
        G += reward
        t += 1
    return G


for t in range(len(types)):
    print("type: ", t)

    # Learning universal bounds (min and max tasks)
    env = GridWorld(goals=T_states, dense_rewards=not types[t][0])
    EQ_max, _ = Goal_Oriented_Q_learning(env, maxiter=maxiter)

    env = GridWorld(goals=T_states,
                    goal_reward=-0.1,
                    dense_rewards=not types[t][0])
    EQ_min, _ = Goal_Oriented_Q_learning(env, maxiter=maxiter)

    # Learning base tasks and doing composed tasks
    goals = Bases[0]
    goals = [[pos, pos] for pos in goals]
    env = GridWorld(goals=goals,
                    dense_rewards=not types[t][0],
                    T_states=T_states if types[t][1] else goals)
    A, stats1 = Goal_Oriented_Q_learning(
        env, maxiter=maxiter, T_states=None if types[t][1] else T_states)