import numpy as np

# BOARD_ROWS, BOARD_COLS, V, ACTIONS, p, grid2state_dict and Environment are
# module-level objects defined elsewhere in the project.
def policy_evaluation(tot_policy, GAMMA, EPSILON):
    delta = 1  # any value larger than EPSILON, so the loop starts
    while delta > EPSILON:
        delta = 0
        for row in range(BOARD_ROWS):
            for col in range(BOARD_COLS):
                state_value_old = V[row][col]
                new_state_value = 0.0  # expected value over all actions of the policy
                for action in ACTIONS:
                    proba_act = tot_policy[grid2state_dict[(row, col)]][action]
                    # the intended action can slip to an alternative with probabilities p
                    for choix, prob in zip(ACTIONS[action], p):
                        env = Environment(state=(row, col), deterministic=True)
                        next_state = env.nextPosition(choix)
                        reward = env.giveReward()
                        new_state_value += proba_act * prob * (
                            reward + GAMMA * V[next_state[0]][next_state[1]])
                V[row][col] = new_state_value
                v_change = np.abs(state_value_old - new_state_value)
                delta = np.maximum(delta, v_change)
        print("\nDelta = " + str(delta) + "\n")
    print("V = " + str(V))
    return V
def q_from_v(V, state, GAMMA):
    """One-step lookahead: the value of each action in `state` given the current V."""
    q = {act: 0.0 for act in ACTIONS}
    for action in q:
        for choix, prob in zip(ACTIONS[action], p):
            env = Environment(state=state, deterministic=True)
            next_state = env.nextPosition(choix)
            reward = env.giveReward()
            q[action] += prob * (reward + GAMMA * V[next_state[0]][next_state[1]])
    return q
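The two helpers above fit together into policy iteration: evaluate the current policy with policy_evaluation, then act greedily with respect to the resulting values. A minimal sketch of that improvement step is shown below, assuming the same module-level grid2state_dict, BOARD_ROWS, BOARD_COLS and ACTIONS objects used above; the name policy_improvement is purely illustrative.

def policy_improvement(V, GAMMA):
    new_policy = {}
    for row in range(BOARD_ROWS):
        for col in range(BOARD_COLS):
            # one-step lookahead gives the value of each action in this state
            q = q_from_v(V, (row, col), GAMMA)
            best_action = max(q, key=q.get)
            # greedy policy: probability 1 for the best action, 0 elsewhere
            new_policy[grid2state_dict[(row, col)]] = {
                a: 1.0 if a == best_action else 0.0 for a in q}
    return new_policy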
Example #3
    def __init__(self):
        self.states = []
        self.actions = [UP, DOWN, LEFT, RIGHT]
        self.State = Environment()
        self.isEnd = self.State.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict
Example #4
    def reset(self):
        self.states = []
        self.State = Environment()
        self.isEnd = self.State.isEnd
Example #5
class Agent:
    def __init__(self):
        self.states = []
        self.actions = [UP, DOWN, LEFT, RIGHT]
        self.State = Environment()
        self.isEnd = self.State.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9
        
        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict

    def chooseAction(self):
        # epsilon-greedy: explore with probability exp_rate, otherwise pick the
        # action with the highest current Q-value
        mx_nxt_reward = -float("inf")
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            current_position = self.State.state
            for a in self.actions:
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
            # print("current pos: {}, greedy action: {}".format(self.State.state, action))
        return action

    def takeAction(self, action):
        position = self.State.nextPosition(action)
        # update State
        self.State.state = position
        return self.State

    def reset(self):
        self.states = []
        self.State = Environment()
        self.isEnd = self.State.isEnd

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # at the end of an episode, propagate the reward back through the visited states
            if self.State.isEnd:
                # back propagate
                reward = self.State.giveReward()
                for a in self.actions:
                    self.Q_values[self.State.state][a] = reward
                print("Game End Reward", reward)
                for state, act in reversed(self.states):
                    current_q_value = self.Q_values[state][act]
                    # move the Q-value towards the discounted propagated reward
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[state][act] = round(reward, 3)
                self.reset()
                i += 1
            else:
                # self.State.showBoard()
                action = self.chooseAction()
                # append trace
                self.states.append([(self.State.state), action])
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, it reaches the next state
                self.State = self.takeAction(action)
                # update the end-of-episode flag
                self.State.isEndPosition()
                print("new state", self.State.state)
                print("---------------------")
                self.isEnd = self.State.isEnd
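A possible way to drive the agent above, assuming Environment and the action constants come from the surrounding module; the number of rounds is an arbitrary choice for illustration.

if __name__ == "__main__":
    ag = Agent()
    ag.play(rounds=50)  # run 50 epsilon-greedy training episodes
    # inspect the learned state-action values
    for state, action_values in ag.Q_values.items():
        print(state, action_values)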
Example #6
# ACTIONS maps each intended action to [intended, slip_1, slip_2]; the UP and
# DOWN entries are assumed to mirror the LEFT/RIGHT pattern shown in the source.
ACTIONS = {
    UP:    [UP, LEFT, RIGHT],
    DOWN:  [DOWN, LEFT, RIGHT],
    LEFT:  [LEFT, UP, DOWN],
    RIGHT: [RIGHT, UP, DOWN]}

# Init V(s)
V = np.zeros((BOARD_ROWS, BOARD_COLS), dtype='float16')

# init actions probabilities
# [prob_main_action, prob_first_alternative_action, prob_second_alternative_action]
p = [0.8, 0.1, 0.1]
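
# As a concrete illustration: intending LEFT from a state backs up
#   0.8*(reward + GAMMA*V[left]) + 0.1*(reward + GAMMA*V[up]) + 0.1*(reward + GAMMA*V[down]),
# and the sweep below keeps the best of such values over the four intended actions.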

delta = 1  # a value greater than EPSILON to kick off the process
while delta > EPSILON:
    delta = 0
    for row in range(BOARD_ROWS):
        for col in range(BOARD_COLS):
            state_value_old = V[row][col]
            values = []  # stores the value of each action (for a single state)
            for action in ACTIONS:
                new_state_value = 0.0
                for prob_action, prob in zip(ACTIONS[action], p):
                    env = Environment(state=(row, col), deterministic=True)
                    next_state = env.nextPosition(prob_action)
                    reward = env.giveReward()
                    new_state_value += prob * (
                        reward + GAMMA * V[next_state[0]][next_state[1]])
                values.append(new_state_value)
            V[row][col] = np.max(values)
            v_change = np.abs(state_value_old - np.max(values))
            delta = np.maximum(delta, v_change)
print("V = \n" + str(V))