def policy_evaluation(tot_policy, GAMMA, EPSILON):
    # Iterative policy evaluation: sweep all states until the largest value
    # change in a sweep (delta) falls below EPSILON.
    # Uses the module-level BOARD_ROWS, BOARD_COLS, V, ACTIONS, p,
    # grid2state_dict and Environment defined earlier, plus numpy as np.
    delta = 1  # any value greater than EPSILON to enter the loop
    while delta > EPSILON:
        delta = 0
        for row in range(BOARD_ROWS):
            for col in range(BOARD_COLS):
                state_value_old = V[row][col]
                new_state_value = 0.0  # accumulate over all actions of the policy
                for action in ACTIONS:
                    # probability of taking `action` in this state under the policy
                    proba_act = tot_policy[grid2state_dict[(row, col)]][action]
                    # stochastic outcomes: intended move plus the two side-slips
                    for choix, prob in zip(ACTIONS[action], p):
                        env = Environment(state=(row, col), deterministic=True)
                        next_state = env.nextPosition(choix)
                        reward = env.giveReward()
                        new_state_value += proba_act * prob * (
                            reward + GAMMA * V[next_state[0]][next_state[1]])
                V[row][col] = new_state_value
                v_change = np.abs(state_value_old - new_state_value)
                delta = np.maximum(delta, v_change)
        print("\nDelta = " + str(delta) + "\n")
    print("V = " + str(V))
    return V
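# A minimal usage sketch, not from the original: evaluate the equiprobable random
# policy. `uniform_policy` is a hypothetical name; ACTIONS, grid2state_dict,
# BOARD_ROWS and BOARD_COLS are assumed to be defined earlier in the notebook.
uniform_policy = {grid2state_dict[(r, c)]: {a: 1.0 / len(ACTIONS) for a in ACTIONS}
                  for r in range(BOARD_ROWS) for c in range(BOARD_COLS)}
V_random = policy_evaluation(uniform_policy, GAMMA=0.9, EPSILON=1e-4)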
def q_from_v(V, state, GAMMA):
    # One-step look-ahead: compute the action value q(s, a) of every action
    # from the current state-value estimate V.
    q = {str(act): 0 for act in ACTIONS}
    for action in q:
        for choix, prob in zip(ACTIONS[action], p):
            env = Environment(state=state, deterministic=True)
            next_state = env.nextPosition(choix)
            reward = env.giveReward()
            q[action] += prob * (reward + GAMMA * V[next_state[0]][next_state[1]])
    return q
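# A hedged sketch of the policy-improvement step built on q_from_v (not part of
# the original): for every state, put all probability mass on the greedy action.
# `greedy_policy_from_v` is a hypothetical helper name.
def greedy_policy_from_v(V, GAMMA):
    policy = {}
    for row in range(BOARD_ROWS):
        for col in range(BOARD_COLS):
            q = q_from_v(V, (row, col), GAMMA)
            best_action = max(q, key=q.get)
            policy[grid2state_dict[(row, col)]] = {
                a: (1.0 if a == best_action else 0.0) for a in q}
    return policy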
class Agent:

    def __init__(self):
        self.states = []  # trace of (state, action) pairs for the current episode
        self.actions = [UP, DOWN, LEFT, RIGHT]
        self.State = Environment()
        self.isEnd = self.State.isEnd
        self.lr = 0.2           # learning rate
        self.exp_rate = 0.3     # exploration rate (epsilon-greedy)
        self.decay_gamma = 0.9  # discount factor
        # initial Q values: a dict of dicts, one entry per (state, action) pair
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0

    def chooseAction(self):
        # epsilon-greedy: explore with probability exp_rate, otherwise pick the
        # action with the highest Q value in the current state
        mx_nxt_reward = -float('inf')
        action = ""
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            current_position = self.State.state
            for a in self.actions:
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        return action

    def takeAction(self, action):
        position = self.State.nextPosition(action)
        # update state
        self.State.state = position
        return self.State

    def reset(self):
        self.states = []
        self.State = Environment()
        self.isEnd = self.State.isEnd

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            if self.State.isEnd:
                # episode finished: back-propagate the terminal reward
                reward = self.State.giveReward()
                for a in self.actions:
                    self.Q_values[self.State.state][a] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    current_q_value = self.Q_values[s[0]][s[1]]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append (state, action) to the trace
                self.states.append([(self.State.state), action])
                print("current position {} action {}".format(self.State.state, action))
                # by taking the action, the agent reaches the next state
                self.State = self.takeAction(action)
                # check whether the new position is terminal
                self.State.isEndPosition()
                print("new state", self.State.state)
                print("---------------------")
                self.isEnd = self.State.isEnd
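# Minimal usage sketch (an assumption, not part of the original listing): train
# the agent for a number of episodes and inspect the learned Q table.
ag = Agent()
ag.play(rounds=50)  # run 50 training episodes
for state, action_values in ag.Q_values.items():
    print(state, action_values)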
           LEFT:  [LEFT, UP, DOWN], \
           RIGHT: [RIGHT, UP, DOWN]}

# Init V(s)
V = np.zeros((BOARD_ROWS, BOARD_COLS), dtype='float16')

# init action probabilities
# [prob_main_action, prob_first_alternative_action, prob_second_alternative_action]
p = [0.8, 0.1, 0.1]

# Value iteration: back up each state with its best action value until convergence
delta = 1  # a value greater than EPSILON to kick off the process
while delta > EPSILON:
    delta = 0
    for row in range(BOARD_ROWS):
        for col in range(BOARD_COLS):
            state_value_old = V[row][col]
            values = []  # stores the value of each action (for a single state)
            for action in ACTIONS:
                new_state_value = 0.0
                for prob_action, prob in zip(ACTIONS[action], p):
                    env = Environment(state=(row, col), deterministic=True)
                    next_state = env.nextPosition(prob_action)
                    reward = env.giveReward()
                    new_state_value += prob * (
                        reward + GAMMA * V[next_state[0]][next_state[1]])
                values.append(new_state_value)
            V[row][col] = np.max(values)
            v_change = np.abs(state_value_old - np.max(values))
            delta = np.maximum(delta, v_change)

print("V = \n" + str(V))
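# Once V has converged, an optimal policy can be read off with a one-step
# look-ahead. A short sketch (an assumption, not in the original), reusing the
# q_from_v helper defined above:
best_actions = np.empty((BOARD_ROWS, BOARD_COLS), dtype=object)
for row in range(BOARD_ROWS):
    for col in range(BOARD_COLS):
        q = q_from_v(V, (row, col), GAMMA)
        best_actions[row][col] = max(q, key=q.get)  # action with the highest q(s, a)
print(best_actions)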