def update_policy(self, transitions, Q, policy, epsilon):
    # Log the policy prior to the update.
    self.file.write("PREVIOUS POLICY: \n")
    for state in policy.keys():
        self.file.write(state + "\n")
        pByAction = policy[state]
        self.file.write("p(a): " + str(pByAction) + "\n")

    for transition in transitions:
        state = to_board_state(transition['from_state'])
        self.file.write(game.to_display_string(transition['from_state']))

        # Greedy action for this state under the current Q estimates.
        astar = self.find_max_action(Q, state)
        self.file.write("UPDATE POLICY: ASTAR ==> " + str(astar) + "\n")

        possible_actions = get_actions(policy, state)
        self.file.write("POSSIBLE ACTIONS: " + str(possible_actions) + "\n")

        # Epsilon-greedy improvement: the greedy action gets the bulk of the
        # probability mass; every action keeps an epsilon / |A(s)| share.
        for action in possible_actions:
            if action == astar:
                policy[state][action] = 1.0 - epsilon + epsilon / len(possible_actions)
            else:
                policy[state][action] = epsilon / len(possible_actions)

    self.file.write("UPDATED POLICY: " + str(policy) + "\n")
    return policy
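# Hypothetical sketches of the two helpers referenced above; the real
# find_max_action and get_actions are defined elsewhere in the project and
# may differ. These assume Q and policy are both keyed as
# {state: {action: number}}, matching how update_policy indexes
# policy[state][action].
def find_max_action(self, Q, state):
    # The greedy action: the one with the highest estimated action-value.
    return max(Q[state], key=Q[state].get)

def get_actions(policy, state):
    # All actions the policy currently assigns probability mass to in state.
    return list(policy[state].keys())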
def model_environment(opponent, state, action):
    game_complete = False
    initial_board = state
    file.write("AGENT MAKING MOVE: " + str(action) + str(board.to_state(action)) + "\n")

    # Apply the agent's move ('X') to the board.
    current_board = p.add_move('X', action, initial_board)
    print("AFTER AGENT MOVE:")
    print(game.to_display_string(current_board))
    file.write("AFTER AGENT MOVE:\n")
    file.write(game.to_display_string(current_board))

    # Reward is +1 for a win, -1 for a loss, and 0 for a draw (cat's game).
    reward = 0.0
    if p.is_winner(current_board, 'X'):
        game_complete = True
        reward = 1.0
    elif p.is_cat_game(current_board):
        game_complete = True
        reward = 0.0

    if not game_complete:
        # Let the opponent make a move ...
        (opponent_id, opponent_move) = opponent.pick_next_move(current_board)
        current_board = p.add_move(opponent_id, opponent_move, current_board)
        print("AFTER OPPONENT MOVE:")
        print(game.to_display_string(current_board))
        file.write("AFTER OPPONENT MOVE:\n")
        file.write(game.to_display_string(current_board))

        if p.is_winner(current_board, opponent_id):
            game_complete = True
            reward = -1.0
        elif p.is_cat_game(current_board):
            game_complete = True
            reward = 0.0

    return current_board, reward, game_complete
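# A minimal sketch of the policy-sampling helper used by the episode
# generator below; its real implementation is not shown in this section.
# It assumes policy maps a board-state key to {action: probability}, as
# update_policy maintains, and that to_board_state behaves as it does above.
import random

def sample_tic_tac_toe_policy(current_board, policy):
    state = to_board_state(current_board)
    actions = list(policy[state].keys())
    weights = [policy[state][a] for a in actions]
    # random.choices draws one action in proportion to its policy probability.
    return random.choices(actions, weights=weights, k=1)[0]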
def generate_tic_tac_toe_episode(policy):
    current_board = p.empty_board()
    opponent = GreedyRandomPlayer('O')
    game_complete = False
    transitions = []

    while not game_complete:
        print("PRIOR TO MOVE:")
        print(game.to_display_string(current_board))
        file.write("PRIOR TO MOVE:\n")
        file.write(game.to_display_string(current_board))

        # Sample an action from the stochastic policy for the current state.
        selectedAction = sample_tic_tac_toe_policy(current_board, policy)

        # Model the environment --> returns a next state and a reward.
        next_state, reward, game_complete = model_environment(opponent, current_board, selectedAction)

        # Record this step of the episode.
        transition = {}
        transition['from_state'] = current_board
        transition['to_state'] = next_state
        transition['action'] = selectedAction
        transition['reward'] = reward
        transitions.append(transition)
        file.write("ADDING TRANSITION: " + str(transition) + "\n")

        current_board = next_state

    print("BOARD AT END OF EPISODE")
    print(game.to_display_string(current_board))
    file.write("BOARD AT END OF EPISODE\n")
    file.write(game.to_display_string(current_board))

    return transitions
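# A minimal sketch, not project code, of how the pieces above could be wired
# into on-policy Monte Carlo control: generate an episode, update Q from the
# observed returns, then improve the policy with update_policy. The agent
# parameter, gamma, num_episodes, and the returns bookkeeping are all
# assumptions; Q is assumed to be {state: {action: value}}.
from collections import defaultdict

def monte_carlo_control(agent, Q, policy, epsilon, num_episodes=1000, gamma=1.0):
    returns = defaultdict(list)  # (state, action) -> list of observed returns
    for _ in range(num_episodes):
        transitions = generate_tic_tac_toe_episode(policy)
        G = 0.0
        # Walk the episode backwards, accumulating the discounted return.
        for t in reversed(transitions):
            G = gamma * G + t['reward']
            key = (to_board_state(t['from_state']), t['action'])
            returns[key].append(G)
            # Every-visit MC: Q(s, a) is the mean of all returns seen so far.
            Q[key[0]][t['action']] = sum(returns[key]) / len(returns[key])
        policy = agent.update_policy(transitions, Q, policy, epsilon)
    return Q, policy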
def execute(self, initial_board):
    current_board = initial_board

    # Alternate agent moves and environment updates until the game ends.
    while not self.environment.is_completed():
        player_id, player_move = self.agent.pick_next_move(current_board)
        action = {}
        action['player'] = player_id
        action['cell'] = player_move
        updated_board, reward = self.environment.update(current_board, action)
        current_board = updated_board

    print("EPISODE TERMINATED --> OUTCOME: ", self.environment.get_outcome())
    print(game.to_display_string(current_board))
    return self.agent, current_board
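# Hypothetical usage of execute; the EpisodeRunner and TicTacToeEnvironment
# names are assumptions inferred from the self.agent / self.environment
# attributes used above, not classes shown in this section.
if __name__ == '__main__':
    runner = EpisodeRunner(agent=GreedyRandomPlayer('X'),
                           environment=TicTacToeEnvironment())
    trained_agent, final_board = runner.execute(p.empty_board())
    print(game.to_display_string(final_board))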