def play_w_human():
    g = Game()
    # agent = MinimaxAgent()  # MCTSAgent()
    agent = QLearningAgent('q_values')
    turn = RED
    while True:
        g.printBoard()
        if turn == RED:
            row = input("{}'s turn: ".format('Red' if turn == RED else 'Yellow'))
            w = g.insert(int(row), turn)
            agent.play_opponent_move(int(row))  # keep the agent's internal board in sync
        else:
            move = agent.play_move()
            w = g.insert(move, turn)
        if w:
            print("WINNER:", w)
            break
        turn = YELLOW if turn == RED else RED
def main():
    agent = QLearningAgent()
    if os.path.isfile(FILE):
        with open(FILE, 'rb') as f:  # pickle requires binary mode
            agent.Q = pickle.load(f)
    for i in range(NUM_GAMES):
        print(i)
        agent.train()
    with open(FILE, 'wb') as f:
        agent.save(f)
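# For context, the update that agent.train() presumably applies after each
# self-play move is the standard tabular Q-learning rule. The helper below is
# a minimal sketch under the assumption of a dict-backed Q table keyed by
# (state, action); it is not this repo's actual QLearningAgent code.
def q_update(Q, state, action, reward, next_state, next_actions,
             alpha=0.1, gamma=0.9):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    best_next = max((Q.get((next_state, a), 0.0) for a in next_actions),
                    default=0.0)
    old = Q.get((state, action), 0.0)
    Q[(state, action)] = old + alpha * (reward + gamma * best_next - old)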
def test_episode(env, agent):
    # (Header reconstructed; this excerpt began mid-function. The greedy
    # `predict` method name is an assumption.)
    total_reward = 0
    obs = env.reset()
    while True:
        action = agent.predict(obs)
        next_obs, reward, done, _ = env.step(action)
        total_reward += reward
        obs = next_obs
        # time.sleep(0.5)
        env.render()
        if done:
            break
    return total_reward


# Create the maze environment with gym; is_slippery=False makes it easier.
env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up

# Create an agent instance with the hyperparameters.
agent = QLearningAgent(
    obs_n=env.observation_space.n,
    act_n=env.action_space.n,
    learning_rate=0.1,
    gamma=0.9,
    e_greed=0.1)

# Train for 500 episodes, printing each episode's score.
for episode in range(500):
    ep_reward, ep_steps = run_episode(env, agent, True)
    print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, ep_reward))

# After training, evaluate the learned policy.
test_reward = test_episode(env, agent)

# Save the Q table if the test run reached the goal.
if test_reward == 1:
    agent.save()
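# run_episode is called above but not defined in this excerpt. Below is a
# minimal sketch of what it plausibly does, assuming the agent exposes
# sample() (epsilon-greedy action) and learn() (Q-table update); both method
# names and the learn() signature are assumptions, not the script's API.
def run_episode(env, agent, render=False):
    total_steps, total_reward = 0, 0
    obs = env.reset()
    while True:
        action = agent.sample(obs)  # epsilon-greedy action (assumed method)
        next_obs, reward, done, _ = env.step(action)
        agent.learn(obs, action, reward, next_obs, done)  # assumed signature
        obs = next_obs
        total_reward += reward
        total_steps += 1
        if render:
            env.render()
        if done:
            break
    return total_reward, total_steps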
def main(cfg):
    pygame.init()
    # Create the font
    sysfont = pygame.font.SysFont(None, 40)
    screen = pygame.display.set_mode(WINDOW_SIZE)
    pygame.display.set_caption("Grid World")
    done = False
    clock = pygame.time.Clock()
    # Initialize the grid world environment
    grid_env = GridWorld()
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(
        epsilon=cfg["agent"]["epsilon"],
        epsilon_decay_rate=cfg["agent"]["epsilon_decay_rate"],
        actions=np.arange(4),
        observation=ini_state)  # Q-learning agent
    nb_episode = cfg["nb_episode"]  # number of episodes
    save_interval = cfg["save_interval"]
    result_dir = cfg["result_dir"]
    max_step = 1  # note: this cuts every episode off after a single step
    rewards = []  # store rewards for evaluation
    is_end_episode = False  # has the agent reached the goal?
    step = 0
    # time.sleep(30)
    for episode in range(nb_episode):
        print("episode:", episode)
        episode_reward = []  # cumulative reward for one episode
        step = 0
        while is_end_episode is False and step < max_step:  # loop until the goal
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)
            screen.fill(BLACK)
            # Draw the grid world
            draw_grid_world(grid_env.map, screen)
            # Render the step counter onto a Surface and draw it
            step_str = sysfont.render("step:{}".format(step), False, WHITE)
            screen.blit(step_str, (500, 50))
            clock.tick(1)
            step += 1
            # Redraw
            pygame.display.flip()
        rewards.append(np.sum(episode_reward))  # total reward for this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_end_episode = False
        print("step:", step)
        agents = [agent]
        if episode % save_interval == 0:
            save_result(agents, episode, result_dir)
    pygame.quit()
from collections import Counter

from environments.biased_rock_paper_scissors_env import BiasedRockPaperScissorsEnv
from qlearning_agent import QLearningAgent
from session import Session

env = BiasedRockPaperScissorsEnv()
agent = QLearningAgent(alpha=0.1, gamma=0.9)
session = Session(env, agent)
logs = session.run(episodes=10000, epsilon='explore_then_exploit')
print(Counter([log['state-action pairs'][0][1] for log in logs]))

# The more randomness in the environment, the lower alpha should be.
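# The closing comment claims that noisier environments call for a smaller
# alpha, since each noisy reward should move the estimate less. One way to
# probe this, reusing the same classes from this script (the alpha values
# and episode count here are arbitrary choices, not from the repo):
for alpha in (0.5, 0.1, 0.01):
    env = BiasedRockPaperScissorsEnv()
    agent = QLearningAgent(alpha=alpha, gamma=0.9)
    logs = Session(env, agent).run(episodes=10000,
                                   epsilon='explore_then_exploit')
    print(alpha, Counter(log['state-action pairs'][0][1] for log in logs))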
# env.render()
n_states = env.env.nS
n_actions = env.env.nA
print('States number = %i, Actions number = %i' % (n_states, n_actions))

# Create the agents with alpha = 0.5
alpha = 0.5  # was referenced only in a comment; defined here so the code runs
epsilon = 0.9
epsilon_threshold = 0.01
discount = 0.99
get_legal_actions = lambda s: range(n_actions)
epsilon_ratio = 0.995

ql_agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)
sarsa_agent = SarsaAgent(alpha, epsilon, discount, get_legal_actions)
expected_sarsa_agent = ExpectedValueSarsaAgent(alpha, epsilon, discount,
                                               get_legal_actions)

plt.figure(figsize=[10, 4])
rewards_qlearning = []
rewards_sarsa = []
rewards_expected_sarsa = []

# Testing loop
n = 1
r_qlearning = []
r_sarsa = []
r_expected_sarsa = []
for _ in range(n):
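# The three agents above share the same tabular update and differ only in the
# TD target. A minimal sketch of the three targets, with illustrative names
# and defaults, not the repo's actual API:
import numpy as np

def td_target(kind, reward, q_next, discount=0.99, epsilon=0.9):
    """q_next: 1-D array of Q-values over actions in the next state."""
    if kind == 'qlearning':        # off-policy: max over next actions
        return reward + discount * np.max(q_next)
    if kind == 'expected_sarsa':   # expectation under the eps-greedy policy
        probs = np.full(len(q_next), epsilon / len(q_next))
        probs[np.argmax(q_next)] += 1.0 - epsilon
        return reward + discount * float(probs @ q_next)
    # 'sarsa': on-policy, uses the action actually taken next
    if np.random.rand() < epsilon:
        a = np.random.randint(len(q_next))
    else:
        a = int(np.argmax(q_next))
    return reward + discount * q_next[a]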
import random
import numpy as np
import matplotlib.pyplot as plt

from qlearning_agent import QLearningAgent
from policy import EpsGreedyQPolicy
from grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    policy = EpsGreedyQPolicy(epsilon=.01)  # initialize the policy (epsilon-greedy)
    agent = QLearningAgent(actions=np.arange(4),
                           observation=ini_state,
                           policy=policy)  # initialize the Q-learning agent
    nb_episode = 100  # number of episodes
    rewards = []  # store rewards for evaluation
    is_goal = False  # has the agent reached the goal?
    for episode in range(nb_episode):
        episode_reward = []  # cumulative reward for one episode
        while is_goal == False:  # loop until the goal
            action = agent.act()  # select an action
            state, reward, is_goal = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # total reward for this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_goal = False

    # Test run (greedy actions)
    agent.training = False  # fixed typo: was `agent.traning`
    while is_goal == False:  # loop until the goal
        print("(y, x):{}".format(state))
import copy
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from qlearning_agent import QLearningAgent
from grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(epsilon=.1,
                           actions=np.arange(4),
                           observation=ini_state)  # Q-learning agent
    nb_episode = 1000  # number of episodes
    rewards = []  # store rewards for evaluation
    is_end_episode = False  # has the agent reached the goal?
    for episode in range(nb_episode):
        episode_reward = []  # cumulative reward for one episode
        while is_end_episode == False:  # loop until the goal
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # total reward for this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_end_episode = False

    # Plot the results
    plt.plot(np.arange(nb_episode), rewards)
import numpy as np

from qlearning_agent import QLearningAgent
from grid_world import GridWorld

# Constants
NB_EPISODE = 100        # number of episodes
EPSILON = .1            # exploration rate
ALPHA = .1              # learning rate
GAMMA = .90             # discount factor
ACTIONS = np.arange(4)  # set of actions

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    # Initialize the agent
    agent = QLearningAgent(
        alpha=ALPHA,
        gamma=GAMMA,
        epsilon=EPSILON,   # exploration rate
        actions=ACTIONS,   # set of actions
        observation=ini_state)  # Q-learning agent
    rewards = []  # store rewards for evaluation
    is_end_episode = False  # has the agent reached the goal?
    # Experiment
    for episode in range(NB_EPISODE):
        episode_reward = []  # cumulative reward for one episode
        while is_end_episode == False:  # loop until the goal
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # total reward for this episode
        state = grid_env.reset()  # reset the environment
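# All of the GridWorld scripts above drive the agent through act()/observe().
# The selection step in act() typically reduces to epsilon-greedy over the
# current state's Q-values; a minimal sketch under that assumption (the
# actual qlearning_agent implementation may differ):
import numpy as np

def eps_greedy_act(q_values, epsilon=0.1):
    # With probability epsilon explore uniformly, otherwise exploit.
    if np.random.rand() < epsilon:
        return int(np.random.randint(len(q_values)))
    return int(np.argmax(q_values))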
        a1 = copy.deepcopy(agent1)
        a2 = copy.deepcopy(agent2)
        if i < n_trials // 2:
            winner, count_moves, trial_times = play_wo_human(a1, a2, 1)
        else:
            winner, count_moves, trial_times = play_wo_human(a1, a2, 2)
        if winner:
            results[winner].append(count_moves)
        times[a1.name].append(trial_times[a1.name])
        times[a2.name].append(trial_times[a2.name])
    total_moves = sum(results[a1.name]) + sum(results[a2.name])
    print(results)
    print("TOTAL GAMES WON BY", a1.name, ":", len(results[a1.name]))
    if len(results[a1.name]) != 0:
        print("AVERAGE NO. MOVES:", sum(results[a1.name]) / len(results[a1.name]))
        print("AVERAGE TIME PER MOVE:", sum(times[a1.name]) / total_moves)
    print("TOTAL GAMES WON BY", a2.name, ":", len(results[a2.name]))
    if len(results[a2.name]) != 0:
        print("AVERAGE NO. MOVES:", sum(results[a2.name]) / len(results[a2.name]))
        print("AVERAGE TIME PER MOVE:", sum(times[a2.name]) / total_moves)
    print("################################################################")


if __name__ == "__main__":
    # test_agents(NaiveAgent(), MCTSAgent())
    # test_agents(NaiveAgent(), QLearningAgent())
    # test_agents(MCTSAgent(), MinimaxAgent(depth=3))
    # test_agents(QLearningAgent("q_values"), MinimaxAgent(depth=3))
    test_agents(MCTSAgent(), QLearningAgent("q_values"))