from tqdm import tqdm

# RandomAgent, Sim, is_black_win and w_fnc are assumed to be defined
# elsewhere in this project.


def main():
    NUM_OF_GAMES = 100
    ai = RandomAgent()
    s = Sim()
    all_records = []
    for i in tqdm(range(NUM_OF_GAMES)):
        records = []
        # preparation of simulator
        s.reset_s()  # with no arguments the board starts empty
        while True:
            # save the outcome before ban is overwritten?
            reshape_self, reshape_opp, reshape_ban, kou = s.get_s()
            if reshape_ban != 2:
                act_num = ai.act(reshape_self, reshape_opp, reshape_ban, kou)
                s.act(act_num)
            else:
                # +1 if black wins, -1 if black loses
                outcome = 2 * is_black_win(s) - 1
                break
            records.append(
                [reshape_self[1:], reshape_opp[1:], reshape_ban, act_num])
        # append the outcome, sign-flipped for every other move
        for j in range(len(records)):
            records[j].append((1. - 2. * (j % 2)) * outcome)
        all_records.append(records)
    # write out the stacked records
    w_fnc(all_records, 'all_records.csv')
    rec = [records[0][-1] for records in all_records]
    kuro_win = NUM_OF_GAMES / 2 + sum(rec) / 2
    siro_win = NUM_OF_GAMES - kuro_win
    print('Black wins: {0} games, White wins: {1} games'.format(kuro_win, siro_win))
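# Illustrative check (not part of the original source) of the
# alternating-sign labelling above: records at even indices belong to
# black's moves and odd indices to white's, so each record is labelled
# with the game outcome from the mover's point of view.
def _label_demo():
    outcome = 1  # black won this game
    labels = [(1. - 2. * (j % 2)) * outcome for j in range(4)]
    assert labels == [1.0, -1.0, 1.0, -1.0]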
import sys

import pygame
from pygame.locals import MOUSEBUTTONDOWN, QUIT

# RandomAgent, Sim, draw and convert_to_num are assumed to be defined
# elsewhere in this project.


def main():
    # preparation of pygame
    pygame.init()
    AIis = 2  # 1 plays black, 2 plays white
    ai = RandomAgent()
    bp = [115, 15]
    g = 40
    mar = 20
    x, y = 0, 0
    pixels = [bp, g, mar]
    screen = pygame.display.set_mode((600, 400))  # window size
    pygame.display.set_caption("GoSimulator")  # title shown in the window bar
    sysfont = pygame.font.SysFont(None, 40)
    # preparation of simulator
    s = Sim()
    s.reset_s()  # with no arguments the board starts empty
    while True:
        screen.fill((0, 100, 0))  # background color
        # state, ban = s.get_s()
        # reshape the state for the GUI
        reshape_self, reshape_opp, reshape_ban, kou = s.get_s()
        ban = 2 - int(reshape_ban)
        # skip when the game is over, i.e. when ban == 0
        if ban != 0:
            state = ban * reshape_self + (3. - ban) * reshape_opp
        bl, wh = s.get_eval()  # TODO: for debugging
        draw(pygame, screen, sysfont, pixels, state, ban, x, y, bl, wh)
        pygame.display.update()  # refresh the screen
        if ban == AIis:
            s.act(ai.act(reshape_self, reshape_opp, reshape_ban, kou))
        else:
            for event in pygame.event.get():
                # quit handling
                if event.type == QUIT:
                    pygame.quit()
                    sys.exit()
                if event.type == MOUSEBUTTONDOWN and event.button == 1:
                    x, y = event.pos
                    num = convert_to_num(pixels, x, y)
                    if num == -1:
                        s.reset_s()
                    elif num in s.regal_acts() and ban != 0:
                        s.act(num)
                        bl, wh = s.get_eval()
                        if len(bl) != len(set(bl)):
                            print('[error] get_eval has overlap')
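# A hedged guess at what convert_to_num might look like (the original is
# project-specific and not shown here): map a mouse click to a board
# index on a grid anchored at bp with cell size g, returning -1 for
# clicks outside the board (which the loop above treats as a reset).
# The 9x9 board size is an assumption.
def convert_to_num_sketch(pixels, x, y, size=9):
    (bx, by), g, mar = pixels
    col = round((x - bx) / g)
    row = round((y - by) / g)
    if 0 <= col < size and 0 <= row < size:
        return row * size + col
    return -1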
import gym

# ActorCriticAgent, RandomAgent and AgentType are assumed to be defined
# elsewhere in this project.


def train_chaser(env_name, chaser_need_restore, episode_count, close=False):
    """Use the Actor-Critic TD(0) method to train the chaser.

    A batch method cannot be used to optimize the model.
    """
    env = gym.make(env_name)
    features_n = 4
    chaser = ActorCriticAgent(300, 300, 30,
                              color=(1.0, 0.0, 0.0),
                              agent_type=AgentType.Chaser,
                              features_n=features_n,
                              discounted_value=0.9,
                              learning_rate=1e-6,
                              need_restore=chaser_need_restore)
    runner = RandomAgent(500, 500, 30,
                         color=(0.0, 1.0, 0.0),
                         agent_type=AgentType.Runner,
                         env=env)
    env.add_agent(chaser)
    env.add_agent(runner)
    total_steps = 0
    losses = []
    for epi in range(episode_count):
        chaser_state = env.reset()
        step = 0
        while True:
            env.render(close=close)
            action = chaser.act(chaser_state)
            chaser_state_, reward, done, _ = env.step(chaser.type, action)
            chaser.memory.append(
                (chaser_state, action, chaser_state_, reward, done))
            chaser_state = chaser_state_
            step += 1
            total_steps += 1
            if step >= 500:
                # Abort the episode once it exceeds 500 steps.
                chaser.memory.clear()
                break
            if done:
                loss = chaser.optimize_model()
                print('Episode: %d\tsteps: %d\tloss: %f' %
                      (epi + 1, step + 1, loss))
                losses.append(loss)
                break
            else:
                runner_action = runner.act(chaser_state)
                _, _, done, _ = env.step(runner.type, runner_action)
                env.render(close=close)
                step += 1
                total_steps += 1
                if done:
                    chaser.memory.clear()
                    print('Episode: %d\tsteps: %d' % (epi + 1, step + 1))
                    break
        # save the model every 100 episodes
        if epi % 100 == 0:
            chaser.save()
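# A minimal sketch (assumed, not the project's actual optimize_model) of
# an Actor-Critic TD(0) update in PyTorch that processes the stored
# transitions one at a time, matching the "no batch method" note above.
# It assumes actor(s) returns a vector of action probabilities and
# critic(s) returns a scalar state value.
import torch


def td0_actor_critic_update(actor, critic, actor_opt, critic_opt,
                            memory, gamma=0.9):
    n = max(len(memory), 1)
    total_critic_loss = 0.0
    for state, action, next_state, reward, done in memory:
        s = torch.as_tensor(state, dtype=torch.float32)
        s_ = torch.as_tensor(next_state, dtype=torch.float32)
        v = critic(s)
        with torch.no_grad():
            # TD(0) target: r + gamma * V(s'), with V(terminal) = 0
            target = reward + gamma * (torch.zeros_like(v) if done
                                       else critic(s_))
        delta = target - v  # TD error, also the advantage estimate
        critic_loss = delta.pow(2).sum()
        # Policy gradient step weighted by the (detached) TD error
        log_prob = torch.log(actor(s)[action])
        actor_loss = -log_prob * delta.detach().sum()
        actor_opt.zero_grad()
        critic_opt.zero_grad()
        (actor_loss + critic_loss).backward()
        actor_opt.step()
        critic_opt.step()
        total_critic_loss += float(critic_loss)
    memory.clear()
    return total_critic_loss / n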
import numpy as np

# env, MAX_NUM_EPISODES, MAX_STEPS_PER_EPISODE, RandomAgent and
# Shallow_Q_Learner are assumed to be defined earlier in this module.

if __name__ == "__main__":
    sample_batch_size = 32
    observation_shape = env.observation_space.shape
    action_shape = env.action_space.n
    first_episode = True
    episode_rewards = list()
    steps = 0
    PRE_TRAINLENGTH = 3000

    # Fill the replay memory with experience from a random agent before
    # training starts.
    ragent = RandomAgent(action_shape)
    obs = env.reset()
    for step in range(PRE_TRAINLENGTH):
        # env.render()
        action = ragent.act(obs)
        next_obs, reward, done, info = env.step(action)
        next_obs = np.reshape(next_obs, [1, observation_shape[0]])
        ragent.observe((obs, action, reward, next_obs, done))
        obs = next_obs
        if done:  # start a new episode when the current one ends
            obs = env.reset()

    agent = Shallow_Q_Learner(observation_shape, action_shape,
                              memory=ragent.get_memory())
    for episode in range(MAX_NUM_EPISODES):
        obs = env.reset()
        obs = np.reshape(obs, [1, observation_shape[0]])
        cum_reward = 0.0  # cumulative reward
        for step in range(MAX_STEPS_PER_EPISODE):
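# A minimal sketch (assumed, not the original class) of the RandomAgent
# interface the pre-training loop relies on: act() samples uniformly,
# observe() buffers transitions, and get_memory() hands the buffer over
# to the learner as a warm-start replay memory.
import random
from collections import deque


class RandomReplayAgent:
    def __init__(self, action_n, memory_size=50000):
        self.action_n = action_n
        self.memory = deque(maxlen=memory_size)

    def act(self, obs):
        # Ignore the observation; pick a uniformly random action.
        return random.randrange(self.action_n)

    def observe(self, transition):
        # transition is an (obs, action, reward, next_obs, done) tuple.
        self.memory.append(transition)

    def get_memory(self):
        return self.memory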
    env.seed(38)
    return env


# test env:
if __name__ == '__main__':
    env = env_fn()
    obs = env.reset()
    env.render()
    agent = RandomAgent(env.action_space)
    reward = 0
    done = False
    n_steps = 20
    for step in range(n_steps):
        print("Step {}".format(step + 1))
        action = agent.act(obs, reward, done)
        obs, reward, done, info = env.step(action)
        print('action=', action, 'obs=', obs,
              'reward=', reward, 'done=', done)
        env.render()
        if done:
            print("Goal reached!", "reward=", reward)
            break
    env.close()
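# A minimal sketch of the RandomAgent used above (assumed; it follows
# the classic gym example agent whose act() also receives the last
# reward and done flag but ignores them):
class RandomAgent:
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()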
from copy import deepcopy

# game, p1, p2, is_terminal, states_actions_p1, win_p1, draw, game_num
# and BATCH_SIZE are assumed to come from the enclosing training loop.

game_reward_p2 = 0
# p1 starts.
turn = 1
state = game.get_state()
while not is_terminal:
    if turn == 1:
        action = p1.act(state)
        states_actions_p1.append((deepcopy(state), action))
        state, game_reward_p1, is_terminal = \
            game.step(player=1, action=action, display=False)
        # Zero-sum game: p2's reward is the negation of p1's.
        game_reward_p2 = game_reward_p1 * -1
        turn = 2
    elif turn == 2:
        action = p2.act(state)
        # states_actions_p2.append((deepcopy(state), action))
        state, game_reward_p2, is_terminal = \
            game.step(player=2, action=action, display=False)
        game_reward_p1 = game_reward_p2 * -1
        turn = 1
p1.learn(states_actions_p1, game_reward_p1)
# p2.learn(states_actions_p2, game_reward_p2)
if game_reward_p1 == 1:
    win_p1 += 1
elif game_reward_p1 == 0:
    draw += 1
if game_num % BATCH_SIZE == 0:
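# A minimal sketch (assumed interface, not the original agent) of a
# Monte Carlo-style learn() matching the call above: every (state,
# action) pair the agent visited is nudged toward the final game reward.
class TabularSelfPlayAgent:
    def __init__(self, lr=0.1):
        self.values = {}  # (state, action) -> estimated return
        self.lr = lr

    def learn(self, states_actions, final_reward):
        for state, action in states_actions:
            key = (str(state), action)
            old = self.values.get(key, 0.0)
            self.values[key] = old + self.lr * (final_reward - old)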