def play_matches(player1, player2, games):
    """Play `games` self-play matches between two agents, collecting training data.

    Each game randomly assigns which agent plays which colour (turn +1 / -1).
    Every state produced by a move is recorded into a per-game memory for the
    moving side; when the game ends, both sides' memories are labelled with the
    win/loss outcome and merged into one accumulator.

    Args:
        player1: first agent; must expose ``.act()`` returning
            ``(new_state, value, preds)`` and ``.oppAct(state)``.
        player2: second agent with the same interface.
        games: number of games to play.

    Returns:
        Memory: the accumulated memory of all games. (Previously the function
        built this and silently discarded it by returning ``None``; returning
        it is backward-compatible.)
    """
    switch_turn = {-1: 1, 1: -1}
    main_memory = Memory()
    current_game_count = 0

    while current_game_count < games:
        # Randomly decide colour assignment: randint(0,1)*2-1 yields -1 or +1.
        if random.randint(0, 1) * 2 - 1 == -1:
            player_turn = -1
            players = {
                1: {"agent": player1, "name": "Player 1"},
                -1: {"agent": player2, "name": "Player 2"},
            }
        else:
            player_turn = 1
            players = {
                1: {"agent": player2, "name": "Player 2"},
                # BUG FIX: was "Player1" (no space), which could never match
                # the winner comparison against "Player 1" below.
                -1: {"agent": player1, "name": "Player 1"},
            }

        # BUG FIX: the per-game memories must be created ONCE per game.
        # Previously they were re-created on every turn of the inner loop,
        # so only the final move of each game survived to be joined into
        # main_memory — all earlier training positions were discarded.
        memory_1 = Memory()
        memory_2 = Memory()

        game_over = False
        while not game_over:
            # Current player makes a move.
            new_state, value, preds = players[player_turn]["agent"].act()
            print(player_turn)
            print(players[player_turn])
            print("Value: " + str(value))
            print(new_state.state)

            # Record the resulting state for the side that just moved.
            # NOTE(review): memory_1 records moves of the turn==-1 side, but
            # win credit below is keyed to the fixed name "Player 1" — since
            # colour assignment is random, verify this attribution is intended.
            if player_turn == -1:
                memory_1.add_to_memory(new_state.state)
            else:
                memory_2.add_to_memory(new_state.state)

            # Switch to the other player and inform them of the move.
            player_turn = switch_turn[player_turn]
            players[player_turn]["agent"].oppAct(new_state)

            if new_state.state.is_game_over():
                print("Game " + str(current_game_count) + " is over!")
                winner = new_state.state.get_winner()
                print(winner + " Won!")
                # Label each side's positions with the game outcome.
                if winner == "Player 1":
                    memory_1.declare_win_or_loss(1)
                    memory_2.declare_win_or_loss(-1)
                else:
                    memory_2.declare_win_or_loss(1)
                    memory_1.declare_win_or_loss(-1)
                main_memory.join_memories(memory_1)
                main_memory.join_memories(memory_2)
                game_over = True
                current_game_count += 1

    return main_memory
# --- CartPole policy-gradient training loop -------------------------------
# Build the environment (seeded for reproducibility), the policy network,
# an episode memory, and the optimizer, then train for 10000 episodes.
env = gym.make("CartPole-v0")
env.seed(1)
n_actions = env.action_space.n

cartpole_model = create_cartpole_model()
memory = Memory()
learning_rate = 1e-3
optimizer = tf.train.AdamOptimizer(learning_rate)

# Smoothed-reward tracker and a periodic live plot of training progress.
smoothed_reward = util.LossHistory(smoothing_factor=0.9)
plotter = util.PeriodicPlotter(sec=5, xlabel='Iterations', ylabel='Rewards')

for i_episode in range(10000):
    plotter.plot(smoothed_reward.get())
    observation = env.reset()

    # Roll out one full episode, recording (obs, action, reward) each step.
    done = False
    while not done:
        action = choose_action(cartpole_model, observation)
        next_observation, reward, done, info = env.step(action)
        memory.add_to_memory(observation, action, reward)
        observation = next_observation

    # Episode finished: log the total reward and run one training step on
    # the whole trajectory, then reset the memory for the next episode.
    smoothed_reward.append(sum(memory.rewards))
    train_step(cartpole_model,
               optimizer,
               observations=np.vstack(memory.observations),
               actions=np.array(memory.actions),
               discounted_rewards=discount_rewards(memory.rewards))
    memory.clear()

save_video_of_model(cartpole_model, "CartPole-v0")