def generate_session(policy, opponent, cuda=False, t_max=100):
    """
    Play a game until it ends or for at most t_max rounds.

    Returns: list of states, list of actions and the total reward.
    """
    states, actions = [], []
    total_reward = 0.
    b = Board()

    # Decide if we are player 1 or 2
    # player = np.random.choice((Board.PLAYER_1, Board.PLAYER_2), 1)
    player = Board.PLAYER_1
    if player == Board.PLAYER_2:
        # We are player two, let player one play first
        a = select_action(policy, b, cuda)
        b = b.insert(a.data[0][0])

    for t in range(t_max):
        # We move
        states.append(b)
        a = select_action(policy, b, cuda)
        actions.append(a)
        b = b.insert(a.data[0][0])
        winner = b.winner()
        if winner:
            if winner == player:
                total_reward = REWARD_WIN
            elif winner == '-':
                total_reward = REWARD_UNDECIDED
            else:
                print("Invalid result")
            break

        # Other player moves
        b = opponent(policy, b)
        winner = b.winner()
        if winner:
            if winner == '-':
                total_reward = REWARD_UNDECIDED
            elif winner != player:
                total_reward = REWARD_LOOSE
            else:
                print("Invalid result")
            break

    return states, actions, total_reward
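generate_session drives one self-contained episode: the policy picks our moves, while opponent can be any callable with the signature opponent(policy, board) that returns the board after the opposing move. As a minimal sketch, assuming only the Board API used above, an opponent that plays a uniformly random valid column could look like this (random_opponent and the final call are illustrative, not part of the original code):

import random

def random_opponent(policy, board):
    # Illustrative opponent: ignores the policy and drops a coin
    # into a randomly chosen valid column.
    return board.insert(random.choice(board.valid_actions()))

# policy is assumed to be an already-constructed network
# (see the PolicyNet sketch after select_action below).
states, actions, reward = generate_session(policy, random_opponent)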
def select_action(policy, board: Board, cuda=False, noise=0):
    # Get probabilities from neural network
    state = torch.from_numpy(board.matrix().reshape(BOARD_ROWS * BOARD_COLS)).float().unsqueeze(0)
    if cuda:
        state = state.cuda()
    probs = policy(Variable(state))

    # Exclude any results that are not allowed
    mult_np = np.zeros(len(POSSIBLE_ACTIONS), dtype=np.float32)
    allowed_actions = board.valid_actions()
    for i in POSSIBLE_ACTIONS:
        if i in allowed_actions:
            mult_np[i] = 1

    # Always choose winning move
    for a in allowed_actions:
        hypothetical_board = board.insert(a)
        if hypothetical_board.winner() == board.turn():
            mult_np = np.zeros(len(POSSIBLE_ACTIONS), dtype=np.float32)
            mult_np[a] = 1

    mult = Variable(torch.from_numpy(mult_np))
    noise = Variable(torch.from_numpy(mult_np * noise))
    if cuda:
        mult = mult.cuda()
        noise = noise.cuda()
    probs = probs * mult + noise
    if torch.sum(probs * mult).data[0] < 1e-40:
        # Neural network only offered things that are not allowed, so we go for random
        probs = probs + mult
    return probs.multinomial()
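select_action only assumes that policy maps a flattened board of BOARD_ROWS * BOARD_COLS values to a probability vector over len(POSSIBLE_ACTIONS) columns. A minimal sketch of a compatible network, written against the same pre-0.4 PyTorch API used here; the hidden layer and its width are illustrative assumptions, not the project's actual architecture:

import torch.nn as nn

class PolicyNet(nn.Module):
    # Illustrative policy network: the only hard requirements from
    # select_action are the input size and a probability output.
    def __init__(self, hidden=128):
        super(PolicyNet, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(BOARD_ROWS * BOARD_COLS, hidden),
            nn.ReLU(),
            nn.Linear(hidden, len(POSSIBLE_ACTIONS)),
            nn.Softmax(),
        )

    def forward(self, x):
        return self.layers(x)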
def test_insert_some_coins():
    # Each column is encoded two bits per coin: 0b01 for 'O', 0b10 for 'X',
    # with the most recently inserted coin in the most significant bits.
    b = Board()
    assert b.turn() == 'O'

    b = b.insert(3)
    assert b.turn() == 'X'
    assert b == Board([0, 0, 0, 0b01, 0, 0, 0])
    assert b.valid_actions() == (0, 1, 2, 3, 4, 5, 6)

    b = b.insert(2)
    assert b.turn() == 'O'
    assert b == Board([0, 0, 0b10, 0b01, 0, 0, 0])
    assert b.valid_actions() == (0, 1, 2, 3, 4, 5, 6)

    b = b.insert(2)
    assert b.turn() == 'X'
    assert b == Board([0, 0, 0b0110, 0b01, 0, 0, 0])
    assert b.valid_actions() == (0, 1, 2, 3, 4, 5, 6)

    b = b.insert(2)
    assert b.turn() == 'O'
    assert b == Board([0, 0, 0b100110, 0b01, 0, 0, 0])
    assert b.valid_actions() == (0, 1, 2, 3, 4, 5, 6)

    b = b.insert(2)
    assert b.turn() == 'X'
    assert b == Board([0, 0, 0b01100110, 0b01, 0, 0, 0])
    assert b.valid_actions() == (0, 1, 2, 3, 4, 5, 6)

    b = b.insert(2)
    assert b.turn() == 'O'
    assert b == Board([0, 0, 0b1001100110, 0b01, 0, 0, 0])
    assert b.valid_actions() == (0, 1, 2, 3, 4, 5, 6)

    b = b.insert(2)
    assert b.turn() == 'X'
    # Column 2 now holds six coins and is no longer a valid action
    assert b == Board([0, 0, 0b011001100110, 0b01, 0, 0, 0])
    assert b.valid_actions() == (0, 1, 3, 4, 5, 6)
click.echo(click.style(str(e), fg='red'))

b = Board()
# Decide if computer is player 1 or 2
computer_player = Board.PLAYER_1
if computer_player == Board.PLAYER_2:
    # Computer is player two, let player one play first
    b = do_human_action(b)

while True:
    # Computer moves
    a = select_action(policy, b)
    b = b.insert(a.data[0][0])
    winner = b.winner()
    if winner:
        print_board(b)
        if winner == computer_player:
            click.echo(click.style('Computer wins!', fg='green'))
        elif winner == '-':
            click.echo(click.style('Nobody wins!', fg='red'))
        else:
            print("Invalid result")
        sys.exit(0)

    # Other player moves
    b = do_human_action(b)
    winner = b.winner()
    if winner:
        print_board(b)
        if winner == '-':
            click.echo(click.style('Nobody wins!', fg='red'))
        elif winner != computer_player:
            click.echo(click.style('You win!', fg='green'))
        else:
            print("Invalid result")
        sys.exit(0)
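do_human_action and print_board are defined elsewhere in the project; purely for orientation, a plausible minimal version of the human-move helper might look like the following sketch (the prompt text and retry loop are assumptions, not the original implementation):

def do_human_action(board):
    # Sketch only: show the board, ask for a column with click, and
    # retry until the chosen column is a valid action.
    print_board(board)
    while True:
        col = click.prompt('Your column (0-6)', type=int)
        if col in board.valid_actions():
            return board.insert(col)
        click.echo(click.style('Invalid column, try again.', fg='red'))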
def test_insert_coins_full():
    b = Board([0, 0, 0b011001100110, 0b01, 0, 0, 0])
    with pytest.raises(ValueError):
        b.insert(2)