def getNextState(self, game_state, player, action):
    """If player takes action on game_state, return the next (state, player).

    The action must be a valid move: a (row, col) pair, a flat index,
    or None for a pass.
    """
    if isinstance(action, (tuple, list, np.ndarray)):
        assert 0 <= action[0] < self.size
        assert 0 <= action[1] < self.size
        action = self.size * action[0] + action[1]
    elif action is None:
        action = self.size ** 2  # pass move

    next_state = gogame.next_state(game_state, action, canonical=False)
    # Derive the next player from gogame's turn channel.
    p = 1 if next_state[govars.TURN_CHNL][0][0] else -1
    return next_state, p
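# A minimal usage sketch of getNextState (the wrapper instance `game` with
# game.size == 9 is hypothetical, not the repo's API). All three action
# encodings resolve to gogame's flat-index convention, where size**2 is
# the pass move.
state = gogame.init_state(9)
next_state, p = game.getNextState(state, 1, (2, 3))     # (row, col) pair
next_state, p = game.getNextState(state, 1, 2 * 9 + 3)  # same move as flat index 21
next_state, p = game.getNextState(state, 1, None)       # pass, encoded as 81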
def step(self, action):
    """Assumes the correct player is making a move. Black goes first.

    Returns (observation, reward, done, info).
    """
    assert not self.done

    if isinstance(action, (tuple, list, np.ndarray)):
        assert 0 <= action[0] < self.size
        assert 0 <= action[1] < self.size
        action = self.size * action[0] + action[1]
    elif action is None:
        action = self.size ** 2  # pass move

    self.state_ = gogame.next_state(self.state_, action, canonical=False)
    self.done = gogame.game_ended(self.state_)
    return np.copy(self.state_), self.reward(), self.done, self.info()
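# Hypothetical usage of step() above; the constructor name `GoEnv(size=7)`
# is assumed for illustration and may not match the repo's actual API.
env = GoEnv(size=7)
obs, reward, done, info = env.step((0, 0))  # black plays at row 0, col 0
obs, reward, done, info = env.step(3)       # white plays flat index 3, i.e. (0, 3)
obs, reward, done, info = env.step(None)    # black passes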
def run(self, model, state, to_play):
    root = Node(0, to_play)

    # EXPAND the root.
    action_probs, value = model.predict(state)
    valid_moves = gogame.valid_moves(state)
    action_probs = action_probs * valid_moves  # mask invalid moves
    action_probs /= np.sum(action_probs)
    root.expand(state, to_play, action_probs)

    for _ in range(self.args['num_simulations']):
        node = root
        search_path = [node]

        # SELECT until we reach a leaf.
        while node.expanded():
            action, node = node.select_child()
            search_path.append(node)

        parent = search_path[-2]
        state = parent.state
        # Now we're at a leaf node and we would like to expand.
        # Players always play from their own perspective.
        next_state = gogame.next_state(state, action, canonical=True)
        # The value of the new state from the perspective of the other player.
        value = gogame.winning(next_state) if gogame.game_ended(next_state) else None

        if value is None:
            # The game has not ended: EXPAND.
            action_probs, value = model.predict(next_state)
            valid_moves = gogame.valid_moves(next_state)
            action_probs = action_probs * valid_moves  # mask invalid moves
            action_probs /= np.sum(action_probs)
            node.expand(next_state, parent.to_play * -1, action_probs)

        self.backpropagate(search_path, value, parent.to_play * -1)

    return root
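# A minimal sketch of the backpropagate() helper that run() calls, following
# the standard AlphaZero update. It assumes Node exposes `value_sum`,
# `visit_count`, and `to_play`; the repository's actual implementation may
# differ.
def backpropagate(self, search_path, value, to_play):
    # Walk back up the path, crediting each node with the leaf value seen
    # from the perspective of that node's player to move.
    for node in reversed(search_path):
        node.value_sum += value if node.to_play == to_play else -value
        node.visit_count += 1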
def execute_episode(self):
    train_examples = []
    current_player = 1
    state = gogame.init_state(self.args['boardSize'])

    while True:
        canonical_board = gogame.canonical_form(state)

        self.mcts = MCTS(self.game, self.model, self.args)
        root = self.mcts.run(self.model, canonical_board, to_play=1)

        # Visit counts over all board points plus the pass move,
        # normalized into a policy target.
        action_probs = [0] * (self.args['boardSize'] * self.args['boardSize'] + 1)
        for k, v in root.children.items():
            action_probs[k] = v.visit_count
        action_probs = action_probs / np.sum(action_probs)

        train_examples.append((canonical_board, current_player, action_probs))

        action = root.select_action(temperature=1)
        state = gogame.next_state(state, action, canonical=False)
        current_player = -current_player

        reward = (gogame.winning(state) * current_player
                  if gogame.game_ended(state) else None)

        if reward is not None:
            ret = []
            # Each training example is (board planes, action probabilities,
            # reward from that player's perspective).
            for hist_state, hist_current_player, hist_action_probs in train_examples:
                tf_board = np.array(
                    [hist_state[0], hist_state[1], hist_state[3]]
                ).transpose().tolist()
                ret.append((tf_board, hist_action_probs,
                            reward * ((-1) ** (hist_current_player != current_player))))
            return ret
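# Worked example of the reward flip above. Suppose black wins
# (gogame.winning(state) == 1) while current_player == -1 at game end,
# so reward == 1 * -1 == -1:
reward, current_player = -1, -1
assert reward * ((-1) ** (1 != current_player)) == 1    # black's examples: +1
assert reward * ((-1) ** (-1 != current_player)) == -1  # white's examples: -1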
def test_batch_canonical_form(self):
    states = gogame.batch_init_state(2, 7)
    states[0] = gogame.next_state(states[0], 0)

    self.assertEqual(states[0, govars.BLACK].sum(), 1)
    self.assertEqual(states[0, govars.WHITE].sum(), 0)

    states = gogame.batch_canonical_form(states)

    self.assertEqual(states[0, govars.BLACK].sum(), 0)
    self.assertEqual(states[0, govars.WHITE].sum(), 1)
    self.assertEqual(states[1, govars.BLACK].sum(), 0)
    self.assertEqual(states[1, govars.WHITE].sum(), 0)

    for i in range(2):
        self.assertEqual(gogame.turn(states[i]), govars.BLACK)

    # Canonicalizing an already-canonical batch should be a no-op.
    canon_again = gogame.batch_canonical_form(states)
    self.assertTrue((canon_again == states).all())
def step(self, action):
    """Assumes the correct player is making a move. Black goes first.

    Returns (observation, reward, done, info).
    """
    assert not self.done

    if isinstance(action, (tuple, list, np.ndarray)):
        assert 0 <= action[0] < self.size
        assert 0 <= action[1] < self.size
        action = self.size * action[0] + action[1]
    elif action is None:
        action = self.size ** 2  # pass move

    self.state_ = gogame.next_state(self.state_, action, canonical=False)
    self.done = gogame.game_ended(self.state_)

    # past_states_with_player stacks the current and previous 7 board states
    # together with the player to play. From front to back:
    #   1. the current player's stones (1 at each stone, 0 elsewhere), X_t
    #      (the player who just moved, NOT the player to play),
    #   2. the opponent's stones, Y_t,
    #   3. the previous 7 timesteps: X_{t-1}, Y_{t-1}, ..., X_{t-7}, Y_{t-7},
    #   4. the player to play, C (a 19x19 array of 1s for black, 0s for white).
    # Slicing [:14] drops the oldest timestep and the stale player plane.
    self.past_states_with_player = np.concatenate(
        (self.state_[self.turn() ^ 1].reshape((1, 19, 19)),
         self.state_[self.turn()].reshape((1, 19, 19)),
         self.past_states_with_player[:14],
         self.state_[2].reshape((1, 19, 19)) ^ 1),
        axis=0)

    return (np.copy(self.state_), np.copy(self.past_states_with_player)), \
        self.reward(), self.done, self.info()
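# A hedged sketch of how past_states_with_player could be initialized in
# reset(): 16 empty stone planes (8 timesteps x 2 players) plus the
# player-to-play plane, all ones since black moves first. The function name
# is illustrative; the repo's reset() may differ.
import numpy as np

def init_past_states_with_player(size=19):
    planes = np.zeros((17, size, size), dtype=int)
    planes[16] = 1  # black to play
    return planes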