def act(self, gs: GameState) -> int:
    # Once the apprentice network has been trained often enough, it takes
    # over from the MCTS expert and picks the best legal action directly.
    if self.apprentice_training_count > self.apprentice_training_before_takeover:
        available_actions = gs.get_available_actions(gs.get_active_player())
        predictions = self.brain.predict(
            np.array([gs.get_vectorized_state()]))[0]
        return available_actions[np.argmax(predictions[available_actions])]

    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        ExpertApprenticeAgent.create_node_in_memory(
            memory, root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player())

    for _ in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION: follow UCB1 while every edge of the node has been visited.
        while not gs_copy.is_game_over() and all(
                edge['n'] > 0 for edge in memory[s]):
            chosen_edge = max(memory[s], key=ExpertApprenticeAgent.ucb_1)
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                ExpertApprenticeAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player())

        # EXPANSION: play one randomly chosen still-unvisited edge.
        if not gs_copy.is_game_over():
            chosen_edge = choice([edge for edge in memory[s] if edge['n'] == 0])
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                ExpertApprenticeAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player())

        # SIMULATION: random rollout until the game ends.
        while not gs_copy.is_game_over():
            gs_copy.step(
                gs_copy.get_active_player(),
                choice(gs_copy.get_available_actions(
                    gs_copy.get_active_player())))

        scores = gs_copy.get_scores()

        # BACKPROPAGATION of the final scores along the visited path.
        for (s, edge) in history:
            edge['n'] += 1
            edge['r'] += scores[edge['p']]
            for neighbour_edge in memory[s]:
                neighbour_edge['np'] += 1

    # Normalized root visit counts become the apprentice's policy target.
    target = np.zeros(gs.get_action_space_size())
    for edge in memory[root_hash]:
        target[edge['a']] = edge['n']
    target /= np.sum(target)

    self.states_buffer.append(gs.get_vectorized_state())
    self.actions_buffer.append(target)

    # Train the apprentice once enough (state, policy target) pairs are buffered.
    if len(self.states_buffer) > 200:
        self.apprentice_training_count += 1
        self.brain.fit(np.array(self.states_buffer),
                       np.array(self.actions_buffer))
        self.states_buffer.clear()
        self.actions_buffer.clear()
        if self.apprentice_training_count > self.apprentice_training_before_takeover:
            print('Apprentice is playing next round')

    return max(memory[root_hash], key=lambda e: e['n'])['a']
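# The listing above assumes `numpy as np` and `random.choice` are imported at
# module level. The helpers `create_node_in_memory` and `ucb_1` are not shown
# in this excerpt; the sketch below is a hypothetical reconstruction inferred
# from their call sites, assuming each edge is a dict with keys 'a' (action),
# 'n' (edge visits), 'r' (cumulative reward), 'np' (parent visits) and
# 'p' (acting player). In the original these are @staticmethods of
# ExpertApprenticeAgent.

from math import log, sqrt

def create_node_in_memory(memory, node_hash, available_actions, player):
    # One edge per legal action, all initially unvisited.
    memory[node_hash] = [
        {'a': a, 'n': 0, 'r': 0.0, 'np': 0, 'p': player}
        for a in available_actions
    ]

def ucb_1(edge, c: float = sqrt(2)):
    # Classic UCB1: mean reward plus an exploration bonus that shrinks as
    # the edge is visited more often relative to its siblings.
    if edge['n'] == 0:
        return float('inf')
    return edge['r'] / edge['n'] + c * sqrt(log(edge['np']) / edge['n'])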
def act(self, gs: GameState) -> int:
    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        q_values = self.brain.predict(gs.get_vectorized_state())
        HalfAlphaZeroAgent.create_node_in_memory(
            memory, root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player(), q_values)

    for _ in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION: follow UCB1 while every edge of the node has been visited.
        while not gs_copy.is_game_over() and all(
                edge['n'] > 0 for edge in memory[s]):
            chosen_edge = max(memory[s], key=HalfAlphaZeroAgent.ucb_1)
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                q_values = self.brain.predict(gs_copy.get_vectorized_state())
                HalfAlphaZeroAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(), q_values)

        # EXPANSION: play one randomly chosen still-unvisited edge; the
        # critic's Q-values are stored on the new node's edges.
        if not gs_copy.is_game_over():
            chosen_edge = choice([edge for edge in memory[s] if edge['n'] == 0])
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                q_values = self.brain.predict(gs_copy.get_vectorized_state())
                HalfAlphaZeroAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(), q_values)

        # No SIMULATION step: the stored Q-value estimates replace the rollout.
        scores = np.zeros(gs_copy.player_count())
        scores_set = np.zeros(gs_copy.player_count())

        # BACKPROPAGATION: each player's score is the Q-value of the first
        # edge that player owns along the path from the root.
        for (s, edge) in history:
            if scores_set[edge['p']] == 0:
                scores_set[edge['p']] = 1.0
                scores[edge['p']] = edge['q']
            edge['n'] += 1
            edge['r'] += scores[edge['p']]
            for neighbour_edge in memory[s]:
                neighbour_edge['np'] += 1

    chosen_action = max(memory[root_hash], key=lambda e: e['n'])['a']

    # Record the transition for the next training phase.
    if len(self.states_buffer) > 0:
        self.rewards_buffer.append(self.intermediate_reward)
    self.states_buffer.append(gs.get_vectorized_state())
    self.actions_buffer.append(
        to_categorical(chosen_action, gs.get_action_space_size()))
    self.intermediate_reward = 0.0

    return chosen_action
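# This listing additionally assumes `to_categorical` (e.g. from keras.utils)
# is imported at module level. The HalfAlphaZero variant of
# `create_node_in_memory` receives the critic's predictions and must store a
# per-action Q-value under the 'q' key read during backpropagation. The
# sketch below is a hypothetical reconstruction from the call sites; the
# indexing of `q_values` assumes `self.brain.predict` returns one Q-value per
# action (possibly as a (1, action_space_size) batch). In the original this
# is a @staticmethod of HalfAlphaZeroAgent.

import numpy as np

def create_node_in_memory(memory, node_hash, available_actions, player,
                          q_values):
    flat_q = np.ravel(q_values)  # tolerate a (1, A)-shaped batch prediction
    memory[node_hash] = [
        {'a': a, 'n': 0, 'r': 0.0, 'np': 0, 'p': player,
         'q': float(flat_q[a])}
        for a in available_actions
    ]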