def play_dumb_game(max_steps=1000, verbose=1):
    """
    This function plays a Tichu game with four "dumb" players.

    Each player iterates over all available combinations and tries to beat
    opponents. New stacks are played with a random combination.
    """
    game = Game(verbose=verbose)
    step_cnt = 0
    game_active = True
    while game_active:
        active_player = game.active_player
        leading_player = game.leading_player
        # pass if the player has already finished
        if game.players[active_player].has_finished():
            suc, _ = game.step(active_player, Cards([]))
        # make a random move if the stack is empty
        elif not game.stack.cards:
            comb = game.players[active_player].random_move()
            suc, _ = game.step(active_player, comb)
        # try to make a matching move if an opponent is leading
        elif ((active_player + leading_player) % 2) != 0:
            leading_type = game.stack.type
            leading_idx = COMB_TYPES[leading_type]
            avail_comb = game.players[
                active_player].hand.get_available_combinations()
            # try to play, starting with the lowest combination
            suc = False
            if avail_comb[leading_idx]:
                for i in range(len(avail_comb[leading_idx])):
                    suc, _ = game.step(active_player,
                                       avail_comb[leading_idx][i])
                    if suc:
                        break
            # try to bomb if no matching combination exists
            if not suc and avail_comb[COMB_TYPES['four_bomb']]:
                suc, _ = game.step(active_player,
                                   avail_comb[COMB_TYPES['four_bomb']][0])
            elif not suc and avail_comb[COMB_TYPES['straight_bomb']]:
                suc, _ = game.step(active_player,
                                   avail_comb[COMB_TYPES['straight_bomb']][0])
            # pass if nothing works
            elif not suc:
                suc, _ = game.step(active_player, Cards([]))
        # pass if the teammate is the leading player
        else:
            suc, _ = game.step(active_player, Cards([]))
        # stop if the game is finished (or the step counter overflows)
        step_cnt += 1
        if game.game_finished or step_cnt >= max_steps:
            game_active = False
            if step_cnt >= max_steps and verbose > 1:
                raise Exception(
                    "Max. steps exceeded. Possible infinite loop detected.")
            break
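# Usage sketch (an assumption, not part of the original code): play one game
# with the four scripted "dumb" players and moderate verbosity.
play_dumb_game(max_steps=1000, verbose=1)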
def run(model_file, width=8, height=8, n=5):
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into an MCTS
        # player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except Exception:
            try:
                policy_param = pickle.load(open(model_file, 'rb'),
                                           encoding='bytes')  # To support python3
            except Exception:
                pass

        best_policy = PolicyValueNet(width, height, model_file=model_file)
        # set a larger n_playout for better performance
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=2000)
        mcts_player2 = MCTSPlayer(best_policy.policy_value_fn,
                                  c_puct=5, n_playout=400)

        # uncomment the following line to play with pure MCTS
        # (it is much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = HumanPlayer()

        # set start_player=0 for the human to move first
        game.start_play(human, mcts_player2, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
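# Usage sketch (an assumption, not part of the original code): the model file
# name below is only an example and must point to a trained policy on disk.
if __name__ == '__main__':
    run('best_policy_8_8_5.model', width=8, height=8, n=5)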
def main():
    game = Game(board_size=3)
    net = NNWrapper(game)
    # net.net = torch.load(os.path.join('./models/', '{}.pt'.format('current')))
    # human_play = HumanPlay(game, net)
    # human_play.play()
    train = Train(game, net)
    train.train()
def human_play(n, width, height, ai_type, is_humanMoveFirst=True):
    # n = 5
    # width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into an MCTS
        # player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # set a larger n_playout for better performance
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)

        # pure MCTS is much weaker even with a larger n_playout
        if ai_type == "pure_mcts":
            mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        else:
            # fail explicitly instead of raising a NameError below
            raise ValueError("Unsupported ai_type: {}".format(ai_type))

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for the human to move first
        start_player = 0 if is_humanMoveFirst else 1
        game.start_play(human, mcts_player, start_player=start_player,
                        is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
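# Usage sketch (an assumption, not part of the original code): start an 8x8
# five-in-a-row game against the pure-MCTS AI with the human moving first.
human_play(n=5, width=8, height=8, ai_type="pure_mcts", is_humanMoveFirst=True)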
class GameEnv(object):

    def __init__(self, level='env/level.csv'):
        self.game = Game(level)
        self.repeat_frame_skip = 4

    def reset(self):
        self.game.reset()
        state = self.game.state()
        self.agent_coord = state['coord']
        return state

    def step(self, action):
        for _ in range(self.repeat_frame_skip):
            self.game.step(action)
        state = self.game.state()
        dead = state['dead']
        goal = state['goal']
        coord = state['coord']
        reward = -1 + (coord[0] - self.agent_coord[0]) + 100 * goal - 100 * dead
        done = dead or goal
        self.agent_coord = coord
        return state, reward, done, {
            'goal': goal,
            'dead': dead,
            'distance': self.agent_coord[0]
        }

    def render(self, mode='rgb_array'):
        pixels = self.game.render(mode)
        pixels = np.swapaxes(pixels, 0, 1)
        return pixels
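# Usage sketch (an assumption, not part of the original code): a single
# environment step; the concrete action values expected by Game.step() are
# not visible in this snippet, so 0 is used purely as a placeholder.
env = GameEnv(level='env/level.csv')
state = env.reset()
state, reward, done, info = env.step(0)  # placeholder action (assumption)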
from copy import deepcopy

from env.cards import DynamicCorpus

if __name__ == "__main__":
    start_iter = 50000
    init_checkpoint = None
    num_epochs = 2000001
    dim_states = 52
    rl = RL(dim_states, lr_a=0.0001, lr_c=0.0001,
            init_checkpoint=init_checkpoint)
    # fine-tune
    if start_iter != 0 and not init_checkpoint:
        rl.load_model('rl', start_iter)
    env = Game()
    for episode in range(start_iter, num_epochs):
        env.reset()
        print()
        history_vec = []
        history_pid = []
        while 1:
            pid = env.now_player_id
            # no one called landlord, or the game is over:
            # record the results for all players
            if env.landlord_count == 3 or env.winner >= 0:
                for i in range(3):
                    (state, f_reward, y_reward, act_ids, dyn_vec, _,
                     label_mask, attn_mask) = env.observe(pid)
                    print('Player', pid, 'received reward', y_reward)
                    pid = (pid + 1) % 3
                env.now_player_id = pid
class TrainPipeline:
    """Learn the optimal policy by training a policy-value network."""

    def __init__(self, board_width=8, board_height=8, n_in_row=5,
                 init_model=None):
        # initialize the board and the game server
        self.board_width = board_width
        self.board_height = board_height
        self.n_in_row = n_in_row
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        self.loss_to_show = -1
        self.entropy_to_show = -1
        # initialize the training parameters
        self.learning_rate = 2e-3
        self.lr_multiplier = 1.0  # adjusted automatically based on the KL divergence
        self.temp = 1.0
        self.n_playout = 400
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # sample batch_size examples per gradient step
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50  # save the model to disk every check_freq batches
        self.game_batch_num = 10000  # play at most 10000 games
        self.best_win_ratio = 0.0
        # the agent's opponent: a player driven by pure MCTS
        self.pure_mcts_playout_num = 1000
        # whether to load a previously trained model
        if init_model:
            self.policy_value_net = PolicyValueNet(
                board_width=self.board_width,
                board_height=self.board_height,
                model_file=init_model)
        else:
            self.policy_value_net = PolicyValueNet(
                board_width=self.board_width,
                board_height=self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
        self.logger = self.init_log()

    def init_log(self):
        """Initialize the logger.

        :return: logger instance
        """
        cur_time = time.strftime('%m%d-%H:%M:%S', time.localtime(time.time()))
        logger_name = str(cur_time)
        logger = init_logger(name=logger_name)
        return logger

    def get_equi_data(self, play_data):
        """Augment the data set by rotation and flipping.

        Since the board is symmetric under rotation and reflection, we can
        obtain more training data from each sample.
        play_data: [(state, mcts_prob, winner_z), ..., ...]

        :param play_data:
        :return: augmented data
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_prob.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """Collect self-play data for training.

        :param n_games:
        :return:
        """
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data set
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)  # store in the deque

    def policy_update(self):
        """Update the policy-value network.

        :return: loss, entropy
        """
        try:
            mini_batch = random.sample(self.data_buffer, self.batch_size)
        except Exception:
            mini_batch = random.sample(list(self.data_buffer),
                                       self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch=state_batch,
                mcts_probs=mcts_probs_batch,
                winner_batch=winner_batch,
                learning_rate=self.learning_rate * self.lr_multiplier)
            self.loss_to_show = loss
            self.entropy_to_show = entropy
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                    np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        # explained_var_old = (1 -
        #                      np.var(np.array(winner_batch) - old_v.flatten()) /
        #                      np.var(np.array(winner_batch)))
        # explained_var_new = (1 -
        #                      np.var(np.array(winner_batch) - new_v.flatten()) /
        #                      np.var(np.array(winner_batch)))
        # print(("kl:{:.5f},"
        #        "lr_multiplier:{:.3f},"
        #        "loss:{},"
        #        "entropy:{},"
        #        "explained_var_old:{:.3f},"
        #        "explained_var_new:{:.3f}"
        #        ).format(kl,
        #                 self.lr_multiplier,
        #                 loss,
        #                 entropy,
        #                 explained_var_old,
        #                 explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=50):
        """Play against a pure-MCTS player to monitor the quality of the
        current policy.

        :param n_games:
        :return: win ratio
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] = win_cnt[winner] + 1
            self.logger.info('round:{}\t, winner:{} '.format(i, winner))
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        self.logger.info("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """Start training.

        :return:
        """
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                self.logger.info(
                    ("batch i:{},\t"
                     "episode_len:{},\t"
                     "loss:{:.8f},\t"
                     "entropy:{:.8f},").format(i + 1,
                                               self.episode_len,
                                               self.loss_to_show,
                                               self.entropy_to_show))
                # start training once the buffer holds enough data
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                if (i + 1) % self.check_freq == 0:
                    self.logger.info("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model(
                        'model/current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        self.logger.info(
                            'update new best policy, win_ratio: ' +
                            str(win_ratio))
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            'model/best_policy.model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            self.logger.info('quit')
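# Usage sketch (an assumption, not part of the original code): create the
# pipeline with its default 8x8, five-in-a-row configuration and start training.
if __name__ == '__main__':
    training_pipeline = TrainPipeline()
    training_pipeline.run()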
class TrainPipeline():
    def __init__(self, init_model=None):
        """ init function for the class"""
        # params of the board and the game
        self.board_width = 6  # board width
        self.board_height = 6  # board height
        self.n_in_row = 4  # win by n in a line (vertically, horizontally, diagonally)
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        # c_puct: a number in (0, inf) controlling the relative impact of
        # value Q and prior probability P on this node's score
        self.c_puct = 5
        self.buffer_size = 10000  # buffer size for replaying experience
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)  # buffer
        self.play_batch_size = 1  # size of rollout for each episode
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02  # target of KL loss
        self.check_freq = 50  # frequency for evaluation and model saving
        self.game_batch_num = 1500  # number of training game loops
        self.best_win_ratio = 0.0  # best evaluated win ratio
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # load from an existing file and
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """Augment the data set by rotation and flipping.

        Description:
            We can increase the training data by simply rotating or flipping
            the state. In this way, we get more data that contributes to
            improving the training of the neural network.

        input params:
            play_data: type: List, [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_prob.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """Collect self-play rollout data for training.

        input param:
            n_games: number of rollouts
        """
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """Update the policy-value net by training the network.

        Pipeline:
            1. sample data from the deque: self.data_buffer
            2. compute action probabilities with the original policy network
            3. train the neural network in a loop on the sampled data
               loop pipeline:
               1. call self.policy_value_net.train_step(state_batch,
                      mcts_probs_batch, winner_batch,
                      self.learn_rate * self.lr_multiplier)
               2. compute action probabilities with the newly trained
                  policy network
               3. compute the KL divergence between the old and new action
                  probabilities
               4. if kl > self.kl_targ * 4, break the loop (early stopping)
            4. adjust the learning rate based on the KL divergence:
               if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
                   self.lr_multiplier /= 1.5  # decrease learning rate
               elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
                   self.lr_multiplier *= 1.5  # increase learning rate
            5. return the final loss and entropy

        :return:
            loss:
            entropy:
        """
        loss, entropy = None, None
        # TODO: code here
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """Policy evaluation.

        Description:
            Evaluate the trained policy by playing against the pure MCTS
            player. Note: this is only for monitoring the progress of
            training.

        Pipeline:
            1. create an MCTSPlayer and a MCTS_Pure player
            2. evaluation loop pipeline:
               1. rollout simulation for AlphaZero vs. pure MCTS:
                  winner = self.game.start_play(
                      current_mcts_player,
                      pure_mcts_player,
                      start_player=i % 2,  # start from either player 1 or 2 evenly
                      is_shown=0)
               2. record the result
            3. compute the winning ratio: win_ratio
               win_ratio = (winning times + 0.5 * tie times) / total times

        return:
            win_ratio
        """
        # TODO: code here
        win_ratio = None
        return win_ratio

    def run(self):
        """Run the training pipeline.

        Description:
            Train AlphaZero in a loop. Loop size: self.game_batch_num.
            Loop pipeline:
            1. collect self-play data by rollouts
            2. update the policy with sampled training data
            3. evaluate model performance (at a fixed frequency)
            4. save the model (at a fixed frequency)
            5. plot the evaluation results
        """
        try:
            # TODO: code here
            pass
        except KeyboardInterrupt:
            print('\n\rquit')
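# A possible fill-in for TrainPipeline.policy_update, sketched only from the
# pipeline described in its docstring (and mirroring the completed pipeline
# earlier in this file). This is an assumption, not the original solution;
# it is meant to be pasted into the class body and expects `random` and
# `numpy as np` to be imported in the module.
def policy_update(self):
    """Update the policy-value net following the documented pipeline."""
    # 1. sample a mini-batch from the replay buffer
    mini_batch = random.sample(list(self.data_buffer), self.batch_size)
    state_batch = [data[0] for data in mini_batch]
    mcts_probs_batch = [data[1] for data in mini_batch]
    winner_batch = [data[2] for data in mini_batch]
    # 2. action probabilities of the current (old) policy
    old_probs, old_v = self.policy_value_net.policy_value(state_batch)
    # 3. train in a loop with KL-based early stopping
    for _ in range(self.epochs):
        loss, entropy = self.policy_value_net.train_step(
            state_batch, mcts_probs_batch, winner_batch,
            self.learn_rate * self.lr_multiplier)
        new_probs, new_v = self.policy_value_net.policy_value(state_batch)
        kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                         np.log(new_probs + 1e-10)),
                            axis=1))
        if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
            break
    # 4. adaptively adjust the learning rate based on the KL divergence
    if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
        self.lr_multiplier /= 1.5  # decrease learning rate
    elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
        self.lr_multiplier *= 1.5  # increase learning rate
    # 5. return the final loss and entropy
    return loss, entropy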
            # swap observation
            observation = observation_
            step += 1

            # break the while loop at the end of this episode
            if done:
                break

        scores.append(env.score)
        if episode % 5 == 0:
            print("#" * 80)
            print(episode, ",", int(step / 10), ",score:", env.score,
                  ",e:", RL.epsilon)
            print("avg-score: {}".format(np.mean(list(scores)[-1500:])))
        if episode % 100 == 0:
            print(observation)
            env.show()


if __name__ == "__main__":
    env = Game()
    RL = DuelingDQN(env.n_actions, env.n_features,
                    learning_rate=1e-4,
                    reward_decay=0.95,
                    e_greedy=0.99,
                    start_epsilon=0.5,
                    e_greedy_increment=1e-5)
    train_2048()
class Env():
    """
    A wrapper for the Tichu Game class to enable Reinforcement Learning.

    Brings a Tichu Game instance into a shape where an (RL-)Agent can:
    1. Observe a state.
    2. Take an action.
    3. Receive a reward.

    The state consists of information from a Player's perspective:
    [Player's hand size, Tichu flag, Player's hand cards]
    [Opponent 1 hand size, Tichu flag, Opponent 1 last move]
    [Teammate hand size, Tichu flag, Teammate's last move]
    [Opponent 2 hand size, Tichu flag, Opponent 2 last move]
    The Cards are one-hot-encoded (OHE), e.g.:
    [1, 0, 0, ... 0, 0] is an OHE representation of the 2 of Spades.
    There are alternative possibilities for the state design which may be
    included in the future.

    The action is also an OHE of Cards, e.g.:
    [1, 0, 0, 0, 1, 0, ... 0] means play a pair of 2s.

    The reward function is designed in two ways:

    Rich rewards means that a reward can be received after each step.
    A step is considered a move by all 4 players. In a rich reward setting,
    the reward is equal to the points in a Stack if the Stack is won by
    either the Player or its teammate. The same reward, but negative, is
    given to the opposing team.
    For example: Player 0 wins a Stack containing 20 points.
    The rewards will be [20, -20, 20, -20] until the next step.

    Sparse rewards means that the rewards are only different from 0 when a
    game has finished. In this case, the rewards exactly match the outcome
    of a Game.
    For example: Team 0 has achieved 60 points, Team 1 has achieved
    40 points. Player 0 has successfully called Tichu (+100 points).
    The rewards will be [160, -60, 160, -60].

    For both reward styles, an invalid move by a Player leads to an
    immediate negative reward.

    Attributes
    ----------
    dispatch_reward: dictionary
        This is to set the reward function (rich/sparse).
    train_mode: bool
        Sets the verbosity of the Game.
    state_size: int
        The size of the state dimension.
    action_size: int
        The size of the action dimension.
    all_cards: list of Card
        A list containing instances of all Cards in a Tichu Deck.
    game: Game
        A Tichu Game instance.
    action_buffer: list of int
        A list containing the last actions of all Players.
    states: list of int
        A list of the states from all Players' perspectives.
    rewards: list of int
        The rewards that an Agent will receive after a step.
    done: bool
        Whether the episode (i.e. Game) is finished.
    nstep: int
        An internal step counter used for rich rewards.

    Methods
    -------
    reset():
        Instantiates a new Game and resets state, action, rewards, done.
    step(player_id, action):
        Takes a step in the Game and updates state, action, rewards, done.
    """

    def __init__(self, train_mode=True,
                 illegal_move_penalty=ILLEGAL_MOVE_PENALTY):
        """
        Constructs a Tichu Environment for RL.

        Parameters
        ----------
        train_mode: bool
            If False, verbosity of the Game will be set to 1.
        """
        # dispatch table for the reward function
        self.dispatch_reward = {
            'rich': self._update_rich_rewards,
            'sparse': self._update_sparse_rewards
        }
        # set verbosity according to mode
        if train_mode:
            self.verbose = 0
        else:
            self.verbose = 1
        self.state_size = 232
        self.action_size = 56
        self.all_cards = Deck().all_cards
        self.game = None
        self.action_buffer = [[None], [None], [None], [None]]
        self.state = [[None], [None], [None], [None]]
        self.rewards = [None, None, None, None]
        self.done = False
        self.illegal_move_penalty = illegal_move_penalty
        self.nstep = 0  # only relevant for rich rewards

    def reset(self):
        """ Resets the Environment. """
        self.game = Game(verbose=self.verbose)
        self._reset_all_states()
        self._reset_action_buffer()
        self._reset_rewards()
        self.done = False
        state = self.state
        rewards = self.rewards
        done = self.done
        active_player = self.game.active_player
        return state, rewards, done, active_player

    def step(self, player_id, action):
        """
        Takes a step in the Game.

        Updates state, action, rewards, done and returns them.

        Parameters
        ----------
        player_id:
            The id (0...3) of the player that makes a move.
        action:
            The action of the player as an OHE Cards representation.
        """
        # convert the action vector and make a game step
        cards = self._vec_to_cards(action)
        suc, points_this_step = self.game.step(player_id, cards)
        # illegal move
        if not suc:
            self.rewards[player_id] = self.illegal_move_penalty
        # legal move
        else:
            self._update_action_buffer(player_id, action)
            self._update_all_states()
            # reset state and action buffer if the stack has been emptied
            # and update rewards according to the points in the stack
            if not self.game.stack.cards:
                self._reset_all_states()
                self._reset_action_buffer()
                self._update_rewards(points_this_step)
            # update rewards for a pass move
            elif cards.type == 'pass':
                self._update_rewards(points_this_step)
            # reset state, action_buffer and rewards if the Dog has been
            # played (required because the Dog skips players)
            elif cards.cards[0].name == 'Dog':
                self._reset_all_states()
                self._reset_action_buffer()
                self._reset_rewards()
            # update rewards for a regular game move
            else:
                self._update_rewards(points_this_step)
        # check if the game is finished
        if self.game.game_finished:
            self.done = True
        # return step variables
        state = self.state
        rewards = self.rewards
        done = self.done
        active_player = self.game.active_player
        return state, rewards, done, active_player

    def info(self):
        """ Outputs the size of the state and action dimensions. """
        return self.state_size, self.action_size

    def _reset_all_states(self):
        """
        Resets the state to the initial setting.

        Initial game state of player i:
        i:     [hand_size, tichu_flag, hand_cards (OHE)]
        i + 1: [hand_size, tichu_flag, played_cards (OHE)]
        i + 2: [hand_size, tichu_flag, played_cards (OHE)]
        i + 3: [hand_size, tichu_flag, played_cards (OHE)]
        """
        self.state = list()
        for i in range(4):
            this_player = i
            player_state = list()
            for j in range(4):
                pid = (this_player + j) % 4
                hand_size = self.game.players[pid].hand_size
                tichu_flag = int(self.game.players[pid].tichu_flag)
                if pid == this_player:
                    player_cards = self._cards_to_vec(
                        self.game.players[pid].hand)
                else:
                    player_cards = np.zeros(len(self.all_cards), int).tolist()
                player_state.append([hand_size, tichu_flag, player_cards])
            self.state.append(player_state)

    def _update_all_states(self):
        """ Updates states with the latest actions taken by other players. """
        self.state = list()
        for i in range(4):
            this_player = i
            player_state = list()
            for j in range(4):
                pid = (this_player + j) % 4
                hand_size = self.game.players[pid].hand_size
                tichu_flag = int(self.game.players[pid].tichu_flag)
                if pid == this_player:
                    player_cards = self._cards_to_vec(
                        self.game.players[pid].hand)
                else:
                    player_cards = self.action_buffer[pid]
                player_state.append([hand_size, tichu_flag, player_cards])
            self.state.append(player_state)

    def _reset_action_buffer(self):
        """ Resets the action buffer. """
        for i in range(4):
            self.action_buffer[i] = np.zeros(len(self.all_cards),
                                             int).tolist()

    def _update_action_buffer(self, player_id, action):
        """ Updates the action buffer. """
        self.action_buffer[player_id] = action.tolist()

    def _reset_rewards(self):
        """ Resets the rewards to 0. """
        self.rewards = [0, 0, 0, 0]
        self.nstep = self.game.active_player

    def _update_rewards(self, points_this_step):
        """ Updates the rewards according to the reward style. """
        self.dispatch_reward[REWARD_STYLE](points_this_step)

    def _update_rich_rewards(self, points_this_step):
        """
        Updates the rewards according to a rich reward function.

        This implementation of a reward function promises rewards after
        each round (i.e. consecutive steps of all 4 players). If a player
        or its teammate (!) gets points during a round (e.g. by winning a
        stack), it gets a reward in the amount of the points in this round.

        The benefit of this reward function is that each step promises a
        reward (i.e. no sparse rewards that may impede learning). The
        danger is that the actual points are assigned at the end of a game,
        which means the last player loses all its points to the first
        finisher. This may lead to a non-ideal game strategy, where lots of
        rewards might be collected during the game, but the game is
        actually lost if the player does not finish early. Also, the
        cumulative reward is higher for players that finish later. However,
        if the winning team gets more cumulative reward, then this reward
        design will still lead to a good policy.
        """
        # reset rewards every new player round
        self.rewards[self.nstep] = 0
        # accumulate rewards (teammate rewards are also taken into account)
        # opponent rewards are considered negative
        rewards_team_0 = (points_this_step[0] + points_this_step[2])
        rewards_team_1 = (points_this_step[1] + points_this_step[3])
        self.rewards[0] += (rewards_team_0 - rewards_team_1)
        self.rewards[1] += (rewards_team_1 - rewards_team_0)
        self.rewards[2] += (rewards_team_0 - rewards_team_1)
        self.rewards[3] += (rewards_team_1 - rewards_team_0)
        # update the nstep counter
        self.nstep = (self.nstep + 1) % 4

    def _update_sparse_rewards(self, points_this_step):
        """
        Updates the rewards according to a sparse reward function.

        Sparse rewards means that rewards are only received when a Game is
        completed. The benefit is that the rewards exactly represent the
        outcome of the Game. The danger is that it is hard for an Agent to
        make sense of its actions when the rewards come only at the end of
        an episode.

        The sparse rewards are not yet implemented!
        """
        raise NotImplementedError("TODO")

    def _cards_to_vec(self, cards):
        """ Turns a Cards instance into a vector representation. """
        vec = np.zeros(len(self.all_cards), int)
        for i in range(len(self.all_cards)):
            crd = Cards([self.all_cards[i]])
            if cards.contains(crd):
                vec[i] = 1
        return vec.tolist()

    def _vec_to_cards(self, vec):
        """ Turns a vector representation into a Cards instance. """
        return Cards(list(compress(self.all_cards, vec)))