def inference_minor_util60(role_id, handcards, num, is_pair, dup_mask, main_cards_char):
    """Greedily infer `num` minor cards (kickers) for a main-card group.

    Mutates `handcards` (char cards) and `dup_mask` in place; `main_cards_char`
    is removed from the hand first so the network only sees the remainder.
    Relies on the module-level policy function `func`.
    Returns the chosen minor cards as a list of char cards (pairs contribute
    two entries each).
    """
    # take the main cards out of the hand before querying the network
    for main_card in main_cards_char:
        handcards.remove(main_card)
    s = get_mask(handcards, action_space, None).astype(np.float32)
    outputs = []
    minor_type = 1 if is_pair else 0
    for i in range(num):
        input_single, input_pair, _, _ = get_masks(handcards, None)
        # only the minor-response head of the network output is used here
        _, _, _, _, _, _, minor_response_prob = func(
            [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.array([minor_type])]
        )
        # give minor cards
        mask = None
        if is_pair:
            # pad pair mask with two zeros (jokers can never form a pair kicker)
            mask = np.concatenate([input_pair, [0, 0]]) * dup_mask
        else:
            mask = input_single * dup_mask
        minor_response = take_action_from_prob(minor_response_prob, mask)
        # forbid picking the same value twice across iterations
        dup_mask[minor_response] = 0
        # convert network output to char cards (values are 3-based)
        handcards.remove(to_char(minor_response + 3))
        if is_pair:
            handcards.remove(to_char(minor_response + 3))
        # refresh the state after discarding the chosen minor card(s)
        s = get_mask(handcards, action_space, None).astype(np.float32)
        # save to output
        outputs.append(to_char(minor_response + 3))
        if is_pair:
            outputs.append(to_char(minor_response + 3))
    return outputs
def player_cards(self):
    """Map every agent name to its current hand (char-card representation)."""
    hidden_hands = self.get_last_two_handcards()
    idx = self.get_current_idx()
    hands = {}
    hands[self.agent_names[(idx + 2) % 3]] = to_char(hidden_hands[1])
    hands[self.agent_names[(idx + 1) % 3]] = to_char(hidden_hands[0])
    hands[self.agent_names[idx]] = self.get_curr_handcards()
    return hands
def get_state_and_action_spaces(self, action=None):
    """Build the (state, available_actions) pair for the hierarchical policy.

    When `self._comb_mask` is set, enumerates card combinations for the
    current hand and encodes each as a stacked embedding; otherwise `action`
    selects a previously-built combination and the state row is repeated.
    Returns (state ndarray of shape num_actions[0] x num_actions[1] x feat,
    available_actions list).
    """
    def cards_char2embedding(cards_char):
        # look up the row of the action space that exactly matches these cards
        test = (action_space_onehot60 == Card.char2onehot60(cards_char))
        test = np.all(test, axis=1)
        target = np.where(test)[0]
        return self.encoding[target[0]]
    last_two_cards_char = self.player.get_last_two_cards()
    last_two_cards_char = [to_char(cards) for cards in last_two_cards_char]
    # prefer the most recent non-empty move as "cards to beat"
    last_cards_char = last_two_cards_char[0]
    if not last_cards_char:
        last_cards_char = last_two_cards_char[1]
    curr_cards_char = to_char(self.player.get_curr_handcards())
    if self._comb_mask:
        # print(curr_cards_char, last_cards_char)
        combs = self.get_combinations(curr_cards_char, last_cards_char)
        if len(combs) > self.num_actions[0]:
            # too many combinations: subsample to fit the fixed action budget
            combs, self._fine_mask = self.subsample_combs_masks(combs, self._fine_mask, self.num_actions[0])
        # TODO: utilize temporal relations to speedup
        available_actions = [[action_space[idx] for idx in comb] for comb in combs]
        # print(available_actions)
        # print('-------------------------------------------')
        assert len(combs) > 0
        if self._fine_mask is not None:
            self._fine_mask = self.pad_fine_mask(self._fine_mask)
        self.pad_action_space(available_actions)
        state = [np.stack([self.encoding[idx] for idx in comb]) for comb in combs]
        assert len(state) > 0
        prob_state = self.player.get_state_prob()
        # test = action_space_onehot60 == Card.char2onehot60(last_cards_char)
        # test = np.all(test, axis=1)
        # target = np.where(test)[0]
        # assert target.size == 1
        # shared context appended to every combination's encoding
        extra_state = np.concatenate([cards_char2embedding(last_two_cards_char[0]), cards_char2embedding(last_two_cards_char[1]), prob_state])
        for i in range(len(state)):
            state[i] = np.concatenate([state[i], np.tile(extra_state[None, :], [state[i].shape[0], 1])], axis=-1)
        state = self.pad_state(state)
        assert state.shape[0] == self.num_actions[0] and state.shape[1] == self.num_actions[1]
    else:
        # second stage: an action (combination index) must already be chosen
        assert action is not None
        if self._fine_mask is not None:
            self._fine_mask = self._fine_mask[action]
        available_actions = self._action_space[action]
        state = self._current_ob[action:action+1, :, :]
        state = np.repeat(state, self.num_actions[0], axis=0)
        assert state.shape[0] == self.num_actions[0] and state.shape[1] == self.num_actions[1]
    return state, available_actions
def _populate_exp(self):
    """Populate one transition into the replay memory using epsilon-greedy.

    Steps the trained role once (random or argmax-Q action), fast-forwards
    the other roles via step_auto, and on game end resets/prepares the env
    until the trained role is first to act again.
    """
    old_s = self._current_ob
    if self.rng.rand() <= self.exploration:
        # explore: uniform random action index
        act = self.rng.choice(range(self.num_actions))
    else:
        # exploit: mask illegal actions with NaN and take nanargmax
        mask = get_mask(to_char(self.player.get_curr_handcards()), action_space, to_char(self.player.get_last_outcards()))
        q_values = self.predictor(old_s[None, ...])[0][0]
        q_values[mask == 0] = np.nan
        act = np.nanargmax(q_values)
        assert act < self.num_actions
    reward, isOver, _ = self.player.step_manual(to_value(action_space[act]))
    # step for AI
    while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
        _, reward, _ = self.player.step_auto()
        isOver = (reward != 0)
    # reward sign convention flips when training the lord (role 2)
    if ROLE_ID_TO_TRAIN == 2:
        reward = -reward
    self._current_game_score.feed(reward)
    if isOver:
        # print('lord wins' if reward > 0 else 'farmer wins')
        self._player_scores.feed(self._current_game_score.sum)
        # print(self._current_game_score.sum)
        while True:
            self.player.reset()
            # init_cards = np.arange(36)
            # self.player.prepare_manual(init_cards)
            self.player.prepare()
            early_stop = False
            # auto-play until it is the trained role's turn
            while self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
                _, reward, _ = self.player.step_auto()
                isOver = (reward != 0)
                if isOver:
                    print('prestart ends too early! now resetting env')
                    early_stop = True
                    break
            if early_stop:
                continue
            self._current_ob = self.get_state()
            break
        self._current_game_score.reset()
    self._current_ob = self.get_state()
    self.mem.append(Experience(old_s, act, reward, isOver))
def play_one_episode(env, func):
    """Play one complete game; the trained role takes argmax-Q legal actions,
    all other roles auto-play. Returns 1 when the final reward is positive."""
    env.reset()
    env.prepare()
    reward = 0
    while reward == 0:
        if env.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, reward, _ = env.step_auto()
            continue
        state = get_state(env)
        legal = get_mask(to_char(env.get_curr_handcards()), action_space,
                         to_char(env.get_last_outcards()))
        q = func(state[None, ...])[0][0]
        # knock out illegal actions so nanargmax only considers legal ones
        q[legal == 0] = np.nan
        best = np.nanargmax(q)
        reward, _, _ = env.step_manual(to_value(action_space[best]))
    return int(reward > 0)
def play_one_episode(env, func):
    """Play one complete game; `func.predict` drives the trained role while
    the other roles auto-play. Returns 1 when the final reward is positive."""
    env.reset()
    env.prepare()
    reward = 0
    while reward == 0:
        if env.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, reward, _ = env.step_auto()
            continue
        hand = to_char(env.get_curr_handcards())
        recent_moves = [to_char(cards) for cards in env.get_last_two_cards()]
        probs = env.get_state_prob()
        # print(agent, handcards)
        chosen = func.predict(hand, recent_moves, probs)
        # print(agent, ' gives ', action)
        reward, _, _ = env.step_manual(to_value(chosen))
    return int(reward > 0)
def step_auto(self):
    """Advance one auto-played step, tracking table control.

    If the acting agent played cards (non-pass), it becomes the controller.
    Returns (reward, game_over).
    """
    acting_idx = self.get_current_idx()
    # print(idx)
    played, reward, _ = super().step_auto()
    played = to_char(played)
    if len(played) > 0:
        # a non-empty move hands control of the table to the acting agent
        self.controller = self.agent_names[acting_idx]
    # print(self.agent_names[idx], 'gives', intention, self.controller)
    assert np.all(self.get_state_prob() >= 0) and np.all(self.get_state_prob() <= 1)
    # print(intention)
    return reward, reward != 0
def get_state(env):
    """Build the flat state vector: own-hand one-hot, probability state, and
    embeddings of the last two moves."""
    def _embed(cards_char):
        # index of the action-space row that exactly matches these cards
        hits = np.all(action_space_onehot60 == Card.char2onehot60(cards_char), axis=1)
        row = np.where(hits)[0][0]
        return encoding[row]
    prob = env.get_state_prob()
    base = np.concatenate([Card.val2onehot60(env.get_curr_handcards()), prob])
    recent = [to_char(c) for c in env.get_last_two_cards()]
    return np.concatenate([base, _embed(recent[0]), _embed(recent[1])])
def play_one_episode(env, func, role_id):
    """Play one complete game with `func.predict` controlling `role_id`;
    other roles auto-play. Returns 1 when the final reward is positive."""
    env.reset()
    env.prepare()
    r = 0
    while r == 0:
        if env.get_role_ID() == role_id:
            handcards = to_char(env.get_curr_handcards())
            last_two_cards = env.get_last_two_cards()
            last_two_cards = [to_char(cards) for cards in last_two_cards]
            prob_state = env.get_state_prob()
            # print(agent, handcards)
            action = func.predict(handcards, last_two_cards, prob_state)
            # print(agent, ' gives ', action)
            intention = to_value(action)
            r, _, _ = env.step_manual(intention)
            # print('lord gives', to_char(intention), file=f)
            assert (intention is not None)
        else:
            intention, r, _ = env.step_auto()
    return int(r > 0)
def ai_play():
    """HTTP endpoint: compute the AI's move for the current player.

    Expects a JSON body with 'current_player', 'last_player',
    'player_cards' ('|'-separated hands) and 'last_move'; responds with
    {'move': <cards string or 'P' for pass>}.
    """
    data = request.json
    print(data)
    pos = int(data['current_player'])
    my_cards = trans_cards(data['player_cards'].split("|")[pos])
    # If the previous move was our own, we lead freely: nothing to beat.
    # (The original code computed trans_cards(data['last_move']) once more
    # before this branch; that assignment was dead and has been removed.)
    if int(data['last_player']) == pos:
        last_move = []
    else:
        last_move = trans_cards(data['last_move'])
    intention = to_char(
        CEnv.step_auto_static(Card.char2color(my_cards), to_value(last_move)))
    res = trans_cards_reverse(intention)
    if res == "":
        res = 'P'  # empty intention means pass
    print("result is {}".format(res))
    return jsonify({'move': res})
def get_mask(self):
    """Return the legal-action mask for the current (act, mode) sub-decision.

    PASSIVE modes mask decisions/responses/bombs against the last played
    cards; ACTIVE modes mask against an empty table. MINOR_RESPONSE shares
    one mask construction for both acts (deduplicated into a local helper).
    Returns None for unknown modes (matching the original fall-through).
    """
    def minor_mask():
        # Pairs mask is padded with two zeros (jokers can never pair);
        # every card value already committed to this intention is knocked out.
        input_single, input_pair, _, _ = get_masks(
            self.curr_handcards_char, None)
        if self.minor_type == 1:
            mask = np.append(input_pair, [0, 0])
        else:
            mask = input_single
        for v in set(self.intention):
            mask[v - 3] = 0
        return mask

    if self.act == ACT_TYPE.PASSIVE:
        decision_mask, response_mask, bomb_mask, _ = get_mask_alter(
            self.curr_handcards_char, to_char(self.last_cards_value),
            self.category)
        if self.mode == MODE.PASSIVE_DECISION:
            return decision_mask
        elif self.mode == MODE.PASSIVE_RESPONSE:
            return response_mask
        elif self.mode == MODE.PASSIVE_BOMB:
            return bomb_mask
        elif self.mode == MODE.MINOR_RESPONSE:
            return minor_mask()
    elif self.act == ACT_TYPE.ACTIVE:
        decision_mask, response_mask, _, length_mask = get_mask_alter(
            self.curr_handcards_char, [], self.category)
        if self.mode == MODE.ACTIVE_DECISION:
            return decision_mask
        elif self.mode == MODE.ACTIVE_RESPONSE:
            return response_mask[self.active_decision]
        elif self.mode == MODE.ACTIVE_SEQ:
            return length_mask[self.active_decision][self.active_response]
        elif self.mode == MODE.MINOR_RESPONSE:
            return minor_mask()
def run(self):
    """Simulator worker loop: play games forever, exchanging states and
    actions with the training server over ZeroMQ (PUSH for states, DEALER
    for actions). The LSTM hidden state is carried across steps and reset
    at episode boundaries.
    """
    player = self._build_player()
    context = zmq.Context()
    c2s_socket = context.socket(zmq.PUSH)
    c2s_socket.setsockopt(zmq.IDENTITY, self.identity)
    c2s_socket.set_hwm(10)
    c2s_socket.connect(self.c2s)
    s2c_socket = context.socket(zmq.DEALER)
    s2c_socket.setsockopt(zmq.IDENTITY, self.identity)
    s2c_socket.connect(self.s2c)
    player.reset()
    # init_cards = np.arange(52)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    # player.prepare_manual(init_cards)
    player.prepare()
    r, is_over = 0, False
    # LSTM hidden state carried across turns; size presumably matches the
    # network's 2x1024 cell/hidden layout — TODO confirm against the model
    lstm_state = np.zeros([1024 * 2])
    while True:
        role_id = player.get_role_ID()
        if role_id in ROLE_IDS_TO_TRAIN:
            prob_state, all_state, curr_handcards_value, last_cards_value, last_category = \
                player.get_state_prob(), player.get_state_all_cards(), player.get_curr_handcards(), player.get_last_outcards(), player.get_last_outcategory_idx()
            prob_state = np.concatenate(
                [Card.val2onehot60(curr_handcards_value), prob_state])
            # after taking the last action, get to this state and get this reward/isOver.
            # If isOver, get to the next-episode state immediately.
            # This tuple is not the same as the one put into the memory buffer
            is_active = False if last_cards_value.size > 0 else True
            mask = get_mask(
                to_char(curr_handcards_value), action_space,
                None if is_active else to_char(last_cards_value))
            if is_active:
                # the "pass" action is illegal when leading
                mask[0] = 0
            last_two_cards = player.get_last_two_cards()
            last_two_cards_onehot = np.concatenate([
                Card.val2onehot60(last_two_cards[0]),
                Card.val2onehot60(last_two_cards[1])
            ])
            # ship the full observation tuple to the server...
            c2s_socket.send(dumps(
                (self.identity, role_id, prob_state, all_state,
                 last_two_cards_onehot, mask, 0 if is_active else 1,
                 lstm_state, r, is_over)),
                copy=False)
            # ...and block until it returns an action plus updated LSTM state
            action_idx, lstm_state = loads(
                s2c_socket.recv(copy=False).bytes)
            r, is_over, _ = player.step_manual(
                to_value(action_space[action_idx]))
        else:
            _, r, _ = player.step_auto()
            is_over = (r != 0)
        if is_over:
            # print('{} over with reward {}'.format(self.identity, r))
            # logger.info('{} over with reward {}'.format(self.identity, r))
            # sys.stdout.flush()
            player.reset()
            player.prepare()
            lstm_state = np.zeros([1024 * 2])
def get_last_two_cards(self):
    """Return the last two moves converted from values to char cards."""
    return [to_char(move) for move in super().get_last_two_cards()]
def get_last_outcards(self):
    """Return the cards to beat, converted to the char representation."""
    raw_values = super().get_last_outcards()
    return to_char(raw_values)
def intention(self, env):
    """Ask the built-in rule engine for a move given the env's current state,
    returned as char cards."""
    hand_colors = Card.char2color(env.get_curr_handcards())
    last_move_values = to_value(env.get_last_outcards())
    return to_char(CEnv.step_auto_static(hand_colors, last_move_values))
def data_generator(rng):
    """Endless supervised-training sample generator.

    Plays games with the built-in auto player and yields 11-tuples
    (state, last_cards, passive_decision, passive_bomb, passive_response,
    active_decision, active_response, seq_length, minor_response,
    minor_type, mode) — one tuple per network head to train, with zeros in
    the unused slots and the trailing integer selecting the head ("mode").
    """
    env = Env(rng.randint(1 << 31))
    # logger.info('called')
    while True:
        env.reset()
        env.prepare()
        r = 0
        while r == 0:
            last_cards_value = env.get_last_outcards()
            last_cards_char = to_char(last_cards_value)
            last_out_cards = Card.val2onehot60(last_cards_value)
            last_category_idx = env.get_last_outcategory_idx()
            curr_cards_char = to_char(env.get_curr_handcards())
            is_active = True if last_cards_value.size == 0 else False
            s = env.get_state_prob()
            # s = s[:60]
            intention, r, category_idx = env.step_auto()
            # category 14 is skipped entirely — presumably an unsupported
            # move type for these heads; TODO confirm against Category enum
            if category_idx == 14:
                continue
            minor_cards_targets = pick_minor_targets(category_idx, to_char(intention))
            # self, state, last_cards, passive_decision_target, passive_bomb_target, passive_response_target,
            # active_decision_target, active_response_target, seq_length_target, minor_response_target, minor_type, mode
            if not is_active:
                if category_idx == Category.QUADRIC.value and category_idx != last_category_idx:
                    # bomb over a non-bomb: decision=1 plus a bomb-response sample
                    passive_decision_input = 1
                    passive_bomb_input = intention[0] - 3
                    yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0
                    yield s, last_out_cards, 0, passive_bomb_input, 0, 0, 0, 0, 0, 0, 1
                else:
                    if category_idx == Category.BIGBANG.value:
                        # rocket (both jokers): decision=2, no response sample
                        passive_decision_input = 2
                        yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0
                    else:
                        if category_idx != Category.EMPTY.value:
                            passive_decision_input = 3
                            # OFFSET_ONE
                            # 1st, Feb - remove relative card output since shift is hard for the network to learn
                            passive_response_input = intention[0] - 3
                            if passive_response_input < 0:
                                print("something bad happens")
                                passive_response_input = 0
                            yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0
                            yield s, last_out_cards, 0, 0, passive_response_input, 0, 0, 0, 0, 0, 2
                        else:
                            # pass
                            passive_decision_input = 0
                            yield s, last_out_cards, passive_decision_input, 0, 0, 0, 0, 0, 0, 0, 0
            else:
                seq_length = get_seq_length(category_idx, intention)
                # ACTIVE OFFSET ONE!
                active_decision_input = category_idx - 1
                active_response_input = intention[0] - 3
                yield s, last_out_cards, 0, 0, 0, active_decision_input, 0, 0, 0, 0, 3
                yield s, last_out_cards, 0, 0, 0, 0, active_response_input, 0, 0, 0, 4
                if seq_length is not None:
                    # length offset one
                    seq_length_input = seq_length - 1
                    yield s, last_out_cards, 0, 0, 0, 0, 0, seq_length_input, 0, 0, 5
            if minor_cards_targets is not None:
                # emit one minor-response sample per kicker, progressively
                # discarding cards from a working copy of hand and state
                main_cards = pick_main_cards(category_idx, to_char(intention))
                handcards = curr_cards_char.copy()
                state = s.copy()
                for main_card in main_cards:
                    handcards.remove(main_card)
                cards_onehot = Card.char2onehot60(main_cards)
                # we must make the order in each 4 batch correct...
                discard_onehot_from_s_60(state, cards_onehot)
                is_pair = False
                minor_type = 0
                if category_idx == Category.THREE_TWO.value or category_idx == Category.THREE_TWO_LINE.value:
                    is_pair = True
                    minor_type = 1
                for target in minor_cards_targets:
                    target_val = Card.char2value_3_17(target) - 3
                    yield state.copy(
                    ), last_out_cards, 0, 0, 0, 0, 0, 0, target_val, minor_type, 6
                    cards = [target]
                    handcards.remove(target)
                    if is_pair:
                        if target not in handcards:
                            # inconsistent data: pair kicker not present twice
                            print('something wrong...')
                            print('minor', target)
                            print('main_cards', main_cards)
                            print('handcards', handcards)
                            print('intention', intention)
                            print('category_idx', category_idx)
                        else:
                            handcards.remove(target)
                            cards.append(target)
                    # correct for one-hot state
                    cards_onehot = Card.char2onehot60(cards)
                    # print(s.shape)
                    # print(cards_onehot.shape)
                    discard_onehot_from_s_60(state, cards_onehot)
def play_one_episode(env, func):
    """Play one complete game with the LSTM policy network driving the
    trained roles; other roles auto-play. Returns 1 when the final reward
    is positive."""
    def take_action_from_prob(prob, mask):
        # argmax over legal actions; illegal entries are forced below any
        # attainable probability
        prob = prob[0]
        # to avoid numeric difficulty
        prob[mask == 0] = -1
        return np.argmax(prob)
    env.reset()
    # init_cards = np.arange(52)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    # env.prepare_manual(init_cards)
    env.prepare()
    r = 0
    # LSTM hidden state carried across the episode
    lstm_state = np.zeros([1024 * 2])
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_two_cards = env.get_last_two_cards()
        last_two_cards_onehot = np.concatenate([
            Card.val2onehot60(last_two_cards[0]),
            Card.val2onehot60(last_two_cards[1])
        ])
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = True if last_cards_value.size == 0 else False
        s = env.get_state_prob()
        s = np.concatenate([Card.char2onehot60(curr_cards_char), s])
        # print(s.shape)
        role_id = env.get_role_ID()
        # print('%s current cards' % ('lord' if role_id == 2 else 'farmer'), curr_cards_char)
        if role_id in ROLE_IDS_TO_TRAIN:
            if is_active:
                # first get mask
                mask = get_mask(curr_cards_char, action_space, None)
                # not valid for active
                mask[0] = 0
                active_prob, _, lstm_state = func(np.array([role_id]), s.reshape(1, -1), np.zeros([1, 120]), lstm_state.reshape(1, -1))
                # make decision depending on output
                action_idx = take_action_from_prob(active_prob, mask)
            else:
                # print('last cards char', last_cards_char)
                mask = get_mask(curr_cards_char, action_space, last_cards_char)
                _, passive_prob, lstm_state = func(
                    np.array([role_id]), s.reshape(1, -1), last_two_cards_onehot.reshape(1, -1), lstm_state.reshape(1, -1))
                action_idx = take_action_from_prob(passive_prob, mask)
            # since step auto needs full last card group info, we do not explicitly feed card type
            intention = to_value(action_space[action_idx])
            r, _, _ = env.step_manual(intention)
            # print('lord gives', to_char(intention))
            assert (intention is not None)
        else:
            intention, r, _ = env.step_auto()
            # print('farmer gives', to_char(intention))
    # if r > 0:
    #     print('farmer wins')
    # else:
    #     print('lord wins')
    return int(r > 0)
def play_one_episode(env, func):
    """Play one complete game with the hierarchical 60-dim network driving
    the lord (role 2); farmers auto-play. Returns 1 when the final reward
    is positive."""
    def take_action_from_prob(prob, mask):
        prob = prob[0]
        # to avoid numeric difficulty
        prob[mask == 0] = -1
        return np.argmax(prob)
    # return char minor cards output
    def inference_minor_util60(role_id, handcards, num, is_pair, dup_mask, main_cards_char):
        # Greedily choose `num` minor cards, mutating handcards/dup_mask;
        # see module-level twin of this helper for the same algorithm.
        for main_card in main_cards_char:
            handcards.remove(main_card)
        s = get_mask(handcards, action_space, None).astype(np.float32)
        outputs = []
        minor_type = 1 if is_pair else 0
        for i in range(num):
            input_single, input_pair, _, _ = get_masks(handcards, None)
            _, _, _, _, _, _, minor_response_prob = func(
                [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.array([minor_type])]
            )
            # give minor cards
            mask = None
            if is_pair:
                mask = np.concatenate([input_pair, [0, 0]]) * dup_mask
            else:
                mask = input_single * dup_mask
            minor_response = take_action_from_prob(minor_response_prob, mask)
            dup_mask[minor_response] = 0
            # convert network output to char cards
            handcards.remove(to_char(minor_response + 3))
            if is_pair:
                handcards.remove(to_char(minor_response + 3))
            s = get_mask(handcards, action_space, None).astype(np.float32)
            # save to output
            outputs.append(to_char(minor_response + 3))
            if is_pair:
                outputs.append(to_char(minor_response + 3))
        return outputs
    def inference_minor_cards60(role_id, category, s, handcards, seq_length, dup_mask, main_cards_char):
        # Dispatch on category: how many kickers and whether they are pairs.
        if category == Category.THREE_ONE.value:
            return inference_minor_util60(role_id, handcards, 1, False, dup_mask, main_cards_char)
        if category == Category.THREE_TWO.value:
            return inference_minor_util60(role_id, handcards, 1, True, dup_mask, main_cards_char)
        if category == Category.THREE_ONE_LINE.value:
            return inference_minor_util60(role_id, handcards, seq_length, False, dup_mask, main_cards_char)
        if category == Category.THREE_TWO_LINE.value:
            return inference_minor_util60(role_id, handcards, seq_length, True, dup_mask, main_cards_char)
        if category == Category.FOUR_TWO.value:
            return inference_minor_util60(role_id, handcards, 2, False, dup_mask, main_cards_char)
    env.reset()
    init_cards = np.arange(21)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    env.prepare_manual(init_cards)
    r = 0
    while r == 0:
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_out_cards = Card.val2onehot60(last_cards_value)
        last_category_idx = env.get_last_outcategory_idx()
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = True if last_cards_value.size == 0 else False
        s = get_mask(curr_cards_char, action_space, None if is_active else last_cards_char).astype(np.float32)
        last_state = get_mask(last_cards_char, action_space, None).astype(np.float32)
        # print(s.shape)
        role_id = env.get_role_ID()
        # print('%s current cards' % ('lord' if role_id == 2 else 'farmer'), curr_cards_char)
        intention = None
        if role_id == 2:
            if is_active:
                # first get mask
                decision_mask, response_mask, _, length_mask = get_mask_alter(curr_cards_char, [], last_category_idx)
                _, _, _, active_decision_prob, active_response_prob, active_seq_prob, _ = func(
                    [np.array([role_id]), s.reshape(1, -1), np.zeros([1, 9085]), np.zeros([s.shape[0]])]
                )
                # make decision depending on output
                active_decision = take_action_from_prob(active_decision_prob, decision_mask)
                active_category_idx = active_decision + 1
                # get response
                active_response = take_action_from_prob(active_response_prob, response_mask[active_decision])
                seq_length = 0
                # next sequence length
                if active_category_idx == Category.SINGLE_LINE.value or \
                        active_category_idx == Category.DOUBLE_LINE.value or \
                        active_category_idx == Category.TRIPLE_LINE.value or \
                        active_category_idx == Category.THREE_ONE_LINE.value or \
                        active_category_idx == Category.THREE_TWO_LINE.value:
                    seq_length = take_action_from_prob(active_seq_prob, length_mask[active_decision][active_response]) + 1
                # give main cards
                intention = give_cards_without_minor(active_response, last_cards_value, active_category_idx, seq_length)
                # then give minor cards
                if active_category_idx == Category.THREE_ONE.value or \
                        active_category_idx == Category.THREE_TWO.value or \
                        active_category_idx == Category.THREE_ONE_LINE.value or \
                        active_category_idx == Category.THREE_TWO_LINE.value or \
                        active_category_idx == Category.FOUR_TWO.value:
                    # forbid kickers that duplicate the main-card values
                    dup_mask = np.ones([15])
                    if seq_length > 0:
                        for i in range(seq_length):
                            dup_mask[intention[0] - 3 + i] = 0
                    else:
                        dup_mask[intention[0] - 3] = 0
                    intention = np.concatenate([intention, to_value(inference_minor_cards60(role_id, active_category_idx, s.copy(), curr_cards_char.copy(), seq_length, dup_mask, to_char(intention)))])
            else:
                # print(to_char(last_cards_value), is_bomb, last_category_idx)
                decision_mask, response_mask, bomb_mask, _ = get_mask_alter(curr_cards_char, to_char(last_cards_value), last_category_idx)
                passive_decision_prob, passive_bomb_prob, passive_response_prob, _, _, _, _ = func(
                    [np.array([role_id]), s.reshape(1, -1), last_state.reshape(1, -1), np.zeros([s.shape[0]])])
                passive_decision = take_action_from_prob(passive_decision_prob, decision_mask)
                if passive_decision == 0:
                    # pass
                    intention = np.array([])
                elif passive_decision == 1:
                    passive_bomb = take_action_from_prob(passive_bomb_prob, bomb_mask)
                    # converting 0-based index to 3-based value
                    intention = np.array([passive_bomb + 3] * 4)
                elif passive_decision == 2:
                    # rocket: both jokers
                    intention = np.array([16, 17])
                elif passive_decision == 3:
                    passive_response = take_action_from_prob(passive_response_prob, response_mask)
                    intention = give_cards_without_minor(passive_response, last_cards_value, last_category_idx, None)
                    if last_category_idx == Category.THREE_ONE.value or \
                            last_category_idx == Category.THREE_TWO.value or \
                            last_category_idx == Category.THREE_ONE_LINE.value or \
                            last_category_idx == Category.THREE_TWO_LINE.value or \
                            last_category_idx == Category.FOUR_TWO.value:
                        dup_mask = np.ones([15])
                        seq_length = get_seq_length(last_category_idx, last_cards_value)
                        if seq_length:
                            for i in range(seq_length):
                                dup_mask[intention[0] - 3 + i] = 0
                        else:
                            dup_mask[intention[0] - 3] = 0
                        intention = np.concatenate([intention, to_value(inference_minor_cards60(role_id, last_category_idx, s.copy(), curr_cards_char.copy(), seq_length, dup_mask, to_char(intention)))])
            # since step auto needs full last card group info, we do not explicitly feed card type
            r, _, _ = env.step_manual(intention)
            # print('lord gives', to_char(intention))
            assert (intention is not None)
        else:
            intention, r, _ = env.step_auto()
            # print('farmer gives', to_char(intention))
    # if r > 0:
    #     print('farmer wins')
    # else:
    #     print('lord wins')
    return int(r > 0)
def state(self):
    """Legal-action mask for the current hand, as float32."""
    if self.act == ACT_TYPE.ACTIVE:
        cards_to_beat = None  # leading: nothing to respond to
    else:
        cards_to_beat = to_char(self.last_cards_value)
    return get_mask(self.handcards_char, action_space, cards_to_beat).astype(np.float32)
def step(self, action):
    """Advance the sub-decision state machine by one network action.

    The full move is assembled across several modes: decide, respond,
    (optionally) sequence length, then minor cards one at a time. Sets
    `self.finished` once `self.intention` is complete; `self.card_type`
    holds the category of the move being built.
    """
    if self.act == ACT_TYPE.PASSIVE:
        if self.mode == MODE.PASSIVE_DECISION:
            # 0 = pass, 1 = bomb, 2 = rocket, 3 = normal response
            if action == 0 or action == 2:
                self.finished = True
                if action == 2:
                    self.intention = np.array([16, 17])
                    self.card_type = Category.BIGBANG.value
                else:
                    self.card_type = Category.EMPTY.value
                return
            elif action == 1:
                self.mode = MODE.PASSIVE_BOMB
                return
            elif action == 3:
                self.mode = MODE.PASSIVE_RESPONSE
                return
            else:
                raise Exception('unexpected action')
        elif self.mode == MODE.PASSIVE_BOMB:
            # convert to value input
            self.intention = np.array([action + 3] * 4)
            self.finished = True
            self.card_type = Category.QUADRIC.value
            return
        elif self.mode == MODE.PASSIVE_RESPONSE:
            self.intention = give_cards_without_minor(
                action, self.last_cards_value, self.category, None)
            if self.category == Category.THREE_ONE.value or \
                    self.category == Category.THREE_TWO.value or \
                    self.category == Category.THREE_ONE_LINE.value or \
                    self.category == Category.THREE_TWO_LINE.value or \
                    self.category == Category.FOUR_TWO.value:
                if self.category == Category.THREE_TWO.value or self.category == Category.THREE_TWO_LINE.value:
                    self.minor_type = 1
                self.mode = MODE.MINOR_RESPONSE
                # modify the state for minor cards
                # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(self.intention))
                intention_char = to_char(self.intention)
                for c in intention_char:
                    self.handcards_char.remove(c)
                self.minor_length = get_seq_length(self.category, self.last_cards_value)
                if self.minor_length is None:
                    # non-line categories: FOUR_TWO needs 2 kickers, rest need 1
                    self.minor_length = 2 if self.category == Category.FOUR_TWO.value else 1
                self.card_type = self.category
                return
            else:
                self.finished = True
                self.card_type = self.category
                return
        elif self.mode == MODE.MINOR_RESPONSE:
            # one kicker per step: a single or a pair depending on minor_type
            minor_value_cards = [action + 3
                                 ] * (1 if self.minor_type == 0 else 2)
            # modify the state for minor cards
            minor_char = to_char(minor_value_cards)
            for c in minor_char:
                self.handcards_char.remove(c)
            # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(minor_value_cards))
            self.intention = np.append(self.intention, minor_value_cards)
            assert self.minor_length > 0
            self.minor_length -= 1
            if self.minor_length == 0:
                self.finished = True
                return
            else:
                return
    elif self.act == ACT_TYPE.ACTIVE:
        if self.mode == MODE.ACTIVE_DECISION:
            # decision index is the category offset by one
            self.category = action + 1
            self.active_decision = action
            self.mode = MODE.ACTIVE_RESPONSE
            self.card_type = self.category
            return
        elif self.mode == MODE.ACTIVE_RESPONSE:
            if self.category == Category.SINGLE_LINE.value or \
                    self.category == Category.DOUBLE_LINE.value or \
                    self.category == Category.TRIPLE_LINE.value or \
                    self.category == Category.THREE_ONE_LINE.value or \
                    self.category == Category.THREE_TWO_LINE.value:
                # line categories need a sequence length next
                self.active_response = action
                self.mode = MODE.ACTIVE_SEQ
                return
            elif self.category == Category.THREE_ONE.value or \
                    self.category == Category.THREE_TWO.value or \
                    self.category == Category.FOUR_TWO.value:
                if self.category == Category.THREE_TWO.value or self.category == Category.THREE_TWO_LINE.value:
                    self.minor_type = 1
                self.mode = MODE.MINOR_RESPONSE
                self.intention = give_cards_without_minor(
                    action, np.array([]), self.category, None)
                # modify the state for minor cards
                intention_char = to_char(self.intention)
                for c in intention_char:
                    self.handcards_char.remove(c)
                # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(self.intention))
                self.minor_length = 2 if self.category == Category.FOUR_TWO.value else 1
                return
            else:
                self.intention = give_cards_without_minor(
                    action, np.array([]), self.category, None)
                self.finished = True
                return
        elif self.mode == MODE.ACTIVE_SEQ:
            # chosen length is offset by one; minor_length mirrors it for
            # the kicker-carrying line categories
            self.minor_length = action + 1
            self.intention = give_cards_without_minor(
                self.active_response, np.array([]), self.category, action + 1)
            if self.category == Category.THREE_ONE_LINE.value or \
                    self.category == Category.THREE_TWO_LINE.value:
                if self.category == Category.THREE_TWO.value or self.category == Category.THREE_TWO_LINE.value:
                    self.minor_type = 1
                self.mode = MODE.MINOR_RESPONSE
                # modify the state for minor cards
                intention_char = to_char(self.intention)
                for c in intention_char:
                    self.handcards_char.remove(c)
                # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(self.intention))
            else:
                self.finished = True
            return
        elif self.mode == MODE.MINOR_RESPONSE:
            minor_value_cards = [action + 3
                                 ] * (1 if self.minor_type == 0 else 2)
            # modify the state for minor cards
            minor_char = to_char(minor_value_cards)
            for c in minor_char:
                self.handcards_char.remove(c)
            # discard_onehot_from_s_60(self.prob_state, Card.val2onehot60(minor_value_cards))
            self.intention = np.append(self.intention, minor_value_cards)
            assert self.minor_length > 0
            self.minor_length -= 1
            if self.minor_length == 0:
                self.finished = True
                return
            else:
                return
def step_auto(self):
    """Advance one auto-played step; returns (reward, game_over)."""
    played, reward, _ = super().step_auto()
    played = to_char(played)
    assert np.all(self.get_state_prob() >= 0) and np.all(self.get_state_prob() <= 1)
    # print(intention)
    return reward, reward != 0
class Env:
    """Pure-Python 3-player Dou Di Zhu table: deals cards, tracks turns,
    played cards and table control; agent at index 0 is the lord."""
    # full deck as char cards: values 3..15 four times, plus the two jokers
    total_cards = sorted(to_char(np.arange(3, 16)) * 4 + ['*', '$'],
                         key=lambda k: Card.cards_to_value[k])

    def __init__(self, agent_names=('agent1', 'agent2', 'agent3')):
        # per-instance seed so parallel envs shuffle differently
        seed = (id(self) + int(datetime.now().strftime("%Y%m%d%H%M%S%f"))) % 4294967295
        np.random.seed(seed)
        self.agent_names = agent_names
        self.reset()

    def get_all_agent_names(self):
        return self.agent_names

    def get_curr_agent_name(self):
        return self.curr_player

    def reset(self):
        """Clear all per-game state; call prepare() afterwards to deal."""
        self.histories = {n: [] for n in self.agent_names}
        self.player_cards = {n: [] for n in self.agent_names}
        self.extra_cards = []
        self.lord = None
        self.controller = None
        self.last_cards_char = []
        self.out_cards = [[] for _ in range(3)]
        self.curr_player = None

    def get_role_ID(self):
        """Role of the current player: 2 = lord, 3 = next farmer, 1 = other farmer."""
        curr_idx = self.get_current_idx()
        assert 0 <= curr_idx <= 2
        if curr_idx == 0:
            return 2
        if curr_idx == 1:
            return 3
        return 1

    def get_current_idx(self):
        return self.agent_names.index(self.curr_player)

    def prepare(self):
        """Shuffle and deal: 20 cards to the lord (incl. the 3 extras), 17 each
        to the farmers; the lord leads."""
        cards = Env.total_cards.copy()
        np.random.shuffle(cards)
        self.extra_cards = cards[17:20]
        self.player_cards[self.agent_names[0]] = sorted(cards[:20], key=lambda k: Card.cards_to_value[k])
        self.player_cards[self.agent_names[1]] = sorted(cards[20:37], key=lambda k: Card.cards_to_value[k])
        self.player_cards[self.agent_names[2]] = sorted(cards[37:], key=lambda k: Card.cards_to_value[k])
        self.lord = self.agent_names[0]
        self.controller = self.lord
        self.curr_player = self.lord

    def step(self, intention):
        """Apply `intention` (char cards; empty = pass) for the current player.

        Returns (next/current player name, game_over).
        """
        print(self.get_curr_agent_name() + str(self.get_curr_handcards()) + " play:")
        print(str(intention))
        self.out_cards[self.agent_names.index(self.curr_player)] = intention
        if len(intention) == 0:
            # pass: turn moves on, control unchanged
            self.curr_player = self.agent_names[(self.agent_names.index(self.curr_player) + 1) % len(self.agent_names)]
            return self.curr_player, False
        else:
            self.last_cards_char = intention
            self.controller = self.curr_player
            for card in intention:
                self.player_cards[self.curr_player].remove(card)
            self.histories[self.curr_player].extend(intention)
            if len(self.player_cards[self.curr_player]) == 0:
                # hand emptied: game over, current player wins
                if 'agent1' == self.curr_player:
                    print("winner is landlord ")
                else:
                    print("winner is fammer ")
                return self.curr_player, True
            else:
                self.curr_player = self.agent_names[
                    (self.agent_names.index(self.curr_player) + 1) % len(self.agent_names)]
                return self.curr_player, False

    def get_last_outcards(self):
        # empty when the current player holds control (free to lead)
        return self.last_cards_char.copy() if self.curr_player != self.controller else []

    def get_last_two_cards(self):
        # moves of the two opponents, most recent first relative to turn order
        return [self.out_cards[(self.agent_names.index(self.curr_player) + 2) % len(self.agent_names)].copy(),
                self.out_cards[(self.agent_names.index(self.curr_player) + 1) % len(self.agent_names)].copy()]

    def get_curr_handcards(self):
        return self.player_cards[self.curr_player].copy()

    def get_state_prob(self):
        """Estimate, per 60-dim card slot, the probability each opponent holds
        the remaining unseen cards, split proportionally to their hand sizes.
        Returns a 120-dim vector: [next player's probs, next-next player's probs].
        """
        total_cards = np.ones([60])
        # zero the unused duplicate slots for the two jokers — presumably the
        # 60-dim layout gives each of the 15 values four slots; TODO confirm
        # against Card.char2onehot60
        total_cards[53:56] = 0
        total_cards[57:60] = 0
        player_idx = self.get_current_idx()
        remain_cards = total_cards - Card.char2onehot60(self.get_curr_handcards()
                                                        + self.histories[self.agent_names[player_idx]]
                                                        + self.histories[self.agent_names[(player_idx + 1) % 3]]
                                                        + self.histories[self.agent_names[(player_idx + 2) % 3]])
        # sanity check
        # remain_cards_check = Card.char2onehot60(self.player_cards[self.agent_names[(player_idx + 1) % 3]] + self.player_cards[self.agent_names[(player_idx + 2) % 3]])
        # remain_cards_cp = remain_cards.copy()
        # normalize(remain_cards_cp, 0, 60)
        # assert np.all(remain_cards_cp == remain_cards_check)
        next_cnt = len(self.player_cards[self.agent_names[(player_idx + 1) % len(self.agent_names)]])
        next_next_cnt = len(self.player_cards[self.agent_names[(player_idx + 2) % len(self.agent_names)]])
        right_prob_state = remain_cards * (next_cnt / (next_cnt + next_next_cnt))
        left_prob_state = remain_cards * (next_next_cnt / (next_cnt + next_next_cnt))
        prob_state = np.concatenate([right_prob_state, left_prob_state])
        return prob_state
def get_curr_handcards(self): return to_char(super().get_curr_handcards())
def run(self):
    """Simulation worker loop.

    Builds a player, connects a PUSH (client-to-server) and a DEALER
    (server-to-client) ZMQ socket identified by ``self.identity``, then plays
    games forever: when the trainee's role is to move (role_id == 2), each
    sub-decision state is sent to the server and the returned action applied;
    otherwise the built-in AI steps automatically. Resets and re-deals when a
    game ends. Never returns.
    """
    player = self._build_player()
    context = zmq.Context()
    # PUSH socket: stream (state, reward, is_over) tuples to the server.
    c2s_socket = context.socket(zmq.PUSH)
    c2s_socket.setsockopt(zmq.IDENTITY, self.identity)
    c2s_socket.set_hwm(10)  # small high-water mark to bound queued states
    c2s_socket.connect(self.c2s)
    # DEALER socket: receive the chosen action back, matched by identity.
    s2c_socket = context.socket(zmq.DEALER)
    s2c_socket.setsockopt(zmq.IDENTITY, self.identity)
    s2c_socket.connect(self.s2c)
    player.reset()
    init_cards = np.arange(21)
    # init_cards = np.append(init_cards[::4], init_cards[1::4])
    player.prepare_manual(init_cards)
    r, is_over = 0, False
    while True:
        all_state, role_id, curr_handcards_value, last_cards_value, last_category = \
            player.get_state_all_cards(), player.get_role_ID(), player.get_curr_handcards(), player.get_last_outcards(), player.get_last_outcategory_idx()
        # after taking the last action, get to this state and get this reward/isOver.
        # If isOver, get to the next-episode state immediately.
        # This tuple is not the same as the one put into the memory buffer
        is_active = (last_cards_value.size == 0)  # active = free to lead, nothing to beat
        # Build one legal-action mask per player from their 60-dim card slice,
        # then flatten the three masks into a single state vector.
        all_state = np.stack([
            get_mask(
                Card.onehot2char(all_state[i * 60:(i + 1) * 60]),
                action_space,
                None if is_active else to_char(last_cards_value)).astype(
                np.float32) for i in range(3)
        ]).reshape(-1)
        last_state = get_mask(to_char(last_cards_value), action_space, None).astype(np.float32)
        if role_id == 2:
            # Trainee's turn: decompose the move into sub-decisions and let
            # the server pick each one.
            st = SubState(
                ACT_TYPE.PASSIVE if last_cards_value.size > 0 else ACT_TYPE.ACTIVE,
                all_state, to_char(curr_handcards_value), last_cards_value,
                last_category)
            if last_cards_value.size > 0:
                assert last_category > 0
            first_st = True
            while not st.finished:
                c2s_socket.send(dumps(
                    (self.identity, role_id, st.state, st.all_state,
                     last_state, first_st, st.get_mask(), st.minor_type,
                     st.mode, r, is_over)), copy=False)
                first_st = False
                action = loads(s2c_socket.recv(copy=False).bytes)
                # logger.info('received action {}'.format(action))
                # print(action)
                st.step(action)
                # print(st.intention)
                assert st.card_type != -1
            r, is_over, category_idx = player.step_manual(st.intention)
        else:
            # Not the trainee: let the built-in AI move; nonzero reward
            # signals game over.
            _, r, _ = player.step_auto()
            is_over = (r != 0)
        if is_over:
            # print('{} over with reward {}'.format(self.identity, r))
            # logger.info('{} over with reward {}'.format(self.identity, r))
            # sys.stdout.flush()
            player.reset()
            player.prepare_manual(init_cards)
def play_one_episode(env, func):
    """Play one scripted game and measure the network's prediction accuracy.

    The environment plays itself via ``env.step_auto()``; each chosen move is
    compared against the corresponding network head output from ``func``.
    Returns seven StatCounters feeding 0/1 accuracy samples:
    [0] passive decision, [1] passive bomb response, [2] passive response,
    [3] active decision, [4] active response, [5] sequence length,
    [6] minor-card response.
    """
    env.reset()
    env.prepare()
    r = 0
    stats = [StatCounter() for _ in range(7)]
    while r == 0:  # r becomes nonzero when the game ends
        last_cards_value = env.get_last_outcards()
        last_cards_char = to_char(last_cards_value)
        last_out_cards = Card.val2onehot60(last_cards_value)
        last_category_idx = env.get_last_outcategory_idx()
        curr_cards_char = to_char(env.get_curr_handcards())
        is_active = True if last_cards_value.size == 0 else False
        s = env.get_state_prob()
        intention, r, category_idx = env.step_auto()
        # category 14 is skipped entirely (presumably a move type excluded
        # from evaluation — TODO confirm against the Category enum)
        if category_idx == 14:
            continue
        minor_cards_targets = pick_minor_targets(category_idx, to_char(intention))
        if not is_active:
            # Passive: responding to an opponent's cards.
            if category_idx == Category.QUADRIC.value and category_idx != last_category_idx:
                # Bomb played over a non-bomb: decision class 1 + bomb head.
                passive_decision_input = 1
                passive_bomb_input = intention[0] - 3  # card value offset to 0-based
                passive_decision_prob, passive_bomb_prob, _, _, _, _, _ = func(
                    [
                        s.reshape(1, -1),
                        last_out_cards.reshape(1, -1),
                        np.zeros([s.shape[0]])
                    ])
                stats[0].feed(
                    int(passive_decision_input == np.argmax(
                        passive_decision_prob)))
                stats[1].feed(
                    int(passive_bomb_input == np.argmax(passive_bomb_prob)))
            else:
                if category_idx == Category.BIGBANG.value:
                    # Rocket (joker pair): decision class 2.
                    passive_decision_input = 2
                    passive_decision_prob, _, _, _, _, _, _ = func([
                        s.reshape(1, -1),
                        last_out_cards.reshape(1, -1),
                        np.zeros([s.shape[0]])
                    ])
                    stats[0].feed(
                        int(passive_decision_input == np.argmax(
                            passive_decision_prob)))
                else:
                    if category_idx != Category.EMPTY.value:
                        # Ordinary follow: decision class 3 + response head.
                        passive_decision_input = 3
                        # OFFSET_ONE
                        # 1st, Feb - remove relative card output since shift is hard for the network to learn
                        passive_response_input = intention[0] - 3
                        if passive_response_input < 0:
                            print("something bad happens")
                            passive_response_input = 0
                        passive_decision_prob, _, passive_response_prob, _, _, _, _ = func(
                            [
                                s.reshape(1, -1),
                                last_out_cards.reshape(1, -1),
                                np.zeros([s.shape[0]])
                            ])
                        stats[0].feed(
                            int(passive_decision_input == np.argmax(
                                passive_decision_prob)))
                        stats[2].feed(
                            int(passive_response_input == np.argmax(
                                passive_response_prob)))
                    else:
                        # Pass: decision class 0.
                        passive_decision_input = 0
                        passive_decision_prob, _, _, _, _, _, _ = func([
                            s.reshape(1, -1),
                            last_out_cards.reshape(1, -1),
                            np.zeros([s.shape[0]])
                        ])
                        stats[0].feed(
                            int(passive_decision_input == np.argmax(
                                passive_decision_prob)))
        else:
            # Active: free to lead any category.
            seq_length = get_seq_length(category_idx, intention)
            # ACTIVE OFFSET ONE!
            active_decision_input = category_idx - 1
            active_response_input = intention[0] - 3
            _, _, _, active_decision_prob, active_response_prob, active_seq_prob, _ = func(
                [
                    s.reshape(1, -1),
                    last_out_cards.reshape(1, -1),
                    np.zeros([s.shape[0]])
                ])
            stats[3].feed(
                int(active_decision_input == np.argmax(active_decision_prob)))
            stats[4].feed(
                int(active_response_input == np.argmax(active_response_prob)))
            if seq_length is not None:
                # length offset one
                seq_length_input = seq_length - 1
                stats[5].feed(
                    int(seq_length_input == np.argmax(active_seq_prob)))
        if minor_cards_targets is not None:
            # Evaluate the minor-card head: remove the main cards from a copy
            # of the state, then predict each kicker in sequence.
            main_cards = pick_main_cards(category_idx, to_char(intention))
            handcards = curr_cards_char.copy()
            state = s.copy()
            for main_card in main_cards:
                handcards.remove(main_card)
            cards_onehot = Card.char2onehot60(main_cards)
            # we must make the order in each 4 batch correct...
            discard_onehot_from_s_60(state, cards_onehot)
            is_pair = False
            minor_type = 0
            if category_idx == Category.THREE_TWO.value or category_idx == Category.THREE_TWO_LINE.value:
                # kickers are pairs for three-with-pair categories
                is_pair = True
                minor_type = 1
            for target in minor_cards_targets:
                target_val = Card.char2value_3_17(target) - 3
                _, _, _, _, _, _, minor_response_prob = func([
                    state.copy().reshape(1, -1),
                    last_out_cards.reshape(1, -1),
                    np.array([minor_type])
                ])
                stats[6].feed(
                    int(target_val == np.argmax(minor_response_prob)))
                cards = [target]
                handcards.remove(target)
                if is_pair:
                    if target not in handcards:
                        # inconsistent hand: the pair's second card is missing
                        logger.warn('something wrong...')
                        logger.warn('minor', target)
                        logger.warn('main_cards', main_cards)
                        logger.warn('handcards', handcards)
                    else:
                        handcards.remove(target)
                        cards.append(target)
                # correct for one-hot state
                cards_onehot = Card.char2onehot60(cards)
                # print(s.shape)
                # print(cards_onehot.shape)
                discard_onehot_from_s_60(state, cards_onehot)
    return stats
def ccardgroup2char(cg): return [to_char(int(c) + 3) for c in cg.cards]
def _populate_exp(self):
    """Populate one transition by epsilon-greedy action selection.

    Two-phase action space: when ``self._comb_mask`` is True the action picks
    a combination (no env step, zero reward); otherwise it picks a concrete
    card group within the chosen combination and steps the environment.
    Appends an Experience(old_state, act, reward, isOver, comb_mask,
    fine_mask) to ``self.mem`` and advances ``self._current_ob``.
    """
    old_s = self._current_ob
    comb_mask = self._comb_mask
    if not self._comb_mask and self._fine_mask is not None:
        # Pad the fine mask with zeros up to the larger action dimension so
        # its shape is uniform across both phases.
        fine_mask = self._fine_mask if self._fine_mask.shape[0] == max(self.num_actions[0], self.num_actions[1]) \
            else np.pad(self._fine_mask, (0, max(self.num_actions[0], self.num_actions[1]) - self._fine_mask.shape[0]),
                        'constant', constant_values=(0, 0))
    else:
        fine_mask = np.ones(
            [max(self.num_actions[0], self.num_actions[1])], dtype=np.bool)
    last_cards_value = self.player.get_last_outcards()
    if self.rng.rand() <= self.exploration:
        # Explore: uniform random among valid actions.
        if not self._comb_mask and self._fine_mask is not None:
            # Mask out invalid fine actions with NaN so nanargmax skips them.
            q_values = np.random.rand(self.num_actions[1])
            q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan
            act = np.nanargmax(q_values)
            # print(q_values)
            # print(act)
        else:
            act = self.rng.choice(
                range(self.num_actions[0 if comb_mask else 1]))
    else:
        # Exploit: pick argmax of predicted Q-values.
        q_values = self.predictor(old_s[None, :, :, :], np.array([comb_mask]),
                                  np.array([fine_mask]))[0][0]
        if not self._comb_mask and self._fine_mask is not None:
            q_values = q_values[:self.num_actions[1]]
            # invalid actions should already carry a large negative Q
            assert np.all(q_values[np.where(np.logical_not(
                self._fine_mask))[0]] < -100)
            q_values[np.where(np.logical_not(self._fine_mask))[0]] = np.nan
            act = np.nanargmax(q_values)
            assert act < self.num_actions[0 if comb_mask else 1]
        # print(q_values)
        # print(act)
    # clamp action to valid range
    act = min(act, self.num_actions[0 if comb_mask else 1] - 1)
    if comb_mask:
        # Combination-selection phase: no env interaction yet.
        reward = 0
        isOver = False
    else:
        if last_cards_value.size > 0:
            if act > 0:
                # Sanity check: chosen group must beat the cards on the table.
                if not CardGroup.to_cardgroup(
                        self._action_space[act]).bigger_than(
                        CardGroup.to_cardgroup(
                            to_char(last_cards_value))):
                    print('warning, some error happened')
        # print(to_char(self.player.get_curr_handcards()))
        reward, isOver, _ = self.player.step_manual(
            to_value(self._action_space[act]))
        # print(self._action_space[act])
        # step for AI: advance built-in players until it is the trainee's
        # turn again or the game ends (nonzero reward).
        while not isOver and self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
            _, reward, _ = self.player.step_auto()
            isOver = (reward != 0)
    # if landlord negate the reward
    if ROLE_ID_TO_TRAIN == 2:
        reward = -reward
    self._current_game_score.feed(reward)
    if isOver:
        # print('lord wins' if reward > 0 else 'farmer wins')
        self._player_scores.feed(self._current_game_score.sum)
        # print(self._current_game_score.sum)
        # Restart games until one survives past the pre-start auto phase.
        while True:
            self.player.reset()
            # init_cards = np.arange(36)
            # self.player.prepare_manual(init_cards)
            self.player.prepare()
            self._comb_mask = True
            early_stop = False
            while self.player.get_role_ID() != ROLE_ID_TO_TRAIN:
                _, reward, _ = self.player.step_auto()
                isOver = (reward != 0)
                if isOver:
                    print('prestart ends too early! now resetting env')
                    early_stop = True
                    break
            if early_stop:
                continue
            self._current_ob, self._action_space = self.get_state_and_action_spaces(
            )
            break
        self._current_game_score.reset()
    else:
        # Alternate between combination phase and fine phase.
        self._comb_mask = not self._comb_mask
        self._current_ob, self._action_space = self.get_state_and_action_spaces(
            act if not self._comb_mask else None)
    self.mem.append(
        Experience(old_s, act, reward, isOver, comb_mask, fine_mask))