class FeatureStrengthOffline:
    # maps the number of community cards (0/3/4/5) to a step index (0-3)
    step_map = {0: 0, 3: 1, 4: 2, 5: 3}

    def __init__(self):
        self.cfc = CardFeatureCompute()
        self.cfvc = CardFeatureVectorCompute()
        self.feature_num = self.cfc.feature_num
        self.step_num = 4
        self.feature_prediction_map = np.zeros([self.step_num, self.feature_num])
        self.feature_frequency_map = np.zeros([self.step_num, self.feature_num])
        # each entry consists of 2 elements: 1. a list of 4 step vectors, 2. the round result (0/1)
        self.info_list = []
        # consists of 4 step vectors
        self.step_list = []

    def raw_feed(self, hole, community, step):
        # step represents the number of community cards: 0, 3, 4 or 5
        assert (step in [0, 3, 4, 5])
        hole_list = []
        community_list = []
        step_true = self.step_map[step]
        for card in hole:
            hole_list.append(Card.from_str(card))
        for card in community:
            community_list.append(Card.from_str(card))
        feature_vector = self.get_step_feature_vector(hole_list, community_list, step_true)
        self.step_list.append(feature_vector)

    def feed_result(self, result):
        assert (len(self.step_list) < 5)
        self.info_list.append([self.step_list, result])
        self.step_list = []

    def get_step_feature_dict(self, hole, community, step):
        assert ((step >= 0) and (step < 4))
        feature_dict = self.cfc.fetch_feature(hole, community)
        return feature_dict

    def get_step_feature_vector(self, hole, community, step):
        assert ((step >= 0) and (step < 4))
        feature_vector = np.array(self.cfvc.fetch_feature(hole, community))
        return feature_vector

    def feed_bunch_feature_prob_map(self, round_info_list):
        for info in round_info_list:
            self.feed_round_feature_prob_map(info[0], info[1])

    def feed_round_feature_prob_map(self, feature_vector_list, result):
        assert (len(feature_vector_list) < 5)
        for i in range(len(feature_vector_list)):
            self.feed_step_feature_prob_map(feature_vector_list[i], result, i)

    def feed_step_feature_prob_map(self, feature_vector, result, step):
        vfunc_f = np.vectorize(self.check_if_present)
        vfunc_p = np.vectorize(self.compare_predict)
        self.feature_frequency_map[step] = np.add(
            self.feature_frequency_map[step], vfunc_f(feature_vector))
        self.feature_prediction_map[step] = np.add(
            self.feature_prediction_map[step], vfunc_p(feature_vector, result))

    def feed_self_feature_prob_map(self):
        self.feed_bunch_feature_prob_map(self.info_list)

    def output_feature_map(self):
        # per-step win rate of each feature: predictions / frequencies (0 where the feature never appeared)
        final_feature_map = []
        for i in range(self.step_num):
            f = self.feature_frequency_map[i]
            p = self.feature_prediction_map[i]
            feature_step_map = np.true_divide(p, f, out=np.zeros_like(p), where=f != 0)
            final_feature_map.append(feature_step_map)
        return final_feature_map

    def output_weight_suggest(self):
        # normalise each step's feature map so the weights sum to 1
        final_weight_suggest = []
        normalised_feature_p = self.output_feature_map()
        for v in normalised_feature_p:
            total = np.sum(v)
            if total == 0:
                final_weight_suggest.append(np.zeros(self.feature_num))
            else:
                normalised = np.true_divide(v, total)
                final_weight_suggest.append(normalised)
        return final_weight_suggest

    @staticmethod
    def check_if_present(predict):
        return predict > 0.5

    @staticmethod
    def compare_predict(predict, result):
        return result if predict > 0.5 else 0
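

# Illustrative usage sketch: how FeatureStrengthOffline might be driven to derive
# per-step feature weights from one recorded hand. It assumes Card.from_str accepts
# pypokerengine-style card strings such as 'SA' (suit then rank); the hand and the
# helper name below are examples only, not part of the training pipeline.
def _demo_feature_strength_offline():
    fso = FeatureStrengthOffline()
    hole = ['SA', 'HA']
    # feed the hole/community cards seen at each step of one round
    fso.raw_feed(hole, [], 0)                              # preflop
    fso.raw_feed(hole, ['D2', 'C7', 'HK'], 3)              # flop
    fso.raw_feed(hole, ['D2', 'C7', 'HK', 'S9'], 4)        # turn
    fso.raw_feed(hole, ['D2', 'C7', 'HK', 'S9', 'DQ'], 5)  # river
    fso.feed_result(1)                  # 1 = the hand won the round, 0 = lost
    fso.feed_self_feature_prob_map()    # accumulate frequency/prediction maps
    return fso.output_weight_suggest()  # normalised per-step feature weights

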
class RTPlayer(BasePokerPlayer):
    def __init__(self):
        super(RTPlayer, self).__init__()
        self.pp = pprint.PrettyPrinter(indent=2)

        ## basic records
        self.street_map = {'preflop': 0, 'flop': 1, 'river': 2, 'turn': 3}
        self.rev_street_map = {0: 'preflop', 1: 'flop', 2: 'river', 3: 'turn'}
        self.nParams = 4
        # self.learn_factor = [0.01, 0.01, 0.01, 0.01]
        self.learn_factor = 0
        self.accumulate = 0

        ## updated every game
        self.initial_stack = 0
        self.seat_id = 0
        self.max_round = 0
        self.small_blind_amount = 0

        ## params used in the training part,
        ## updated at the start of every round
        self.hole_card = []
        self.stack_record = [[0] * 2] * 2
        self.total_gain = [0, 0]
        self.bet_has_placed = [0, 0]
        self.estimated_step_rewards = []
        self.ifRandom = False

        # updated every street
        self.feature_vector = np.ones(self.nParams + 1)
        self.q_suggest = {'raise': 0, 'call': 0, 'fold': 0}
        self.street_idx = 0
        self.probs = {'raise': 0.5, 'call': 0.4, 'fold': 0.1}

        # TODO: how to initialise theta
        # alternative: self.step_theta = Trained_other_feature().get_weights()
        # per-street, per-action weight vectors of the linear Q model, initialised to zeros
        self.step_theta = [
            self.theta_single_step(self.nParams + 1) for _ in range(4)
        ]

        # helpers to compute the hand strength
        self.cfvc = CardFeatureVectorCompute()
        self.thf = Trained_hand_feature()
        self.eps = 0
        self.game_count = 0

        ## a list to keep a record of all results
        self.results = []

    def declare_action(self, valid_actions, hole_card, round_state):
        # check whether either player has already raised 4 times this street;
        # flag is True -- raising is still allowed, otherwise it is not
        flag = True
        current_round = round_state['action_histories'][round_state['street']]
        uuid1 = round_state['seats'][0]['uuid']
        uuid2 = round_state['seats'][1]['uuid']
        # raise count for the current street
        raiseCount = collections.defaultdict(int)
        for action_details in current_round:
            # the big blind is also counted as a raise
            if action_details['action'] in ('RAISE', 'BIGBLIND'):
                raiseCount[action_details['uuid']] += 1
        if raiseCount[uuid1] >= 4 or raiseCount[uuid2] >= 4:
            flag = False

        # read features
        my_id = self.seat_id
        opp_id = 1 - my_id
        card_feature = self.cfvc.fetch_feature(
            Card_util.gen_cards(self.hole_card),
            Card_util.gen_cards(round_state['community_card']))
        card_feature = self.get_transferred_vec(card_feature)
        card_strength = np.dot(card_feature,
                               self.thf.get_strength(self.street_idx))

        my_stack = round_state['seats'][my_id]['stack']
        opp_stack = round_state['seats'][opp_id]['stack']
        my_bet = self.stack_record[my_id][1] - my_stack
        opp_bet = self.stack_record[opp_id][1] - opp_stack
        my_total_gain = self.total_gain[my_id]

        # build the feature vector shared by every possible action
        feature_vec = self.phi(card_strength, my_stack, opp_stack, my_bet,
                               opp_bet, my_total_gain)
        self.feature_vector = feature_vec

        # value of taking each action
        q_raise = np.dot(self.step_theta[self.street_idx]['raise'], feature_vec)
        q_call = np.dot(self.step_theta[self.street_idx]['call'], feature_vec)
        q_fold = np.dot(self.step_theta[self.street_idx]['fold'], feature_vec)
        # print('raise %10.6f, call %10.6f, fold %10.6f' % (q_raise, q_call, q_fold))
        self.q_suggest['raise'] = q_raise
        self.q_suggest['call'] = q_call
        self.q_suggest['fold'] = q_fold

        # turn the q values into a probability distribution over actions
        sig_raise = self.sigmoid(q_raise)
        sig_call = self.sigmoid(q_call)
        sig_fold = self.sigmoid(q_fold)
        sum_sig = sig_raise + sig_call + sig_fold
        self.probs['raise'] = sig_raise / sum_sig
        self.probs['call'] = sig_call / sum_sig
        self.probs['fold'] = sig_fold / sum_sig

        # choose the action
        next_action, probability = self.action_select_helper(valid_actions, flag)
        # print('next action: %s' % next_action)
        expected_reward = self.q_suggest[next_action]
        self.estimated_step_rewards.append([
            next_action, expected_reward, probability, self.street_idx,
            self.feature_vector
        ])
        return next_action  # action returned here is sent to the poker engine

    def receive_game_start_message(self, game_info):
        # initialise the stack record when the first round starts
        self.initial_stack = game_info['rule']['initial_stack']
        self.max_round = game_info['rule']['max_round']
        self.small_blind_amount = game_info['rule']['small_blind_amount']
        if game_info['seats'][0]['uuid'] == self.uuid:
            self.seat_id = 0
        else:
            self.seat_id = 1
        self.stack_record = [[self.initial_stack] * 2,
                             [self.initial_stack] * 2]
        self.game_count += 1
        # self.learn_factor = 0 if self.game_count > 10 else 0.01
        self.learn_factor = 0.01
        # self.learn_factor = np.floor(10 / self.game_count) / float(100)

    def receive_round_start_message(self, round_count, hole_card, seats):
        self.estimated_step_rewards = []
        self.total_gain = [
            self.stack_record[0][1] - self.initial_stack,
            self.stack_record[1][1] - self.initial_stack
        ]
        self.bet_has_placed = [0, 0]
        self.hole_card = hole_card
        # decide whether this round explores (random actions) or exploits
        r = np.random.rand()
        self.eps = self.epsilon(round_count)
        self.ifRandom = r < self.eps

    def receive_street_start_message(self, street, round_state):
        self.street_idx = self.street_map[round_state['street']]

    def receive_game_update_message(self, action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):
        # training happens at the end of every round
        self.stack_record = [
            [self.stack_record[0][1], round_state['seats'][0]['stack']],
            [self.stack_record[1][1], round_state['seats'][1]['stack']]
        ]
        true_reward = [
            self.stack_record[0][1] - self.stack_record[0][0],
            self.stack_record[1][1] - self.stack_record[1][0]
        ][self.seat_id]

        # backtrack every step taken during the round
        self.estimated_step_rewards = self.estimated_step_rewards[::-1]
        prob = 1
        for record in self.estimated_step_rewards:
            action = record[0]
            expected_reward = record[1]
            probability = record[2]
            step_idx = record[3]
            feature_vec = record[4]
            # print('action taken: %s, probability: %6.4f' % (action, prob))
            # update theta
            prob *= probability
            delta = np.multiply(
                feature_vec,
                (true_reward - expected_reward) * prob * self.learn_factor)
            # print('true reward: %6.3f, expected reward: %6.3f' % (true_reward, expected_reward))
            self.step_theta[step_idx][action] = np.add(
                self.step_theta[step_idx][action], delta)
        # self.pp.pprint(self.step_theta)
        self.accumulate += true_reward
        self.results.append(self.accumulate)
        # self.results.append(true_reward)

    def action_select_helper(self, valid_actions, flag):
        valid_acts = list(map(lambda x: x['action'], valid_actions))
        # remove 'raise' if raising is not allowed
        if not flag and 'raise' in valid_acts:
            valid_acts.remove('raise')
        action_to_choose = {x: self.q_suggest[x] for x in valid_acts}
        num_valid = len(valid_acts)
        assert (num_valid > 0)
        max_action = max(action_to_choose, key=action_to_choose.get)
        if self.ifRandom:
            # exploration: pick call/raise/fold with fixed probabilities
            r = np.random.rand()
            if r < 0.5:
                action = 'call'
            elif r < 0.9 and num_valid == 3:
                action = 'raise'
            else:
                # print('here')
                action = 'fold'
            return action, self.probs[action]
        else:
            return max_action, self.probs[max_action]

    def theta_single_step(self, length):
        return {
            'raise': np.zeros(length),
            'call': np.zeros(length),
            'fold': np.zeros(length)
        }

    def phi(self, hand_strength, my_stack, opp_stack, my_bet, opp_bet,
            my_total_gain):
        return np.array([
            1,
            hand_strength,
            # self.diff_normal(my_bet, opp_bet),
            (my_bet - opp_bet) / float(10),
            self.diff_normal(my_stack, opp_stack),
            self.diff_normal(my_total_gain, -my_total_gain)
        ])

    def sigmoid(self, x):
        return float(1) / (1 + np.exp(-x))

    def diff_normal(self, x, y):
        if y == 0:
            return 1 if x > 0 else -1
        return self.sigmoid(float(x - y) / np.abs(y))

    def epsilon(self, round_count):
        # self.eps = float(1) / round_count
        # self.eps = float(1) / ((self.game_count - 1) * self.max_round + round_count)
        # self.eps = 0.1 + 0.9 * float(1) / round_count
        self.eps = 0.1
        return self.eps

    def get_result(self):
        # i = 0
        # avg = []
        # while i <= len(self.results) - part_size:
        #     total = 0
        #     for j in range(part_size):
        #         total += self.results[i + j]
        #     avg.append(float(total) / part_size)
        #     i += part_size
        # return avg
        return self.results

    def get_transferred_vec(self, vec):
        # map each binary feature in {0, 1} to {-1, 1}
        vfunc_f = np.vectorize(self.zero_to_minus_one)
        return vfunc_f(vec)

    @staticmethod
    def zero_to_minus_one(a):
        return -1 if a < 0.5 else 1
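

# Illustrative sketch of training RTPlayer by heads-up self-play. It assumes the
# standard pypokerengine game API (setup_config / register_player / start_poker);
# the stack, blind and round values and the helper name are arbitrary examples
# rather than settings taken from this repo.
def _demo_rt_player_selfplay():
    from pypokerengine.api.game import setup_config, start_poker

    trainee = RTPlayer()
    opponent = RTPlayer()
    config = setup_config(max_round=100, initial_stack=1000,
                          small_blind_amount=10)
    config.register_player(name="rt_trainee", algorithm=trainee)
    config.register_player(name="rt_opponent", algorithm=opponent)
    start_poker(config, verbose=0)
    # cumulative gains recorded round by round in receive_round_result_message
    return trainee.get_result()

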
def declare_action(self, valid_actions, hole_card, round_state):
    # check whether either player has already raised 4 times this street;
    # flag is True -- raising is still allowed, otherwise it is not
    flag = True
    current_round = round_state['action_histories'][round_state['street']]
    uuid1 = round_state['seats'][0]['uuid']
    uuid2 = round_state['seats'][1]['uuid']
    # raise count for the current street
    raiseCount = collections.defaultdict(int)
    for action_details in current_round:
        # the big blind is also counted as a raise
        if action_details['action'] in ('RAISE', 'BIGBLIND'):
            raiseCount[action_details['uuid']] += 1
    if raiseCount[uuid1] >= 4 or raiseCount[uuid2] >= 4:
        flag = False

    cfvc = CardFeatureVectorCompute()
    thf = Trained_hand_feature()
    card_feature = cfvc.fetch_feature(
        Card_util.gen_cards(hole_card),
        Card_util.gen_cards(round_state['community_card']))
    card_strength = np.dot(
        card_feature, thf.get_strength(self.street_map[round_state['street']]))

    # def act(self, theta, card_strength, isMe, my_stack, opponent_stack, curr_action, epsilon):
    isMe = True
    my_stack = round_state['seats'][self.seat_id]['stack']
    opponent_stack = round_state['seats'][1 - self.seat_id]['stack']

    # get the feature vector for every possible action
    phiRAISE = self.phi(card_strength, isMe, my_stack, opponent_stack, 'raise')
    phiCALL = self.phi(card_strength, isMe, my_stack, opponent_stack, 'call')
    phiFOLD = self.phi(card_strength, isMe, my_stack, opponent_stack, 'fold')

    # value of taking each action
    qRAISE = self.evalModel(self.theta, phiRAISE)
    qCALL = self.evalModel(self.theta, phiCALL)
    qFOLD = self.evalModel(self.theta, phiFOLD)

    # choose the action with the highest value as the next action;
    # ties are broken in the order 'raise' > 'call' > 'fold'
    if qRAISE >= np.amax([qCALL, qFOLD]):
        next_action = 'raise'
    elif qCALL >= np.amax([qRAISE, qFOLD]):
        next_action = 'call'
    else:
        next_action = 'fold'

    # actions in valid_actions other than the current 'next_action'
    remain_actions = []
    for act in valid_actions:
        if act['action'] != next_action:
            remain_actions.append(act['action'])
    assert len(remain_actions) in [1, 2]

    if np.random.rand() < self.epsilon(round_state['round_count']) / 2 \
            or (not flag and next_action == 'raise') \
            or (len(valid_actions) == 2 and next_action == 'raise'):
        # Conditions for choosing the next action randomly:
        # 1. random number < epsilon / 2
        # 2. next_action is 'raise' but there have already been 4 raises, so raising is not allowed
        # 3. next_action is not in valid_actions (there are only two kinds of valid_action set: with/without 'raise')
        # if any of these holds, next_action is chosen randomly from remain_actions
        if len(remain_actions) == 1:
            # only 1 action remains, choose it by default
            next_action = remain_actions[0]
        else:
            # 2 actions remain, choose between them uniformly at random
            if np.random.rand() < 0.5:
                next_action = remain_actions[0]
            else:
                next_action = remain_actions[1]

    # next_action is finalised, store the 'q' and 'phi'
    if next_action == 'raise':
        prob = qRAISE / (qRAISE + qCALL + qFOLD)
        self.estimated_step_rewards.append([next_action, qRAISE, phiRAISE, prob])
    elif next_action == 'call':
        prob = qCALL / (qRAISE + qCALL + qFOLD)
        self.estimated_step_rewards.append([next_action, qCALL, phiCALL, prob])
    else:
        prob = qFOLD / (qRAISE + qCALL + qFOLD)
        self.estimated_step_rewards.append([next_action, qFOLD, phiFOLD, prob])
    # print(next_action)
    return next_action  # action returned here is sent to the poker engine
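

# This alternate declare_action relies on helpers defined elsewhere on its class
# (self.theta, a phi(card_strength, isMe, my_stack, opponent_stack, action) variant,
# self.evalModel and self.epsilon) that are not shown in this section. Below is a
# minimal guess at evalModel, assuming a plain linear value model q = theta . phi;
# the repo's actual implementation may differ.
def evalModel(self, theta, phi_vec):
    return np.dot(theta, phi_vec)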