def validate_matches(match_list):
    """
    Checks if the match data for each element of match_list is valid.
    Args:
        match_list (list(match)): list of match data to validate
    """
    match_count = 0
    for match in match_list:
        match_count += 1
        print("Match {}".format(match_count))
        # Processing the match from both team perspectives will surface malformed data
        mp.process_match(match, DraftState.BLUE_TEAM)
        mp.process_match(match, DraftState.RED_TEAM)
    return None
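# Note (hedged): throughout this file, mp.process_match(match, team) is treated
# as returning a list of experience tuples of the form
#     (state, (champion_id, position), reward, next_state)
# where champion_id may be None for missing/skipped bans. A minimal inspection
# sketch, assuming `match` is one entry of an already-loaded match list:
#
# for state, (cid, pos), reward, next_state in mp.process_match(match, DraftState.BLUE_TEAM):
#     print(cid, pos, reward, next_state.evaluate())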
def fill_buffer(self, data, buf):
    for match in data:
        for team in self.teams:
            experiences = mp.process_match(match, team)
            # Remove null actions (usually missing bans)
            for exp in experiences:
                _, act, _, _ = exp
                cid, pos = act
                if cid is not None:
                    buf.store([exp])
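# Usage sketch (hedged): pre-filling a replay buffer before training begins.
# `trainer` and `training_matches` are illustrative names only; er.ExperienceBuffer
# and get_buffer_size() are the same buffer interface used elsewhere in this file.
#
# warmup_buffer = er.ExperienceBuffer(10 * len(training_matches))
# trainer.fill_buffer(training_matches, warmup_buffer)
# print("buffered {} experiences".format(warmup_buffer.get_buffer_size()))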
def score_match(sess, Qnet, match, team):
    """
    Generates an estimated performance score for a team using a specified Qnetwork.
    Args:
        sess (tensorflow Session): TF Session to run model in
        Qnet (qNetwork): tensorflow q network used to score draft
        match (dict): match dictionary with pick and ban data
        team (DraftState.BLUE_TEAM or DraftState.RED_TEAM): team perspective that is being scored
    Returns:
        score (float): estimated value of picks made in the draft submitted by team for this match
    """
    score = 0.
    actions = []
    states = []
    secondary_inputs = []
    experiences = mp.process_match(match, team)
    for exp in experiences:
        start, (cid, pos), _, _ = exp
        if cid is None:
            # Ignore missing bans (if present)
            continue
        actions.append(start.get_action(cid, pos))
        states.append(start.format_state())
        secondary_inputs.append(start.format_secondary_inputs())

    # Feed states forward and get scores for submitted actions
    predicted_Q = sess.run(Qnet.outQ,
                           feed_dict={
                               Qnet.input: np.stack(states, axis=0),
                               Qnet.secondary_input: np.stack(secondary_inputs, axis=0)
                           })
    assert len(actions) == predicted_Q.shape[0], "Number of actions doesn't match number of Q estimates!"
    for i in range(len(actions)):
        score += predicted_Q[i, actions[i]]
    return score
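# Usage sketch (hedged): score_match can compare the two draft perspectives of a
# single match to predict a winner, mirroring the accuracy loop of validate_model
# later in this file. `sess`, `online_net`, and `match` are assumed to already
# exist (session open, network trained or restored).
#
# blue_score = score_match(sess, online_net, match, DraftState.BLUE_TEAM)
# red_score = score_match(sess, online_net, match, DraftState.RED_TEAM)
# predicted_winner = DraftState.BLUE_TEAM if blue_score >= red_score else DraftState.RED_TEAM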
    DraftState.BLUE_TEAM: [0, 1, 4, 6, 8],
    DraftState.RED_TEAM: [0, 1, 3, 6]
}
targets = [10, 10, 10, 9, 8, 7, 6, 6, 6, 5]
for match in matches:
    # if(specific_team):
    #     team = DraftState.RED_TEAM if match["red_team"]==specific_team else DraftState.BLUE_TEAM
    # else:
    #     team = DraftState.RED_TEAM if match["winner"]==1 else DraftState.BLUE_TEAM
    # teams = [DraftState.BLUE_TEAM, DraftState.RED_TEAM]
    teams = [DraftState.RED_TEAM if match["winner"] == 1 else DraftState.BLUE_TEAM]
    for team in teams:
        experiences = mp.process_match(match, team, augment_data=False)
        print("")
        print("Match: {:2} {:6} vs {:6} winner: {:2}".format(
            count, match["blue_team"], match["red_team"], match["winner"]))
        for pick_count, exp in enumerate(experiences):
            print(" === ")
            print(" Match {}, Pick {}".format(count, pick_count))
            print(" === ")
            state, act, rew, next_state = exp
            cid, pos = act
            if cid is None:
                continue
            predicted_q_values = model.predict([state])
            predicted_q_values = predicted_q_values[0, :]
#plt.ylim([0,2])
fig_name = "tmp/loss_figures/annuled_rate/loss_E{}_run_{}.pdf".format(n_epoch, i + 1)
print("Loss figure saved in: {}".format(fig_name), flush=True)
fig.savefig(fig_name)

fig = plt.figure()
plt.plot(x, summaries["train_acc"], x, summaries["val_acc"])
fig_name = "tmp/acc_figs/acc_E{}_run_{}.pdf".format(n_epoch, i + 1)
print("Accuracy figure saved in: {}".format(fig_name), flush=True)
fig.savefig(fig_name)

# Look at predicted Q values for states in a randomly drawn match
match = random.sample(training_matches, 1)[0]
team = DraftState.RED_TEAM if match["winner"] == 1 else DraftState.BLUE_TEAM
experiences = mp.process_match(match, team)
count = 0

# x labels for q val plots
xticks = []
xtick_locs = []
for a in range(state.num_actions):
    cid, pos = state.format_action(a)
    if cid not in xticks:
        xticks.append(cid)
        xtick_locs.append(a)
xtick_labels = [cinfo.champion_name_from_id(cid)[:6] for cid in xticks]

tf.reset_default_graph()
path_to_model = "tmp/ddqn_model_E45"  # "tmp/model_E{}".format(n_epoch)
model = QNetInferenceModel(name="infer", path=path_to_model)
for exp in experiences:
def validate_model(sess, validation_data, online_net, target_net):
    """
    Validates given model by computing loss and absolute accuracy for validation data using current Qnet estimates.
    Args:
        sess (tensorflow Session): TF Session to run model in
        validation_data (list(dict)): list of matches to validate against
        online_net (qNetwork): "live" Q-network to be validated
        target_net (qNetwork): target Q-network used to generate target values
    Returns:
        stats (tuple(float)): list of statistical measures of performance. stats = (loss,acc)
    """
    val_replay = er.ExperienceBuffer(10 * len(validation_data))
    for match in validation_data:
        # Loss is only computed for winning side of drafts
        team = DraftState.RED_TEAM if match["winner"] == 1 else DraftState.BLUE_TEAM
        # Process match into individual experiences
        experiences = mp.process_match(match, team)
        for exp in experiences:
            _, act, _, _ = exp
            (cid, pos) = act
            if cid is None:
                # Skip null actions such as missing/skipped bans
                continue
            val_replay.store([exp])

    n_experiences = val_replay.get_buffer_size()
    val_experiences = val_replay.sample(n_experiences)
    state, _, _, _ = val_experiences[0]
    val_states = np.zeros((n_experiences,) + state.format_state().shape)
    val_secondary_inputs = np.zeros((n_experiences,) + state.format_secondary_inputs().shape)
    val_actions = np.zeros((n_experiences,))
    val_targets = np.zeros((n_experiences,))
    for n in range(n_experiences):
        start, act, rew, finish = val_experiences[n]
        val_states[n, :, :] = start.format_state()
        val_secondary_inputs[n, :] = start.format_secondary_inputs()
        (cid, pos) = act
        val_actions[n] = start.get_action(cid, pos)
        state_code = finish.evaluate()
        if (state_code == DraftState.DRAFT_COMPLETE
                or state_code in DraftState.invalid_states):
            # Action moves to terminal state
            val_targets[n] = rew
        else:
            # Each row in predicted_Q gives estimated Q(s',a) values for each possible action for the input state s'.
            predicted_Q = sess.run(target_net.outQ,
                                   feed_dict={
                                       target_net.input: [finish.format_state()],
                                       target_net.secondary_input: [finish.format_secondary_inputs()]
                                   })
            # To get max_{a} Q(s',a), take the max along each row of predicted_Q.
            max_Q = np.max(predicted_Q, axis=1)[0]
            val_targets[n] = (rew + online_net.discount_factor * max_Q)

    loss, pred_actions = sess.run(
        [online_net.loss, online_net.prediction],
        feed_dict={
            online_net.input: val_states,
            online_net.secondary_input: val_secondary_inputs,
            online_net.actions: val_actions,
            online_net.target: val_targets
        })

    accurate_predictions = 0.
    for match in validation_data:
        blue_score = score_match(sess, online_net, match, DraftState.BLUE_TEAM)
        red_score = score_match(sess, online_net, match, DraftState.RED_TEAM)
        predicted_winner = DraftState.BLUE_TEAM if blue_score >= red_score else DraftState.RED_TEAM
        match_winner = DraftState.RED_TEAM if match["winner"] == 1 else DraftState.BLUE_TEAM
        if predicted_winner == match_winner:
            accurate_predictions += 1
    val_accuracy = accurate_predictions / len(validation_data)
    return (loss, val_accuracy)
def train_network(online_net,
                  target_net,
                  training_matches,
                  validation_matches,
                  train_epochs,
                  batch_size,
                  buffer_size,
                  dampen_states=False,
                  load_model=False,
                  verbose=False):
    """
    Trains the Q-network Qnet in batches using experience replays.
    Args:
        online_net (qNetwork): "live" Q-network to be trained.
        target_net (qNetwork): target Q-network used to generate target values for the online network
        training_matches (list(match)): list of matches to be trained on
        validation_matches (list(match)): list of matches to validate model against
        train_epochs (int): number of times to learn on given data
        batch_size (int): size of each training set sampled from the replay buffer which will be used to update Qnet at a time
        buffer_size (int): size of replay buffer used
        dampen_states (bool): flag for running dampening routine on model
        load_model (bool): flag to reload existing model
        verbose (bool): flag for enhanced output
    Returns:
        (loss, validation_accuracy) tuple
    """
    num_episodes = len(training_matches)
    if verbose:
        print("***")
        print("Beginning training..")
        print(" train_epochs: {}".format(train_epochs))
        print(" num_episodes: {}".format(num_episodes))
        print(" batch_size: {}".format(batch_size))
        print(" buffer_size: {}".format(buffer_size))
    if dampen_states:
        print(" ********************************")
        print(" WARNING: BEGINNING DAMPENING CYCLES")
        print(" THIS SHOULD ONLY BE USED TO REDUCE VALUATION FOR OLDER METAS")
        print(" ********************************")
        time.sleep(2.)

    # Hyperparameter used in updating target network
    # Some notable values:
    #   tau = 1.e-3 -> used in original paper
    #   tau = 0.5   -> average DDQN
    #   tau = 1.0   -> copy online -> target
    tau = 1.
    target_update_frequency = 10000  # How often to update target network. Should only be used with tau = 1.
    stash_model = True  # Flag for stashing a copy of the model
    model_stash_interval = 10  # Stashes a copy of the model this often

    # Number of steps to take before training. Allows buffer to partially fill.
    # Must be at least batch_size to avoid error when sampling from experience replay
    pre_training_steps = 10 * batch_size
    assert (pre_training_steps <= buffer_size), "Replay not large enough for pre-training!"
    assert (pre_training_steps >= batch_size), "Buffer not allowed to fill enough before sampling!"
    # Number of steps to force learner to observe submitted actions, rather than submit its own actions
    observations = 2000
    epsilon = 0.5  # Initial probability of letting the learner submit its own action
    eps_decay_rate = 1. / (25 * 20 * len(training_matches))  # Rate at which epsilon decays per submission
    # Number of steps to take between training
    update_freq = 1  # There are 10 submissions per match per side
    overwrite_initial_lr = 2.0e-5  # Overwrite default lr for network
    lr_decay_freq = 5  # Decay learning rate after a set number of epochs
    min_learning_rate = 1.e-8  # Minimum learning rate allowed to decay to

    teams = [DraftState.BLUE_TEAM, DraftState.RED_TEAM]
    # We can't validate a winner for submissions generated by the learner,
    # so we will use a winner-less match when getting rewards for such states
    blank_match = {"winner": None}

    loss_over_epochs = []
    total_steps = 0
    # Start training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if load_model:
            # Open saved model
            path_to_model = "tmp/model_E{}.ckpt".format(25)
            #path_to_model = "model_predictions/play_ins_rd2/model_play_ins_rd2.ckpt"
            online_net.saver.restore(sess, path_to_model)
            print("\nCheckpoint loaded from {}".format(path_to_model))
            if overwrite_initial_lr:
                online_net.learning_rate.assign(overwrite_initial_lr).eval()

        # Add target init and update operations to graph
        target_init = create_target_initialization_ops(target_net.name, online_net.name)
        target_update = create_target_update_ops(target_net.name, online_net.name, tau)
        # Initialize target network
        sess.run(target_init)

        # Get initial loss and accuracy estimates
        val_loss, val_acc = validate_model(sess, validation_matches, online_net, target_net)
        loss, train_acc = validate_model(sess, training_matches, online_net, target_net)
        print(" Initial loss {:.6f}, train {:.6f}, val {:.6f}".format(loss, train_acc, val_acc),
              flush=True)

        # Initialize experience replay buffer
        experience_replay = er.ExperienceBuffer(buffer_size)
        for i in range(train_epochs):
            t0 = time.time()
            if (i > 0 and i % lr_decay_freq == 0
                    and online_net.learning_rate.eval() >= min_learning_rate):
                # Decay learning rate according to decay schedule
                online_net.learning_rate.assign(0.50 * online_net.learning_rate.eval()).eval()

            epoch_steps = 0
            bad_state_counts = {
                "wins": {
                    DraftState.BAN_AND_SUBMISSION: 0,
                    DraftState.DUPLICATE_SUBMISSION: 0,
                    DraftState.DUPLICATE_ROLE: 0,
                    DraftState.INVALID_SUBMISSION: 0,
                    DraftState.TOO_MANY_BANS: 0,
                    DraftState.TOO_MANY_PICKS: 0
                },
                "loss": {
                    DraftState.BAN_AND_SUBMISSION: 0,
                    DraftState.DUPLICATE_SUBMISSION: 0,
                    DraftState.DUPLICATE_ROLE: 0,
                    DraftState.INVALID_SUBMISSION: 0,
                    DraftState.TOO_MANY_BANS: 0,
                    DraftState.TOO_MANY_PICKS: 0
                }
            }
            learner_submitted_counts = 0
            null_action_count = 0

            # Shuffle match presentation order
            shuffled_matches = random.sample(training_matches, len(training_matches))

            # Run model through a self-training iteration, including exploration
            experiences = self_train(sess, epsilon, n_experiences=20)
            # If self training results in illegal states, add it to memory
            if experiences:
                print("adding {} self-trained experiences..".format(len(experiences)))
                # for exp in experiences:
                #     _,_,r,_ = exp
                #     print("reward (should be negative) = {}".format(r))
                experience_replay.store(experiences)
                learner_submitted_counts += len(experiences)

            for match in shuffled_matches:
                for team in teams:
                    # Process match into individual experiences
                    experiences = mp.process_match(match, team)
                    for experience in experiences:
                        # Some experiences include NULL submissions (usually missing bans).
                        # The learner isn't allowed to submit NULL picks so skip adding these to the buffer.
                        state, actual, _, _ = experience
                        (cid, pos) = actual
                        if cid is None:
                            null_action_count += 1
                            continue
                        # Store original experience
                        experience_replay.store([experience])

                        if total_steps >= observations:
                            # Let the network predict the next action; if the action leads to an invalid
                            # state, add a negatively reinforced experience to the replay buffer.
                            random_submission = False
                            if random.random() < epsilon:
                                random_submission = True
                                # Explore state space by submitting a random action and checking if that action is legal
                                pred_act = [random.randint(0, state.num_actions - 1)]
                            else:
                                # Let model make prediction
                                pred_Q = sess.run(online_net.outQ,
                                                  feed_dict={
                                                      online_net.input: [state.format_state()],
                                                      online_net.secondary_input: [state.format_secondary_inputs()]
                                                  })
                                sorted_actions = pred_Q[0, :].argsort()[::-1]
                                pred_act = sorted_actions[0:4]  # top four actions by model
                            top_action = pred_act[0]
                            for action in pred_act:
                                (cid, pos) = state.format_action(action)
                                pred_state = deepcopy(state)
                                pred_state.update(cid, pos)
                                state_code = pred_state.evaluate()
                                r = get_reward(pred_state, blank_match, (cid, pos), actual)
                                new_experience = (state, (cid, pos), r, pred_state)

                                if state_code in DraftState.invalid_states:
                                    # Prediction moves to illegal state, add negative experience
                                    if team == match["winner"]:
                                        bad_state_counts["wins"][state_code] += 1
                                    else:
                                        bad_state_counts["loss"][state_code] += 1
                                    experience_replay.store([new_experience])
                                elif (not random_submission and (cid, pos) != actual
                                      and action == top_action):
                                    # Add memories for "best" legal submission if it was chosen by the model
                                    # and does not duplicate an already submitted memory
                                    learner_submitted_counts += 1
                                    experience_replay.store([new_experience])

                        if epsilon > 0.1:
                            # Reduce epsilon over time
                            epsilon -= eps_decay_rate

                        total_steps += 1
                        epoch_steps += 1

                        # Every update_freq steps we train the network using samples from the replay buffer
                        if (total_steps >= pre_training_steps and total_steps % update_freq == 0):
                            training_batch = experience_replay.sample(batch_size)

                            # Calculate target Q values for each example:
                            # For non-terminal states, targetQ is estimated according to
                            #     targetQ = r + gamma*Q'(s',max_a Q(s',a))
                            # where Q' denotes the target network.
                            # For terminating states the target is computed as
                            #     targetQ = r
                            updates = []
                            for exp in training_batch:
                                startState, _, reward, endingState = exp
                                if dampen_states:
                                    # To dampen states (usually done after major patches or when the meta shifts)
                                    # we replace winning rewards with 0. (essentially a loss).
                                    reward = 0.
                                state_code = endingState.evaluate()
                                if (state_code == DraftState.DRAFT_COMPLETE
                                        or state_code in DraftState.invalid_states):
                                    # Action moves to terminal state
                                    updates.append(reward)
                                else:
                                    # Following the double DQN paper (https://arxiv.org/abs/1509.06461):
                                    # the action is chosen by the online network, but the target network is used to evaluate this policy.
                                    # Each row in predicted_Q gives estimated Q(s',a) values for all possible actions for the input state s'.
                                    predicted_action = sess.run(
                                        online_net.prediction,
                                        feed_dict={
                                            online_net.input: [endingState.format_state()],
                                            online_net.secondary_input: [endingState.format_secondary_inputs()]
                                        })[0]
                                    predicted_Q = sess.run(
                                        target_net.outQ,
                                        feed_dict={
                                            target_net.input: [endingState.format_state()],
                                            target_net.secondary_input: [endingState.format_secondary_inputs()]
                                        })
                                    updates.append(reward + online_net.discount_factor * predicted_Q[0, predicted_action])

                            targetQ = np.array(updates)
                            targetQ.shape = (batch_size,)

                            # Update online net using target Q.
                            # Experience replay stores action = (champion_id, position) pairs;
                            # these need to be converted into the corresponding index of the input vector to the Qnet
                            actions = np.array([exp[0].get_action(*exp[1]) for exp in training_batch])
                            _ = sess.run(
                                online_net.update,
                                feed_dict={
                                    online_net.input: np.stack([exp[0].format_state() for exp in training_batch], axis=0),
                                    online_net.secondary_input: np.stack([exp[0].format_secondary_inputs() for exp in training_batch], axis=0),
                                    online_net.actions: actions,
                                    online_net.target: targetQ,
                                    online_net.dropout_keep_prob: 0.5
                                })

                        if total_steps % target_update_frequency == 0:
                            # After the online network has been updated, update target network
                            _ = sess.run(target_update)

            t1 = time.time() - t0
            val_loss, val_acc = validate_model(sess, validation_matches, online_net, target_net)
            loss, train_acc = validate_model(sess, training_matches, online_net, target_net)
            loss_over_epochs.append(loss)
            # Save the updated network (the same checkpoint is overwritten each epoch)
            out_path = online_net.saver.save(sess, "tmp/model_E{}.ckpt".format(train_epochs))
            if verbose:
                print(" Finished epoch {}/{}: dt {:.2f}, mem {}, loss {:.6f}, train {:.6f}, val {:.6f}".format(
                    i + 1, train_epochs, t1, epoch_steps, loss, train_acc, val_acc),
                      flush=True)
                print(" alpha:{:.4e}".format(online_net.learning_rate.eval()))
                invalid_action_count = sum([
                    bad_state_counts["wins"][k] + bad_state_counts["loss"][k]
                    for k in bad_state_counts["wins"]
                ])
                print(" negative memories added = {}".format(invalid_action_count))
                print(" bad state distributions:")
                print(" from wins: {:9} from losses:".format(""))
                for code in bad_state_counts["wins"]:
                    print(" {:3} -> {:3} counts {:2} {:3} -> {:3} counts".format(
                        code, bad_state_counts["wins"][code], "", code, bad_state_counts["loss"][code]))
                print(" learner submissions: {}".format(learner_submitted_counts))
                print(" model is saved in file: {}".format(out_path))
                print("***", flush=True)

            if stash_model:
                if i > 0 and (i + 1) % model_stash_interval == 0:
                    # Stash a copy of the current model
                    out_path = online_net.saver.save(sess, "tmp/models/model_E{}.ckpt".format(i + 1))
                    print("Stashed a copy of the current model in {}".format(out_path))

    stats = (loss_over_epochs, train_acc)
    return stats
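# Usage sketch (hedged): a minimal driver for train_network. The match-loading
# helper and qNetwork constructor arguments are assumptions; only the
# train_network signature above is taken from this file, and the hyperparameter
# values shown are illustrative.
#
# training_matches, validation_matches = load_matches(...)   # hypothetical helper
# online_net = qNetwork(name="online", ...)                  # constructed elsewhere
# target_net = qNetwork(name="target", ...)
# loss_over_epochs, train_acc = train_network(
#     online_net, target_net,
#     training_matches, validation_matches,
#     train_epochs=25, batch_size=32, buffer_size=30000,
#     load_model=False, verbose=True)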
def validate_model(self, data):
    """
    Validates given model by computing loss and absolute accuracy for data using current Qnet.
    Args:
        data (list(dict)): list of matches to validate against
    Returns:
        stats (tuple(float)): list of statistical measures of performance. stats = (loss,acc)
    """
    buf = []
    for match in data:
        # Loss is only computed for winning side of drafts
        team = DraftState.RED_TEAM if match["winner"] == 1 else DraftState.BLUE_TEAM
        # Process match into individual experiences
        experiences = mp.process_match(match, team)
        for exp in experiences:
            _, act, _, _ = exp
            (cid, pos) = act
            if cid is None:
                # Skip null actions such as missing/skipped bans
                continue
            buf.append(exp)
    n_exp = len(buf)

    targets = []
    for exp in buf:
        start, _, reward, end = exp
        state_code = end.evaluate()
        if (state_code == DraftState.DRAFT_COMPLETE
                or state_code in DraftState.invalid_states):
            # Action moves to terminal state
            targets.append(reward)
        else:
            # Double DQN target: action is chosen by the online network, value comes from the target network
            feed_dict = {self.ddq_net.online_ops["input"]: [end.format_state()],
                         self.ddq_net.online_ops["valid_actions"]: [end.get_valid_actions()]}
            predicted_action = self.ddq_net.sess.run(self.ddq_net.online_ops["prediction"], feed_dict=feed_dict)[0]

            feed_dict = {self.ddq_net.target_ops["input"]: [end.format_state()]}
            predicted_Q = self.ddq_net.sess.run(self.ddq_net.target_ops["outQ"], feed_dict=feed_dict)
            targets.append(reward + self.ddq_net.discount_factor * predicted_Q[0, predicted_action])

    actions = np.array([exp[0].get_action(*exp[1]) for exp in buf])
    targets = np.array(targets)

    feed_dict = {self.ddq_net.online_ops["input"]: np.stack([exp[0].format_state() for exp in buf], axis=0),
                 self.ddq_net.online_ops["actions"]: actions,
                 self.ddq_net.online_ops["target"]: targets,
                 self.ddq_net.online_ops["valid_actions"]: np.stack([exp[0].get_valid_actions() for exp in buf], axis=0)}
    loss, pred_q = self.ddq_net.sess.run([self.ddq_net.online_ops["loss"], self.ddq_net.online_ops["valid_outQ"]],
                                         feed_dict=feed_dict)

    # A prediction is counted as accurate if the submitted action appears in the model's top rank_tolerance actions
    accurate_predictions = 0
    rank_tolerance = 5
    for n in range(n_exp):
        state, act, _, _ = buf[n]
        submitted_action_id = state.get_action(*act)
        rows = [(a, pred_q[n, a]) for a in range(pred_q.shape[1])]
        df = pd.DataFrame(rows, columns=['act_id', 'Q'])
        df.sort_values('Q', ascending=False, inplace=True)
        df.reset_index(drop=True, inplace=True)
        df['rank'] = df.index
        submitted_row = df[df['act_id'] == submitted_action_id]
        rank = submitted_row['rank'].iloc[0]
        if rank < rank_tolerance:
            accurate_predictions += 1
    accuracy = accurate_predictions / n_exp
    return (loss, accuracy)
def train_epoch(self):
    """
    Training loop for a single epoch
    """
    # We can't validate a winner for submissions generated by the learner,
    # so we will use a winner-less match when getting rewards for such states
    blank_match = {"winner": None}

    learner_submitted_actions = 0
    null_actions = 0

    # Shuffle match presentation order
    shuffled_matches = random.sample(self.training_data, len(self.training_data))
    for match in shuffled_matches:
        for team in self.teams:
            # Process match into individual experiences
            experiences = mp.process_match(match, team)
            for pick_id, experience in enumerate(experiences):
                # Some experiences include NULL submissions (usually missing bans).
                # The learner isn't allowed to submit NULL picks so skip adding these to the buffer.
                state, actual, _, _ = experience
                (cid, pos) = actual
                if cid is None:
                    null_actions += 1
                    continue
                # Store original experience
                self.replay.store([experience])
                self.step_count += 1

                # Give model feedback on current estimations
                if self.step_count > self.observations:
                    # Let the network predict the next action
                    feed_dict = {self.ddq_net.online_ops["input"]: [state.format_state()],
                                 self.ddq_net.online_ops["valid_actions"]: [state.get_valid_actions()]}
                    q_vals = self.ddq_net.sess.run(self.ddq_net.online_ops["valid_outQ"], feed_dict=feed_dict)
                    sorted_actions = q_vals[0, :].argsort()[::-1]
                    top_actions = sorted_actions[0:4]

                    if random.random() < self.epsilon:
                        # Explore by sampling one of the model's top-ranked actions
                        pred_act = random.sample(list(top_actions), 1)
                    else:
                        # Use model's top prediction
                        pred_act = [sorted_actions[0]]

                    for action in pred_act:
                        (cid, pos) = state.format_action(action)
                        if (cid, pos) != actual:
                            pred_state = deepcopy(state)
                            pred_state.update(cid, pos)
                            r = get_reward(pred_state, blank_match, (cid, pos), actual)
                            new_experience = (state, (cid, pos), r, pred_state)
                            self.replay.store([new_experience])
                            learner_submitted_actions += 1

                if self.epsilon > 0.1:
                    # Reduce epsilon over time
                    self.epsilon -= self.eps_decay_rate

                # Use minibatch sample to update online network
                if self.step_count > self.pre_training_steps:
                    self.train_step()

                if self.step_count % self.target_update_frequency == 0:
                    # After the online network has been updated, update target network
                    _ = self.ddq_net.sess.run(self.ddq_net.target_ops["target_update"])

    # Get training loss, training_acc, and val_acc to return
    loss, train_acc = self.validate_model(self.training_data)
    _, val_acc = self.validate_model(self.validation_data)
    return (loss, train_acc, val_acc)
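# Usage sketch (hedged): an outer loop over train_epoch. The trainer instance and
# n_epochs value are illustrative; train_epoch itself returns
# (loss, train_acc, val_acc) as defined above.
#
# for epoch in range(n_epochs):
#     loss, train_acc, val_acc = trainer.train_epoch()
#     print("epoch {}: loss {:.6f}, train {:.6f}, val {:.6f}".format(
#         epoch + 1, loss, train_acc, val_acc), flush=True)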