def RGOT(G, number_of_turns=1500, number_of_arms=10,
         arms_behavior="Bernoulli",
         policies=["Epsilon_greedy", "UCB",
                   "Epsilon_z_greedy", "UCB_z",
                   "Epsilon_soft_greedy", "UCB_soft",
                   "variable_pool"]):  # set the policies you want to play
    arms = create_arms(number_of_arms, arms_behavior, policies)
    rewards_history = create_reward_history(number_of_turns, policies)
    tradeoff_history = create_tradeoff_history(number_of_turns, policies)
    # pool_size = [0]*number_of_turns
    for policy in policies:
        arms = initialize_mean_reward(arms, G, rewards_history,
                                      tradeoff_history, policy)  ## add option to not initialize
    for t in range(number_of_arms, number_of_turns):
        for policy in policies:
            best_arm_so_far = get_best_estimate_arm_index(arms, policy)
            z = compute_z(G)  # may depend on policy
            arm_to_play, tradeoff = choose_arm_and_tradeoff(
                t, policy, arms, best_arm_so_far, G, z)
            x_t = get_reward(arms[arm_to_play])  # reward for the arm played
            rewards_history[policy][t] = x_t * G[t]  # actual reward you get modified by the greed function
            tradeoff_history[policy][t] = tradeoff * x_t * G[t]
            update_arm(arms[arm_to_play], x_t, t, policy)  # update the arm performance under this policy
    return pd.DataFrame(arms), rewards_history, tradeoff_history

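# --- Hedged illustration (not from the project) -------------------------------
# RGOT() above delegates arm selection to helpers (choose_arm_and_tradeoff,
# compute_z, update_arm, ...) defined elsewhere. The sketch below is a minimal,
# self-contained epsilon-greedy step showing the generic explore/exploit pattern
# a policy such as "Epsilon_greedy" is assumed to follow; all names and
# parameters here are illustrative, not the project's API.
import random

def epsilon_greedy_choice(mean_estimates, epsilon=0.1):
    """Return an arm index: random with probability epsilon, otherwise the greedy arm."""
    if random.random() < epsilon:
        return random.randrange(len(mean_estimates))  # explore
    return max(range(len(mean_estimates)), key=lambda i: mean_estimates[i])  # exploit

def update_running_mean(mean, count, reward):
    """Incremental mean update after observing one reward from the played arm."""
    count += 1
    mean += (reward - mean) / count
    return mean, count
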
def main(kc=AGENT_PARAMS["KC"]):
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    controller = P_controller(environment, AGENT_PARAMS, kc)
    init_h = TANK_PARAMS["init_level"] * TANK_PARAMS["height"]
    h = [init_h]
    z = [AGENT_PARAMS["INIT_POSITION"]]
    d = [TANK_DIST["nom_flow"]]
    reward = []
    max_time = MAIN_PARAMS["Max_time"]
    for t in range(max_time):
        new_z = controller.get_z(h[-1])
        z.append(new_z)
        new_h = environment.get_next_state(z[-1], t)
        new_reward = get_reward(h[-1] / 10, False)
        reward.append(new_reward)
        if TANK_DIST["add"]:
            new_d = environment.model.dist.flow[t]
            d.append(new_d)
        h.append(new_h)
        if environment.show_rendering:
            environment.render(z[-1])
        if keyboard.is_pressed("ctrl+x"):
            break

    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
    ax1.plot(h[:-1], color="peru", label="Tank 1")
    ax1.set_ylim(0, 10)
    ax1.set_ylabel("Level")
    ax1.legend()

    ax2.plot(z[1:], color="peru", label="Tank 1")
    ax2.set_ylabel("Valve")
    ax2.legend()
    ax2.set_ylim(-0.01, 1.01)

    ax3.plot(d[:-1], color="peru", label="Tank 1")
    ax3.set_ylabel("Disturbance")
    ax3.legend()

    # plt.legend([l1, l2, l3], ["Tank height", "Valve position", "Disturbance"])
    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()
    return np.sum(reward)

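# --- Hedged illustration (not from the project) -------------------------------
# main() above calls controller.get_z(h) every step; the project's P_controller
# is not shown here. The sketch below is a generic proportional controller with
# output clamping, offered only as an assumption of the kind of mapping get_z
# performs (setpoint, gain, nominal position, and sign convention are all
# illustrative and depend on whether the valve sits on the inlet or outlet).
def proportional_valve_position(level, setpoint, kc, z_nominal=0.5):
    """Proportional control: valve opening = nominal + gain * error, clamped to [0, 1]."""
    error = level - setpoint
    z = z_nominal + kc * error
    return min(1.0, max(0.0, z))
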
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []

    # ================= Running episodes =================#
    state, episode_reward = environment.reset()
    h_ = np.array([state[0][0][0], state[0][1][0]])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        action = agent.act(state[-1])  # get action choice from state
        z_ = agent.action_choices[action]  # convert action choice into valve position
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t)  # Calculate next state with action
        reward = get_reward(next_state, terminated)  # get reward from transition to next state

        # Store data
        episode_reward.append(reward)
        state.append(next_state)

        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(environment.tanks[i].dist.flow[t] + environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)

        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break
        if keyboard.is_pressed("ctrl+x"):
            break
        if not environment.running:
            break
    print(np.sum(episode_reward))

    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
    d = np.array(d)
    h = np.array(h[:-1])
    z = np.array(z)
    h *= 10
    ax1.plot(h[:-1, 0], color="peru", label="Tank 1")
    ax1.plot(h[:-1, 1], color="firebrick", label="Tank 2")
    ax1.set_ylabel("Level")
    ax1.legend(loc="upper right")
    ax1.set_ylim(0, 10)

    ax2.plot(z[1:, 0], color="peru", label="Tank 1")
    ax2.plot(z[1:, 1], color="firebrick", label="Tank 2")
    ax2.legend(loc="upper right")
    ax2.set_ylabel("Valve")
    ax2.set_ylim(0, 1.01)

    ax3.plot(d[:, 0], color="peru", label="Tank 1")
    ax3.plot(d[:, 1], color="firebrick", label="Tank 2")
    ax3.set_ylabel("Disturbance")
    ax3.legend(loc="upper right")

    # plt.legend([l1, l2, l3], ["Tank height", "Valve position", "Disturbance"])
    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()

def process_match(match, team, augment_data=True):
    """
    process_match takes an input match and breaks each incremental pick and ban
    in the draft down into experiences (aka "memories").

    Args:
        match (dict): match dictionary with pick and ban data for a single game.
        team (DraftState.BLUE_TEAM or DraftState.RED_TEAM): The team perspective that is used to process the match.
            The selected team has the positions for each pick explicitly included with the experience, while the
            "opposing" team has the assigned positions for its champion picks masked.
        augment_data (optional) (bool): flag controlling the randomized ordering of submissions that do not affect the draft as a whole
    Returns:
        experiences ( list(tuple) ): list of experience tuples. Each experience is of the form (s, a, r, s') where
            - s and s' are DraftState states before and after a single action
            - a is the (stateIndex, position) tuple of the selected champion to be banned or picked. position = 0 for submissions by the opposing team
            - r is the integer reward obtained from submitting the action a

    process_match() can take the vantage of both sides of the draft to parse for memories. This means we can
    ultimately sample from both winning drafts (positive reinforcement) and losing drafts (negative reinforcement) when training.
    """
    experiences = []
    valid_champ_ids = get_champion_ids()

    # This section controls data augmentation of the match. Certain submissions in the draft are
    # submitted consecutively by the same team during the same phase (ie team1 pick0 -> team1 pick1).
    # Although these submissions were produced in a particular order, from a draft perspective
    # there is no difference between submissions of the form
    # team1 pick0 -> team1 pick1 vs team1 pick1 -> team1 pick0
    # provided that the two picks are from the same phase (both bans or both picks).
    # Therefore it is possible to augment the order in which these submissions are processed.
    # Note that we can also augment the banning phase if desired. Although these submissions technically
    # fall outside of the conditions listed above, in practice bans made in the same phase are
    # interchangeable in order.

    # Build queue of actions from match reference (augmenting if desired)
    augments_list = [
        ("blue", "bans", slice(0, 3)),   # Blue bans 0,1,2 are augmentable
        ("blue", "bans", slice(3, 5)),   # Blue bans 3,4 are augmentable
        ("red", "bans", slice(0, 3)),
        ("red", "bans", slice(3, 5)),
        ("blue", "picks", slice(1, 3)),  # Blue picks 1,2 are augmentable
        ("blue", "picks", slice(3, 5)),  # Blue picks 3,4 are augmentable
        ("red", "picks", slice(0, 2))    # Red picks 0,1 are augmentable
    ]
    if (augment_data):
        augmented_match = deepcopy(match)  # Deepcopy match to avoid side effects
        for aug in augments_list:
            (k1, k2, aug_range) = aug
            count = len(augmented_match[k1][k2][aug_range])
            augmented_match[k1][k2][aug_range] = random.sample(augmented_match[k1][k2][aug_range], count)
        action_queue = build_action_queue(augmented_match)
    else:
        action_queue = build_action_queue(match)

    # Set up draft state
    draft = DraftState(team, valid_champ_ids)

    finish_memory = False
    while action_queue:
        # Get next pick from deque
        submission = action_queue.popleft()
        (submitting_team, pick, position) = submission

        # There are two conditions under which we want to finalize a memory:
        # 1. Non-designated team has finished submitting picks for this phase (ie the next submission belongs to the designated team)
        # 2. Draft is complete (no further picks in the draft)
        if submitting_team == team:
            if finish_memory:
                # This is case 1 to store memory
                r = get_reward(draft, match, a, a)
                s_next = deepcopy(draft)
                memory = (s, a, r, s_next)
                experiences.append(memory)
                finish_memory = False
            # Memory starts when upcoming pick belongs to designated team
            s = deepcopy(draft)
            # Store action = (champIndex, pos)
            a = (pick, position)
            finish_memory = True
        else:
            # Mask positions for pick submissions belonging to the non-designated team
            if position != -1:
                position = 0

        draft.update(pick, position)

    # Once the queue is empty, store the last memory. This is case 2 above.
    # There is always an outstanding memory at the completion of the draft.
    # RED_TEAM always gets the last pick. Therefore:
    #   if team = BLUE_TEAM -> there is an outstanding memory from the last RED_TEAM submission
    #   if team = RED_TEAM  -> the memory is open from just before our last submission
    if (draft.evaluate() == DraftState.DRAFT_COMPLETE):
        assert finish_memory == True
        r = get_reward(draft, match, a, a)
        s_next = deepcopy(draft)
        memory = (s, a, r, s_next)
        experiences.append(memory)
    else:
        print("{} vs {}".format(match["blue_team"], match["red_team"]))
        draft.display()
        print("Error code {}".format(draft.evaluate()))
        print("Number of experiences {}".format(len(experiences)))
        for experience in experiences:
            _, a, _, _ = experience
            print(a)
        print("")  # raise

    return experiences

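# --- Hedged illustration (not from the project) -------------------------------
# The augmentation step in process_match() re-orders only the interchangeable
# span of a pick/ban list by assigning random.sample back into a slice. A tiny
# self-contained demonstration of that slice-shuffle idiom on toy data:
import random

picks = ["pick0", "pick1", "pick2", "pick3", "pick4"]
aug_range = slice(1, 3)  # picks 1 and 2 share a phase, so their order is interchangeable
segment = picks[aug_range]
picks[aug_range] = random.sample(segment, len(segment))  # shuffle only that span
print(picks)  # e.g. ['pick0', 'pick2', 'pick1', 'pick3', 'pick4']
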
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []

    # ================= Running episodes =================#
    state, episode_reward = environment.reset()
    h_ = np.array([state[0][i][0] for i in range(6)])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        z_ = agent.act(state[-1])  # get action choice from state
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t)  # Calculate next state with action
        reward = get_reward(next_state, terminated)  # get reward from transition to next state

        # Store data
        episode_reward.append(reward)
        state.append(next_state)

        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            try:
                d_.append(environment.tanks[i].dist.flow[t] + environment.q_inn[i])
            except AttributeError:
                d_.append(environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)

        if environment.show_rendering:
            environment.render(z[-1])
        if True in terminated:
            break
        if keyboard.is_pressed("ctrl+x"):
            break
        if not environment.running:
            break

    colors = [
        "peru",
        "firebrick",
        "darkslategray",
        "darkviolet",
        "mediumseagreen",
        "darkcyan",
    ]
    h = np.array(h) * 10
    d = np.array(d)
    z = np.array(z)
    for i in range(2):
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)

        ax1.plot(h[1:-1, 0 + i * 3], color=colors[0 + i * 3], label="Tank {}".format(1 + i * 3))
        ax1.plot(h[1:-1, 1 + i * 3], color=colors[1 + i * 3], label="Tank {}".format(2 + i * 3))
        ax1.plot(h[1:-1, 2 + i * 3], color=colors[2 + i * 3], label="Tank {}".format(3 + i * 3))
        ax1.set_ylabel("Level")
        ax1.legend(loc="upper right")
        ax1.set_ylim(0, 10)

        ax2.plot(z[1:, 0 + i * 3], color=colors[0 + i * 3], label="Tank {}".format(1 + i * 3))
        ax2.plot(z[1:, 1 + i * 3], color=colors[1 + i * 3], label="Tank {}".format(2 + i * 3))
        ax2.plot(z[1:, 2 + i * 3], color=colors[2 + i * 3], label="Tank {}".format(3 + i * 3))
        ax2.set_ylabel("Valve")
        ax2.legend(loc="upper right")
        ax2.set_ylim(0, 1.01)

        ax3.plot(d[1:-1, 0 + i * 3], color=colors[0 + i * 3], label="Tank {}".format(1 + i * 3))
        ax3.plot(d[1:-1, 1 + i * 3], color=colors[1 + i * 3], label="Tank {}".format(2 + i * 3))
        ax3.plot(d[1:-1, 2 + i * 3], color=colors[2 + i * 3], label="Tank {}".format(3 + i * 3))
        ax3.set_ylabel("Disturbance")
        ax3.legend(loc="upper right")

        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()

def train_network(online_net, target_net, training_matches, validation_matches,
                  train_epochs, batch_size, buffer_size, dampen_states=False,
                  load_model=False, verbose=False):
    """
    Args:
        online_net (qNetwork): "live" Q-network to be trained.
        target_net (qNetwork): target Q-network used to generate target values for the online network
        training_matches (list(match)): list of matches to be trained on
        validation_matches (list(match)): list of matches to validate model against
        train_epochs (int): number of times to learn on given data
        batch_size (int): size of each training set sampled from the replay buffer which will be used to update Qnet at a time
        buffer_size (int): size of replay buffer used
        dampen_states (bool): flag for running dampening routine on model
        load_model (bool): flag to reload existing model
        verbose (bool): flag for enhanced output
    Returns:
        (loss_over_epochs, training_accuracy) tuple

    Trains the Q-network Qnet in batches using experience replays.
    """
    num_episodes = len(training_matches)
    if (verbose):
        print("***")
        print("Beginning training..")
        print(" train_epochs: {}".format(train_epochs))
        print(" num_episodes: {}".format(num_episodes))
        print(" batch_size: {}".format(batch_size))
        print(" buffer_size: {}".format(buffer_size))
    if (dampen_states):
        print(" ********************************")
        print(" WARNING: BEGINNING DAMPENING CYCLES")
        print(" THIS SHOULD ONLY BE USED TO REDUCE VALUATION FOR OLDER METAS")
        print(" ********************************")
        time.sleep(2.)

    # Hyperparameter used in updating target network
    # Some notable values:
    #  tau = 1.e-3 -> used in original paper
    #  tau = 0.5 -> average DDQN
    #  tau = 1.0 -> copy online -> target
    tau = 1.
    target_update_frequency = 10000  # How often to update target network. Should only be used with tau = 1.
    stash_model = True  # Flag for stashing a copy of the model
    model_stash_interval = 10  # Stashes a copy of the model this often

    # Number of steps to take before training. Allows buffer to partially fill.
    # Must be at least batch_size to avoid error when sampling from experience replay
    pre_training_steps = 10 * batch_size
    assert (pre_training_steps <= buffer_size), "Replay not large enough for pre-training!"
    assert (pre_training_steps >= batch_size), "Buffer not allowed to fill enough before sampling!"
    # Number of steps to force learner to observe submitted actions, rather than submit its own actions
    observations = 2000

    epsilon = 0.5  # Initial probability of letting the learner submit its own action
    eps_decay_rate = 1. / (25 * 20 * len(training_matches))  # Rate at which epsilon decays per submission

    # Number of steps to take between training
    update_freq = 1  # There are 10 submissions per match per side

    overwrite_initial_lr = 2.0e-5  # Overwrite default lr for network
    lr_decay_freq = 5  # Decay learning rate after a set number of epochs
    min_learning_rate = 1.e-8  # Minimum learning rate allowed to decay to

    teams = [DraftState.BLUE_TEAM, DraftState.RED_TEAM]

    # We can't validate a winner for submissions generated by the learner,
    # so we will use a winner-less match when getting rewards for such states
    blank_match = {"winner": None}

    loss_over_epochs = []
    total_steps = 0
    # Start training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if load_model:
            # Open saved model
            path_to_model = "tmp/model_E{}.ckpt".format(25)
            # path_to_model = "model_predictions/play_ins_rd2/model_play_ins_rd2.ckpt"
            online_net.saver.restore(sess, path_to_model)
            print("\nCheckpoint loaded from {}".format(path_to_model))
            if (overwrite_initial_lr):
                online_net.learning_rate.assign(overwrite_initial_lr).eval()

        # Add target init and update operations to graph
        target_init = create_target_initialization_ops(target_net.name, online_net.name)
        target_update = create_target_update_ops(target_net.name, online_net.name, tau)
        # Initialize target network
        sess.run(target_init)

        # Get initial loss and accuracy estimates
        val_loss, val_acc = validate_model(sess, validation_matches, online_net, target_net)
        loss, train_acc = validate_model(sess, training_matches, online_net, target_net)
        print(" Initial loss {:.6f}, train {:.6f}, val {:.6f}".format(loss, train_acc, val_acc), flush=True)

        # Initialize experience replay buffer
        experience_replay = er.ExperienceBuffer(buffer_size)

        for i in range(train_epochs):
            t0 = time.time()
            if ((i > 0) and (i % lr_decay_freq == 0)
                    and (online_net.learning_rate.eval() >= min_learning_rate)):
                # Decay learning rate according to decay schedule
                online_net.learning_rate = 0.50 * online_net.learning_rate

            epoch_steps = 0
            bad_state_counts = {
                "wins": {
                    DraftState.BAN_AND_SUBMISSION: 0,
                    DraftState.DUPLICATE_SUBMISSION: 0,
                    DraftState.DUPLICATE_ROLE: 0,
                    DraftState.INVALID_SUBMISSION: 0,
                    DraftState.TOO_MANY_BANS: 0,
                    DraftState.TOO_MANY_PICKS: 0
                },
                "loss": {
                    DraftState.BAN_AND_SUBMISSION: 0,
                    DraftState.DUPLICATE_SUBMISSION: 0,
                    DraftState.DUPLICATE_ROLE: 0,
                    DraftState.INVALID_SUBMISSION: 0,
                    DraftState.TOO_MANY_BANS: 0,
                    DraftState.TOO_MANY_PICKS: 0
                }
            }
            learner_submitted_counts = 0
            null_action_count = 0

            # Shuffle match presentation order
            shuffled_matches = random.sample(training_matches, len(training_matches))

            # Run model through a self-training iteration, including exploration
            experiences = self_train(sess, epsilon, n_experiences=20)
            # If self training results in illegal states, add it to memory
            if experiences:
                print("adding {} self-trained experiences..".format(len(experiences)))
                # for exp in experiences:
                #     _, _, r, _ = exp
                #     print("reward (should be negative) = {}".format(r))
                experience_replay.store(experiences)
                learner_submitted_counts += len(experiences)

            for match in shuffled_matches:
                for team in teams:
                    # Process match into individual experiences
                    experiences = mp.process_match(match, team)
                    for experience in experiences:
                        # Some experiences include NULL submissions.
                        # The learner isn't allowed to submit NULL picks, so skip adding these
                        # to the buffer.
                        state, actual, _, _ = experience
                        (cid, pos) = actual
                        if cid is None:
                            null_action_count += 1
                            continue
                        # Store original experience
                        experience_replay.store([experience])

                        if (total_steps >= observations):
                            # Let the network predict the next action; if the action leads
                            # to an invalid state add a negatively reinforced experience to the replay buffer.
                            random_submission = False
                            if (random.random() < epsilon):
                                random_submission = True
                                # Explore state space by submitting random action and checking if that action is legal
                                pred_act = [random.randint(0, state.num_actions - 1)]
                            else:
                                # Let model make prediction
                                pred_Q = sess.run(
                                    online_net.outQ,
                                    feed_dict={
                                        online_net.input: [state.format_state()],
                                        online_net.secondary_input: [state.format_secondary_inputs()]
                                    })
                                sorted_actions = pred_Q[0, :].argsort()[::-1]
                                pred_act = sorted_actions[0:4]  # top 4 actions by model
                            top_action = pred_act[0]
                            for action in pred_act:
                                (cid, pos) = state.format_action(action)
                                pred_state = deepcopy(state)
                                pred_state.update(cid, pos)
                                state_code = pred_state.evaluate()
                                r = get_reward(pred_state, blank_match, (cid, pos), actual)
                                new_experience = (state, (cid, pos), r, pred_state)
                                if (state_code in DraftState.invalid_states):
                                    # Prediction moves to illegal state, add negative experience
                                    if (team == match["winner"]):
                                        bad_state_counts["wins"][state_code] += 1
                                    else:
                                        bad_state_counts["loss"][state_code] += 1
                                    experience_replay.store([new_experience])
                                elif (not random_submission and (cid, pos) != actual and action == top_action):
                                    # Add a memory for the "best" legal submission if it was chosen by the model
                                    # and does not duplicate an already submitted memory
                                    learner_submitted_counts += 1
                                    experience_replay.store([new_experience])

                        if (epsilon > 0.1):
                            # Reduce epsilon over time
                            epsilon -= eps_decay_rate

                        total_steps += 1
                        epoch_steps += 1

                        # Every update_freq steps we train the network using samples from the replay buffer
                        if ((total_steps >= pre_training_steps) and (total_steps % update_freq == 0)):
                            training_batch = experience_replay.sample(batch_size)

                            # Calculate target Q values for each example:
                            # For non-terminal states, targetQ is estimated according to
                            #   targetQ = r + gamma*Q'(s', max_a Q(s',a))
                            # where Q' denotes the target network.
                            # For terminating states the target is computed as
                            #   targetQ = r
                            updates = []
                            for exp in training_batch:
                                startState, _, reward, endingState = exp
                                if (dampen_states):
                                    # To dampen states (usually done after major patches or when the meta shifts)
                                    # we replace winning rewards with 0. (essentially a loss).
                                    reward = 0.
                                state_code = endingState.evaluate()
                                if (state_code == DraftState.DRAFT_COMPLETE or state_code in DraftState.invalid_states):
                                    # Action moves to terminal state
                                    updates.append(reward)
                                else:
                                    # Following the double DQN paper (https://arxiv.org/abs/1509.06461),
                                    # the action is chosen by the online network, but the target network is used to evaluate this policy.
                                    # Each row in predicted_Q gives estimated Q(s',a) values for all possible actions for the input state s'.
                                    predicted_action = sess.run(
                                        online_net.prediction,
                                        feed_dict={
                                            online_net.input: [endingState.format_state()],
                                            online_net.secondary_input: [endingState.format_secondary_inputs()]
                                        })[0]
                                    predicted_Q = sess.run(
                                        target_net.outQ,
                                        feed_dict={
                                            target_net.input: [endingState.format_state()],
                                            target_net.secondary_input: [endingState.format_secondary_inputs()]
                                        })
                                    updates.append(reward + online_net.discount_factor * predicted_Q[0, predicted_action])

                            targetQ = np.array(updates)
                            targetQ.shape = (batch_size,)

                            # Update online net using target Q
                            # Experience replay stores action = (champion_id, position) pairs;
                            # these need to be converted into the corresponding index of the input vector to the Qnet
                            actions = np.array([startState.get_action(*exp[1]) for exp in training_batch])
                            _ = sess.run(
                                online_net.update,
                                feed_dict={
                                    online_net.input: np.stack([exp[0].format_state() for exp in training_batch], axis=0),
                                    online_net.secondary_input: np.stack([exp[0].format_secondary_inputs() for exp in training_batch], axis=0),
                                    online_net.actions: actions,
                                    online_net.target: targetQ,
                                    online_net.dropout_keep_prob: 0.5
                                })

                            if (total_steps % target_update_frequency == 0):
                                # After the online network has been updated, update target network
                                _ = sess.run(target_update)

            t1 = time.time() - t0
            val_loss, val_acc = validate_model(sess, validation_matches, online_net, target_net)
            loss, train_acc = validate_model(sess, training_matches, online_net, target_net)
            loss_over_epochs.append(loss)
            # Once training is complete, save the updated network
            out_path = online_net.saver.save(sess, "tmp/model_E{}.ckpt".format(train_epochs))
            if (verbose):
                print(" Finished epoch {}/{}: dt {:.2f}, mem {}, loss {:.6f}, train {:.6f}, val {:.6f}".format(
                    i + 1, train_epochs, t1, epoch_steps, loss, train_acc, val_acc), flush=True)
                print(" alpha:{:.4e}".format(online_net.learning_rate.eval()))
                invalid_action_count = sum([bad_state_counts["wins"][k] + bad_state_counts["loss"][k]
                                            for k in bad_state_counts["wins"]])
                print(" negative memories added = {}".format(invalid_action_count))
                print(" bad state distributions:")
                print(" from wins: {:9} from losses:".format(""))
                for code in bad_state_counts["wins"]:
                    print(" {:3} -> {:3} counts {:2} {:3} -> {:3} counts".format(
                        code, bad_state_counts["wins"][code], "", code, bad_state_counts["loss"][code]))
                print(" learner submissions: {}".format(learner_submitted_counts))
                print(" model is saved in file: {}".format(out_path))
                print("***", flush=True)

            if (stash_model):
                if (i > 0 and (i + 1) % model_stash_interval == 0):
                    # Stash a copy of the current model
                    out_path = online_net.saver.save(sess, "tmp/models/model_E{}.ckpt".format(i + 1))
                    print("Stashed a copy of the current model in {}".format(out_path))

    stats = (loss_over_epochs, train_acc)
    return stats

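# --- Hedged illustration (not from the project) -------------------------------
# The target computation above follows the double DQN rule
# (https://arxiv.org/abs/1509.06461): the online network chooses the greedy
# action at s', while the target network supplies that action's value. A
# minimal numpy sketch of the rule with stand-in Q arrays (values illustrative):
import numpy as np

def double_dqn_target(reward, q_online_next, q_target_next, gamma, terminal):
    """targetQ = r for terminal transitions, else r + gamma * Q_target(s', argmax_a Q_online(s', a))."""
    if terminal:
        return reward
    best_action = int(np.argmax(q_online_next))
    return reward + gamma * q_target_next[best_action]

# Example: one non-terminal transition
q_online_next = np.array([0.2, 1.5, 0.7])  # online net's Q(s', a)
q_target_next = np.array([0.3, 1.1, 0.9])  # target net's Q(s', a)
print(double_dqn_target(1.0, q_online_next, q_target_next, gamma=0.9, terminal=False))  # 1.0 + 0.9 * 1.1
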
def train_epoch(self):
    """
    Training loop for a single epoch
    """
    # We can't validate a winner for submissions generated by the learner,
    # so we will use a winner-less match when getting rewards for such states
    blank_match = {"winner": None}

    learner_submitted_actions = 0
    null_actions = 0

    # Shuffle match presentation order
    shuffled_matches = random.sample(self.training_data, len(self.training_data))
    for match in shuffled_matches:
        for team in self.teams:
            # Process match into individual experiences
            experiences = mp.process_match(match, team)
            for pick_id, experience in enumerate(experiences):
                # Some experiences include NULL submissions (usually missing bans).
                # The learner isn't allowed to submit NULL picks, so skip adding these
                # to the buffer.
                state, actual, _, _ = experience
                (cid, pos) = actual
                if cid is None:
                    null_actions += 1
                    continue
                # Store original experience
                self.replay.store([experience])
                self.step_count += 1

                # Give model feedback on current estimations
                if (self.step_count > self.observations):
                    # Let the network predict the next action
                    feed_dict = {
                        self.ddq_net.online_ops["input"]: [state.format_state()],
                        self.ddq_net.online_ops["valid_actions"]: [state.get_valid_actions()]
                    }
                    q_vals = self.ddq_net.sess.run(self.ddq_net.online_ops["valid_outQ"], feed_dict=feed_dict)
                    sorted_actions = q_vals[0, :].argsort()[::-1]
                    top_actions = sorted_actions[0:4]

                    if (random.random() < self.epsilon):
                        pred_act = random.sample(list(top_actions), 1)
                    else:
                        # Use model's top prediction
                        pred_act = [sorted_actions[0]]

                    for action in pred_act:
                        (cid, pos) = state.format_action(action)
                        if ((cid, pos) != actual):
                            pred_state = deepcopy(state)
                            pred_state.update(cid, pos)
                            r = get_reward(pred_state, blank_match, (cid, pos), actual)
                            new_experience = (state, (cid, pos), r, pred_state)
                            self.replay.store([new_experience])
                            learner_submitted_actions += 1

                if (self.epsilon > 0.1):
                    # Reduce epsilon over time
                    self.epsilon -= self.eps_decay_rate

                # Use minibatch sample to update online network
                if (self.step_count > self.pre_training_steps):
                    self.train_step()

                    if (self.step_count % self.target_update_frequency == 0):
                        # After the online network has been updated, update target network
                        _ = self.ddq_net.sess.run(self.ddq_net.target_ops["target_update"])

    # Get training loss, training_acc, and val_acc to return
    loss, train_acc = self.validate_model(self.training_data)
    _, val_acc = self.validate_model(self.validation_data)
    return (loss, train_acc, val_acc)

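# --- Hedged illustration (not from the project) -------------------------------
# train_epoch() feeds a "valid_actions" mask and reads "valid_outQ", i.e. the
# network's Q-values restricted to legal submissions. The graph ops themselves
# are defined elsewhere; the numpy sketch below only shows the masking idea the
# names suggest: invalid actions are set to -inf so argmax can never pick them.
import numpy as np

def mask_invalid_q(q_values, valid_mask):
    """Return Q-values with invalid actions replaced by -inf (valid_mask is boolean)."""
    return np.where(valid_mask, q_values, -np.inf)

q = np.array([0.4, 2.0, 1.3])
valid = np.array([True, False, True])  # action 1 is illegal in this state
print(int(np.argmax(mask_invalid_q(q, valid))))  # -> 2, the best *legal* action
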
def self_train(sess, explore_prob, n_experiences=1):
    """
    Runs the model currently held in the TF Session sess through one self-training loop.
    Returns negative memories if the model fails to complete the draft.
    Args:
        sess (tf.Session()): TF Session used to run model.
        explore_prob (float): Probability that each pick will explore state space by submitting a random action
        n_experiences (int): Number of experiences desired.
    Returns:
        experiences [(s,a,r,s')]: list of experience tuples from illegal submissions made by either side of the draft.
            The list is empty if the network completes its drafts without illegal actions.
    """
    MAX_DRAFT_ITERATIONS = 100  # Maximum number of drafts to iterate through
    assert n_experiences > 0, "Number of experiences must be positive"

    valid_champ_ids = cinfo.get_champion_ids()
    match = {"winner": None}  # Blank match for rewards processing

    # Two states are maintained: one corresponding to the perception of the draft
    # according to each of the teams.
    blue_state = DraftState(DraftState.BLUE_TEAM, valid_champ_ids)
    red_state = DraftState(DraftState.RED_TEAM, valid_champ_ids)

    # Draft dictionary holds states for each perspective
    draft = {0: blue_state, 1: red_state}

    online_pred = tf.get_default_graph().get_tensor_by_name("online/prediction:0")
    online_input = tf.get_default_graph().get_tensor_by_name("online/inputs:0")
    online_secondary_input = tf.get_default_graph().get_tensor_by_name("online/secondary_inputs:0")

    experiences = []
    successful_draft_count = 0
    while (len(experiences) < n_experiences):
        if (successful_draft_count > MAX_DRAFT_ITERATIONS):
            break
        blue_state.reset()
        red_state.reset()
        submission_count = 0
        while (blue_state.evaluate() != DraftState.DRAFT_COMPLETE
               and red_state.evaluate() != DraftState.DRAFT_COMPLETE):
            active_team = get_active_team(submission_count)
            inactive_team = 0 if active_team else 1
            state = draft[active_team]
            start = deepcopy(state)

            if (random.random() < explore_prob):
                # Explore state space by submitting random action
                pred_act = [random.randint(0, state.num_actions - 1)]
            else:
                pred_act = sess.run(
                    online_pred,
                    feed_dict={
                        online_input: [state.format_state()],
                        online_secondary_input: [state.format_secondary_inputs()]
                    })
            action = state.format_action(pred_act[0])
            if (state.is_submission_legal(*action)):
                # Update active state
                state.update(*action)
                # Update inactive state, remembering to mask non-bans submitted by the opponent
                (cid, pos) = action
                inactive_pos = pos if pos == -1 else 0
                draft[inactive_team].update(cid, inactive_pos)
                submission_count += 1
            else:
                bad_state = deepcopy(state)
                bad_state.update(*action)
                experiences.append((start, action, get_reward(bad_state, match, action, None), bad_state))
                break
        successful_draft_count += 1
    return experiences

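# --- Hedged illustration (not from the project) -------------------------------
# self_train() turns illegal submissions into negatively rewarded memories:
# copy the state before acting, apply the action, and if the result is invalid,
# store (s, a, r, s') and stop the rollout. A toy version of that pattern with
# a plain list standing in for DraftState (everything here is illustrative):
import random

def toy_rollout(n_slots=5, n_valid=5, penalty=-1.0):
    """Pick ids until the roster is full; an out-of-range or duplicate pick ends the rollout with one negative memory."""
    picked = []
    memories = []
    while len(picked) < n_slots:
        start = list(picked)
        action = random.randint(0, n_valid + 1)  # occasionally outside the valid range
        next_state = picked + [action]
        if action >= n_valid or action in picked:  # illegal: unknown id or duplicate
            memories.append((start, action, penalty, next_state))
            break
        picked.append(action)
    return memories
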