Example #1
def RGOT(G, number_of_turns=1500, number_of_arms=10,
         arms_behavior="Bernoulli",
         policies=["Epsilon_greedy", "UCB",
                   "Epsilon_z_greedy", "UCB_z",
                   "Epsilon_soft_greedy", "UCB_soft",
                   "variable_pool"]):          # set the policies you want to play
    arms = create_arms(number_of_arms, arms_behavior, policies)
    rewards_history = create_reward_history(number_of_turns, policies)
    tradeoff_history = create_tradeoff_history(number_of_turns, policies)
    #pool_size = [0]*number_of_turns
    for policy in policies:
        arms = initialize_mean_reward(arms, G, rewards_history,
                                      tradeoff_history,
                                      policy)  ## add option to not initialize
    for t in range(number_of_arms, number_of_turns):
        for policy in policies:
            best_arm_so_far = get_best_estimate_arm_index(arms, policy)
            z = compute_z(G)  # may depend on policy
            arm_to_play, tradeoff = choose_arm_and_tradeoff(
                t, policy, arms, best_arm_so_far, G, z)
            x_t = get_reward(arms[arm_to_play])  # reward for the arm played
            rewards_history[policy][t] = x_t * G[t]  # actual reward received, scaled by the greed function G
            tradeoff_history[policy][t] = tradeoff * x_t * G[t]
            update_arm(arms[arm_to_play], x_t, t, policy)  # update the arm's estimate under this policy
    return pd.DataFrame(arms), rewards_history, tradeoff_history
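The helpers used above (create_arms, choose_arm_and_tradeoff, update_arm) are not shown. As an illustration only, here is a minimal sketch of the exploit/explore choice an "Epsilon_greedy"-style policy could make inside choose_arm_and_tradeoff; the names and the tradeoff encoding are assumptions, not the original implementation:

import random

def epsilon_greedy_choice(arms, best_arm_so_far, epsilon=0.1):
    # Hypothetical sketch: exploit the best-estimate arm with probability 1 - epsilon,
    # otherwise explore a uniformly random arm. Returns (arm_index, tradeoff), where
    # tradeoff is 1 when exploiting and 0 when exploring.
    if random.random() < epsilon:
        return random.randrange(len(arms)), 0  # explore
    return best_arm_so_far, 1  # exploit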
Example #2
def main(kc=AGENT_PARAMS["KC"]):
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    controller = P_controller(environment, AGENT_PARAMS, kc)
    init_h = TANK_PARAMS["init_level"] * TANK_PARAMS["height"]
    h = [init_h]
    z = [AGENT_PARAMS["INIT_POSITION"]]
    d = [TANK_DIST["nom_flow"]]
    reward = []
    max_time = MAIN_PARAMS["Max_time"]
    for t in range(max_time):
        new_z = controller.get_z(h[-1])
        z.append(new_z)
        new_h = environment.get_next_state(z[-1], t)
        new_reward = get_reward(h[-1] / 10, False)
        reward.append(new_reward)

        if TANK_DIST["add"]:
            new_d = environment.model.dist.flow[t]
            d.append(new_d)
        h.append(new_h)

        if environment.show_rendering:
            environment.render(z[-1])

        if keyboard.is_pressed("ctrl+x"):
            break
    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)

    ax1.plot(h[:-1], color="peru", label="Tank 1")
    ax1.set_ylim(0, 10)
    ax1.set_ylabel("Level")
    ax1.legend()

    ax2.plot(z[1:], color="peru", label="Tank 1")
    ax2.set_ylabel("Valve")
    ax2.legend()
    ax2.set_ylim(-0.01, 1.01)

    ax3.plot(d[:-1], color="peru", label="Tank 1")
    ax3.set_ylabel("Disturbance")
    ax3.legend()

    # plt.legend([l1, l2, l3], ["Tank height", "Valve position", "Disturbance"])
    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()
    return np.sum(reward)
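P_controller.get_z is defined elsewhere in this project. Below is a minimal sketch of a plausible proportional control law consistent with how it is called above, assuming a level set point and a valve position clamped to [0, 1]; the class name, set point, and bias are assumptions:

class SimplePController:
    # Hypothetical stand-in for P_controller: valve opening proportional to the level error.
    def __init__(self, kc, setpoint, bias=0.5):
        self.kc = kc              # proportional gain (AGENT_PARAMS["KC"] above)
        self.setpoint = setpoint  # desired tank level
        self.bias = bias          # nominal valve position at zero error

    def get_z(self, level):
        error = level - self.setpoint
        z = self.bias + self.kc * error
        return min(max(z, 0.0), 1.0)  # clamp the valve position to [0, 1]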
Example #3
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#

    state, episode_reward = environment.reset()
    h_ = np.array([state[0][0][0], state[0][1][0]])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        action = agent.act(state[-1])  # get action choice from state
        z_ = agent.action_choices[
            action]  # convert action choice into valve position
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t)  # Calculate next state with action
        reward = get_reward(
            next_state, terminated)  # get reward from transition to next state

        # Store data
        episode_reward.append(reward)

        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            d_.append(environment.tanks[i].dist.flow[t] + environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)
        if environment.show_rendering:
            environment.render(z[-1])
        if any(terminated):
            break

        if keyboard.is_pressed("ctrl+x"):
            break

        if not environment.running:
            break
    print(np.sum(episode_reward))

    _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
    d = np.array(d)
    h = np.array(h[:-1])
    z = np.array(z)
    h *= 10

    ax1.plot(h[:-1, 0], color="peru", label="Tank 1")
    ax1.plot(h[:-1, 1], color="firebrick", label="Tank 2")
    ax1.set_ylabel("Level")
    ax1.legend(loc="upper right")
    ax1.set_ylim(0, 10)

    ax2.plot(z[1:, 0], color="peru", label="Tank 1")
    ax2.plot(z[1:, 1], color="firebrick", label="Tank 2")
    ax2.legend(loc="upper right")
    ax2.set_ylabel("Valve")
    ax2.set_ylim(0, 1.01)

    ax3.plot(d[:, 0], color="peru", label="Tank 1")
    ax3.plot(d[:, 1], color="firebrick", label="Tank 2")
    ax3.set_ylabel("Disturbance")
    ax3.legend(loc="upper right")

    # plt.legend([l1, l2, l3], ["Tank height", "Valve position", "Disturbance"])
    plt.tight_layout()
    plt.xlabel("Time")
    plt.show()
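get_reward is imported from elsewhere in this project and is not shown. The following is a purely illustrative sketch of a level-keeping reward that matches the call signature get_reward(next_state, terminated) used above; the thresholds and penalty values are assumptions, not the project's actual reward:

def sketch_get_reward(next_state, terminated):
    # Illustrative only. Assumes next_state[i][0] holds tank i's level normalized
    # to [0, 1] and terminated is a list of booleans, one per tank.
    if any(terminated):
        return -10  # heavy penalty for letting any tank overflow or run dry
    reward = 0
    for tank_state in next_state:
        level = float(tank_state[0])
        reward += 1 if 0.25 <= level <= 0.75 else 0  # favor levels near the middle band
    return reward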
Example #4
def process_match(match, team, augment_data=True):
    """
    process_match takes an input match and breaks each incremental pick and ban in the draft down into experiences (aka "memories").

    Args:
        match (dict): match dictionary with pick and ban data for a single game.
        team (DraftState.BLUE_TEAM or DraftState.RED_TEAM): the team perspective used to process the match.
            The selected team has the position for each of its picks explicitly included with the experience, while the
            "opposing" team has the assigned positions for its champion picks masked.
        augment_data (bool, optional): flag controlling the randomized reordering of submissions that do not affect the draft as a whole
    Returns:
        experiences ( list(tuple) ): list of experience tuples. Each experience is of the form (s, a, r, s') where:
            - s and s' are DraftState states before and after a single action
            - a is the (stateIndex, position) tuple of selected champion to be banned or picked. position = 0 for submissions
                by the opposing team
            - r is the integer reward obtained from submitting the action a

    process_match() can take the vantage from both sides of the draft to parse for memories. This means we can ultimately sample from
    both winning drafts (positive reinforcement) and losing drafts (negative reinforcement) when training.
    """
    experiences = []
    valid_champ_ids = get_champion_ids()

    # This section controls data augmentation of the match. Certain submissions in the draft are
    # submitted consecutively by the same team during the same phase (ie team1 pick0 -> team1 pick1).
    # Although these submissions were produced in a particular order, from a draft perspective
    # there is no difference between submissions of the form
    # team1 pick0 -> team1 pick1 vs team1 pick1 -> team1 pick0
    # provided that the two picks are from the same phase (both bans or both picks).
    # Therefore it is possible to augment the order in which these submissions are processed.

    # Note that we can also augment the banning phase if desired. Although these submissions technically
    # fall outside of the conditions listed above, in practice bans made in the same phase are
    # interchangeable in order.

    # Build queue of actions from match reference (augmenting if desired)
    augments_list = [
        ("blue","bans",slice(0,3)), # Blue bans 0,1,2 are augmentable
        ("blue","bans",slice(3,5)), # Blue bans 3,4 are augmentable
        ("red","bans",slice(0,3)),
        ("red","bans",slice(3,5)),
        ("blue","picks",slice(1,3)), # Blue picks 1,2 are augmentable
        ("blue","picks",slice(3,5)), # Blue picks 3,4 are augmentable
        ("red","picks",slice(0,2)) # Red picks 0,1 are augmentable
    ]
    if(augment_data):
        augmented_match = deepcopy(match) # Deepcopy match to avoid side effects
        for aug in augments_list:
            (k1,k2,aug_range) = aug
            count = len(augmented_match[k1][k2][aug_range])
            augmented_match[k1][k2][aug_range] = random.sample(augmented_match[k1][k2][aug_range],count)

        action_queue = build_action_queue(augmented_match)
    else:
        action_queue = build_action_queue(match)

    # Set up draft state
    draft = DraftState(team,valid_champ_ids)

    finish_memory = False
    while action_queue:
        # Get next pick from deque
        submission = action_queue.popleft()
        (submitting_team, pick, position) = submission

        # There are two conditions under which we want to finalize a memory:
        # 1. Non-designated team has finished submitting picks for this phase (ie next submission belongs to the designated team)
        # 2. Draft is complete (no further picks in the draft)
        if submitting_team == team:
            if finish_memory:
                # This is case 1 to store memory
                r = get_reward(draft, match, a, a)
                s_next = deepcopy(draft)
                memory = (s, a, r, s_next)
                experiences.append(memory)
                finish_memory = False
            # Memory starts when upcoming pick belongs to designated team
            s = deepcopy(draft)
            # Store action = (champIndex, pos)
            a = (pick, position)
            finish_memory = True
        else:
            # Mask positions for pick submissions belonging to the non-designated team
            if position != -1:
                position = 0

        draft.update(pick, position)

    # Once the queue is empty, store last memory. This is case 2 above.
    # There is always an outstanding memory at the completion of the draft.
    # RED_TEAM always gets last pick. Therefore:
    #   if team = BLUE_TEAM -> There is an outstanding memory from last RED_TEAM submission
    #   if team = RED_TEAM -> Memory is open from just before our last submission
    if(draft.evaluate() == DraftState.DRAFT_COMPLETE):
        assert finish_memory == True
        r = get_reward(draft, match, a, a)
        s_next = deepcopy(draft)
        memory = (s, a, r, s_next)
        experiences.append(memory)
    else:
        print("{} vs {}".format(match["blue_team"],match["red_team"]))
        draft.display()
        print("Error code {}".format(draft.evaluate()))
        print("Number of experiences {}".format(len(experiences)))
        for experience in experiences:
            _,a,_,_ = experience
            print(a)
        print("")#raise

    return experiences
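A short usage sketch, assuming a match dict in the expected format is already loaded; it mirrors how the training loops in the later examples consume process_match from both team perspectives:

# Collect experiences from both vantage points of one match.
all_experiences = []
for perspective in (DraftState.BLUE_TEAM, DraftState.RED_TEAM):
    all_experiences.extend(process_match(match, perspective, augment_data=True))
# Each entry is an (s, a, r, s') tuple ready to be stored in a replay buffer.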
Example #5
def main():
    # ============= Initialize variables and objects ===========#
    environment = Environment(TANK_PARAMS, TANK_DIST, MAIN_PARAMS)
    agent = Agent(AGENT_PARAMS)
    z = []
    h = []
    d = []
    # ================= Running episodes =================#

    state, episode_reward = environment.reset()
    h_ = np.array([state[0][i][0] for i in range(6)])
    h.append(h_)
    for t in range(MAIN_PARAMS["MAX_TIME"]):
        z_ = agent.act(state[-1])  # get action choice from state
        z.append(np.array(z_))
        terminated, next_state = environment.get_next_state(
            z[-1], state[-1], t
        )  # Calculate next state with action
        reward = get_reward(
            next_state, terminated
        )  # get reward from transition to next state

        # Store data
        episode_reward.append(reward)

        state.append(next_state)
        h_ = []
        d_ = []
        for i in range(agent.n_tanks):
            try:
                d_.append(environment.tanks[i].dist.flow[t] + environment.q_inn[i])
            except AttributeError:
                d_.append(environment.q_inn[i])
            h_.append(np.array(next_state[i][0]))
        d.append(d_)
        h.append(h_)
        if environment.show_rendering:
            environment.render(z[-1])
        if any(terminated):
            break

        if keyboard.is_pressed("ctrl+x"):
            break

        if not environment.running:
            break
    colors = [
        "peru",
        "firebrick",
        "darkslategray",
        "darkviolet",
        "mediumseagreen",
        "darkcyan",
    ]
    h = np.array(h)*10
    d = np.array(d)
    z = np.array(z)
    for i in range(2):
        _, (ax1, ax2, ax3) = plt.subplots(3, sharex=False, sharey=False)
        ax1.plot(
            h[1:-1, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax1.plot(
            h[1:-1, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax1.plot(
            h[1:-1, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax1.set_ylabel("Level")
        ax1.legend(loc="upper right")
        ax1.set_ylim(0, 10)

        ax2.plot(
            z[1:, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax2.plot(
            z[1:, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax2.plot(
            z[1:, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax2.set_ylabel("Valve")
        ax2.legend(loc="upper right")
        ax2.set_ylim(0, 1.01)

        ax3.plot(
            d[1:-1, 0 + i * 3],
            color=colors[0 + i * 3],
            label="Tank {}".format(str(1 + i * 3)),
        )
        ax3.plot(
            d[1:-1, 1 + i * 3],
            color=colors[1 + i * 3],
            label="Tank {}".format(str(2 + i * 3)),
        )
        ax3.plot(
            d[1:-1, 2 + i * 3],
            color=colors[2 + i * 3],
            label="Tank {}".format(str(3 + i * 3)),
        )
        ax3.set_ylabel("Disturbance")
        ax3.legend(loc="upper right")

        plt.tight_layout()
        plt.xlabel("Time")
        plt.show()
Example #6
def train_network(online_net,
                  target_net,
                  training_matches,
                  validation_matches,
                  train_epochs,
                  batch_size,
                  buffer_size,
                  dampen_states=False,
                  load_model=False,
                  verbose=False):
    """
    Args:
        online_net (qNetwork): "live" Q-network to be trained.
        target_net (qNetwork): target Q-network used to generate target values for the online network
        training_matches (list(match)): list of matches to be trained on
        validation_matches (list(match)): list of matches to validate model against
        train_epochs (int): number of times to learn on given data
        batch_size (int): size of each training set sampled from the replay buffer which will be used to update Qnet at a time
        buffer_size (int): size of replay buffer used
        dampen_states (bool): flag for running dampening routine on model
        load_model (bool): flag to reload existing model
        verbose (bool): flag for enhanced output
    Returns:
        (loss_over_epochs, training_accuracy) tuple
    Trains the Q-network Qnet in batches using experience replays.
    """
    num_episodes = len(training_matches)
    if (verbose):
        print("***")
        print("Beginning training..")
        print("  train_epochs: {}".format(train_epochs))
        print("  num_episodes: {}".format(num_episodes))
        print("  batch_size: {}".format(batch_size))
        print("  buffer_size: {}".format(buffer_size))
        if (dampen_states):
            print("  ********************************")
            print("  WARNING: BEGINNING DAMPENING CYCLES")
            print(
                "  THIS SHOULD ONLY BE USED TO REDUCE VALUATION FOR OLDER METAS"
            )
            print("  ********************************")
            time.sleep(2.)
    # Hyperparameter used in updating target network
    # Some notable values:
    #  tau = 1.e-3 -> used in original paper
    #  tau = 0.5 -> average DDQN
    #  tau = 1.0 -> copy online -> target
    tau = 1.
    target_update_frequency = 10000  # How often to update target network. Should only be used with tau = 1.
    stash_model = True  # Flag for stashing a copy of the model
    model_stash_interval = 10  # Stashes a copy of the model this often
    # Number of steps to take before training. Allows buffer to partially fill.
    # Must be at least batch_size to avoid error when sampling from experience replay
    pre_training_steps = 10 * batch_size
    assert (pre_training_steps <=
            buffer_size), "Replay not large enough for pre-training!"
    assert (pre_training_steps >=
            batch_size), "Buffer not allowed to fill enough before sampling!"
    # Number of steps to force learner to observe submitted actions, rather than submit its own actions
    observations = 2000
    epsilon = 0.5  # Initial probability of letting the learner submit its own action
    eps_decay_rate = 1. / (25 * 20 * len(training_matches))  # Rate at which epsilon decays per submission
    # Number of steps to take between training
    update_freq = 1  # There are 10 submissions per match per side
    overwrite_initial_lr = 2.0e-5  # Overwrite default lr for network
    lr_decay_freq = 5  # Decay learning rate after a set number of epochs
    min_learning_rate = 1.e-8  # Minimum learning rate allowed to decay to

    teams = [DraftState.BLUE_TEAM, DraftState.RED_TEAM]
    # We can't validate a winner for submissions generated by the learner,
    # so we will use a winner-less match when getting rewards for such states
    blank_match = {"winner": None}
    loss_over_epochs = []
    total_steps = 0
    # Start training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if load_model:
            # Open saved model
            path_to_model = "tmp/model_E{}.ckpt".format(25)
            #path_to_model = "model_predictions/play_ins_rd2/model_play_ins_rd2.ckpt"
            online_net.saver.restore(sess, path_to_model)
            print("\nCheckpoint loaded from {}".format(path_to_model))

            if (overwrite_initial_lr):
                online_net.learning_rate.assign(overwrite_initial_lr).eval()

        # Add target init and update operations to graph
        target_init = create_target_initialization_ops(target_net.name,
                                                       online_net.name)
        target_update = create_target_update_ops(target_net.name,
                                                 online_net.name, tau)
        # Initialize target network
        sess.run(target_init)

        # Get initial loss and accuracy estimates
        val_loss, val_acc = validate_model(sess, validation_matches,
                                           online_net, target_net)
        loss, train_acc = validate_model(sess, training_matches, online_net,
                                         target_net)
        print(" Initial loss {:.6f}, train {:.6f}, val {:.6f}".format(
            loss, train_acc, val_acc),
              flush=True)

        # Initialize experience replay buffer
        experience_replay = er.ExperienceBuffer(buffer_size)
        for i in range(train_epochs):
            t0 = time.time()
            if ((i > 0) and (i % lr_decay_freq == 0) and
                (online_net.learning_rate.eval() >= min_learning_rate)):
                # Decay learning rate according to decay schedule
                online_net.learning_rate.assign(
                    0.50 * online_net.learning_rate.eval()).eval()

            epoch_steps = 0

            bad_state_counts = {
                "wins": {
                    DraftState.BAN_AND_SUBMISSION: 0,
                    DraftState.DUPLICATE_SUBMISSION: 0,
                    DraftState.DUPLICATE_ROLE: 0,
                    DraftState.INVALID_SUBMISSION: 0,
                    DraftState.TOO_MANY_BANS: 0,
                    DraftState.TOO_MANY_PICKS: 0
                },
                "loss": {
                    DraftState.BAN_AND_SUBMISSION: 0,
                    DraftState.DUPLICATE_SUBMISSION: 0,
                    DraftState.DUPLICATE_ROLE: 0,
                    DraftState.INVALID_SUBMISSION: 0,
                    DraftState.TOO_MANY_BANS: 0,
                    DraftState.TOO_MANY_PICKS: 0
                }
            }
            learner_submitted_counts = 0
            null_action_count = 0

            # Shuffle match presentation order
            shuffled_matches = random.sample(training_matches,
                                             len(training_matches))

            # Run model through a self-training iteration, including exploration
            experiences = self_train(sess, epsilon, n_experiences=20)
            # If self-training produces illegal states, add those experiences to memory
            if experiences:
                print("adding {} self-trained experiences..".format(
                    len(experiences)))
                #                for exp in experiences:
                #                    _,_,r,_ = exp
                #                    print("reward (should be negative) = {}".format(r))
                experience_replay.store(experiences)
                learner_submitted_counts += len(experiences)

            for match in shuffled_matches:
                for team in teams:
                    # Process match into individual experiences
                    experiences = mp.process_match(match, team)
                    for experience in experiences:
                        # Some experiences include NULL submissions
                        # The learner isn't allowed to submit NULL picks so skip adding these
                        # to the buffer.
                        state, actual, _, _ = experience
                        (cid, pos) = actual
                        if cid is None:
                            null_action_count += 1
                            continue
                        # Store original experience
                        experience_replay.store([experience])
                        if (total_steps >= observations):
                            # Let the network predict the next action, if the action leads
                            # to an invalid state add a negatively reinforced experience to the replay buffer.
                            random_submission = False
                            if (random.random() < epsilon):
                                random_submission = True
                                # Explore state space by submitting random action and checking if that action is legal
                                pred_act = [
                                    random.randint(0, state.num_actions - 1)
                                ]
                            else:
                                # Let model make prediction
                                pred_Q = sess.run(
                                    online_net.outQ,
                                    feed_dict={
                                        online_net.input:
                                        [state.format_state()],
                                        online_net.secondary_input:
                                        [state.format_secondary_inputs()]
                                    })
                                sorted_actions = pred_Q[0, :].argsort()[::-1]
                                pred_act = sorted_actions[0:4]  # top four actions predicted by the model

                            top_action = pred_act[0]
                            for action in pred_act:
                                (cid, pos) = state.format_action(action)

                                pred_state = deepcopy(state)
                                pred_state.update(cid, pos)

                                state_code = pred_state.evaluate()
                                r = get_reward(pred_state, blank_match,
                                               (cid, pos), actual)
                                new_experience = (state, (cid, pos), r,
                                                  pred_state)
                                if (state_code in DraftState.invalid_states):
                                    # Prediction moves to illegal state, add negative experience
                                    if (team == match["winner"]):
                                        bad_state_counts["wins"][
                                            state_code] += 1
                                    else:
                                        bad_state_counts["loss"][
                                            state_code] += 1
                                    experience_replay.store([new_experience])
                                elif (not random_submission
                                      and (cid, pos) != actual
                                      and action == top_action):
                                    # Add memories for "best" legal submission if it was chosen by model and does not duplicate already submitted memory
                                    learner_submitted_counts += 1
                                    experience_replay.store([new_experience])

                        if (epsilon > 0.1):
                            # Reduce epsilon over time
                            epsilon -= eps_decay_rate
                        total_steps += 1
                        epoch_steps += 1

                        # Every update_freq steps we train the network using samples from the replay buffer
                        if ((total_steps >= pre_training_steps)
                                and (total_steps % update_freq == 0)):
                            training_batch = experience_replay.sample(
                                batch_size)

                            # Calculate target Q values for each example:
                            # For non-terminal states, targetQ is estimated according to
                            #   targetQ = r + gamma*Q'(s',max_a Q(s',a))
                            # where Q' denotes the target network.
                            # For terminating states the target is computed as
                            #   targetQ = r
                            updates = []
                            for exp in training_batch:
                                startState, _, reward, endingState = exp
                                if (dampen_states):
                                    # To dampen states (usually done after major patches or when the meta shifts)
                                    # we replace winning rewards with 0. (essentially a loss).
                                    reward = 0.
                                state_code = endingState.evaluate()
                                if (state_code == DraftState.DRAFT_COMPLETE
                                        or state_code
                                        in DraftState.invalid_states):
                                    # Action moves to terminal state
                                    updates.append(reward)
                                else:
                                    # Following double DQN paper (https://arxiv.org/abs/1509.06461).
                                    #  Action is chosen by online network, but the target network is used to evaluate this policy.
                                    # Each row in predicted_Q gives estimated Q(s',a) values for all possible actions for the input state s'.
                                    predicted_action = sess.run(
                                        online_net.prediction,
                                        feed_dict={
                                            online_net.input:
                                            [endingState.format_state()],
                                            online_net.secondary_input: [
                                                endingState.
                                                format_secondary_inputs()
                                            ]
                                        })[0]
                                    predicted_Q = sess.run(
                                        target_net.outQ,
                                        feed_dict={
                                            target_net.input:
                                            [endingState.format_state()],
                                            target_net.secondary_input: [
                                                endingState.
                                                format_secondary_inputs()
                                            ]
                                        })
                                    updates.append(
                                        reward + online_net.discount_factor *
                                        predicted_Q[0, predicted_action])

                            targetQ = np.array(updates)
                            targetQ.shape = (batch_size, )

                            # Update online net using target Q
                            # Experience replay stores action = (champion_id, position) pairs
                            # these need to be converted into the corresponding index of the input vector to the Qnet
                            actions = np.array([
                                exp[0].get_action(*exp[1])
                                for exp in training_batch
                            ])
                            _ = sess.run(
                                online_net.update,
                                feed_dict={
                                    online_net.input:
                                    np.stack([
                                        exp[0].format_state()
                                        for exp in training_batch
                                    ],
                                             axis=0),
                                    online_net.secondary_input:
                                    np.stack([
                                        exp[0].format_secondary_inputs()
                                        for exp in training_batch
                                    ],
                                             axis=0),
                                    online_net.actions:
                                    actions,
                                    online_net.target:
                                    targetQ,
                                    online_net.dropout_keep_prob:
                                    0.5
                                })
                            if (total_steps % target_update_frequency == 0):
                                # After the online network has been updated, update target network
                                _ = sess.run(target_update)

            t1 = time.time() - t0
            val_loss, val_acc = validate_model(sess, validation_matches,
                                               online_net, target_net)
            loss, train_acc = validate_model(sess, training_matches,
                                             online_net, target_net)
            loss_over_epochs.append(loss)
            # Save the updated network at the end of each epoch
            out_path = online_net.saver.save(
                sess, "tmp/model_E{}.ckpt".format(train_epochs))
            if (verbose):
                print(
                    " Finished epoch {}/{}: dt {:.2f}, mem {}, loss {:.6f}, train {:.6f}, val {:.6f}"
                    .format(i + 1, train_epochs, t1, epoch_steps, loss,
                            train_acc, val_acc),
                    flush=True)
                print("  alpha:{:.4e}".format(online_net.learning_rate.eval()))
                invalid_action_count = sum([
                    bad_state_counts["wins"][k] + bad_state_counts["loss"][k]
                    for k in bad_state_counts["wins"]
                ])
                print("  negative memories added = {}".format(
                    invalid_action_count))
                print("  bad state distributions:")
                print("   from wins: {:9} from losses:".format(""))
                for code in bad_state_counts["wins"]:
                    print("   {:3} -> {:3} counts {:2} {:3} -> {:3} counts".
                          format(code, bad_state_counts["wins"][code], "",
                                 code, bad_state_counts["loss"][code]))
                print("  learner submissions: {}".format(
                    learner_submitted_counts))
                print("  model is saved in file: {}".format(out_path))
                print("***", flush=True)
            if (stash_model):
                if (i > 0 and (i + 1) % model_stash_interval == 0):
                    # Stash a copy of the current model
                    out_path = online_net.saver.save(
                        sess, "tmp/models/model_E{}.ckpt".format(i + 1))
                    print("Stashed a copy of the current model in {}".format(
                        out_path))

    stats = (loss_over_epochs, train_acc)
    return stats
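create_target_update_ops is referenced but not shown. Here is a common TensorFlow 1.x sketch of the soft update it presumably builds, assuming both networks keep their trainable variables under scopes named online_net.name and target_net.name (an assumption about this codebase):

import tensorflow as tf

def sketch_target_update_ops(target_scope, online_scope, tau):
    # Soft update: target <- tau * online + (1 - tau) * target.
    # With tau = 1.0 this reduces to copying the online weights into the target network.
    online_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=online_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    ops = [t_var.assign(tau * o_var + (1.0 - tau) * t_var)
           for o_var, t_var in zip(sorted(online_vars, key=lambda v: v.name),
                                   sorted(target_vars, key=lambda v: v.name))]
    return tf.group(*ops)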
Example #7
    def train_epoch(self):
        """
        Training loop for a single epoch
        """
        # We can't validate a winner for submissions generated by the learner,
        # so we will use a winner-less match when getting rewards for such states
        blank_match = {"winner":None}

        learner_submitted_actions = 0
        null_actions = 0

        # Shuffle match presentation order
        shuffled_matches = random.sample(self.training_data, len(self.training_data))
        for match in shuffled_matches:
            for team in self.teams:
                # Process match into individual experiences
                experiences = mp.process_match(match, team)
                for pick_id, experience in enumerate(experiences):
                    # Some experiences include NULL submissions (usually missing bans)
                    # The learner isn't allowed to submit NULL picks so skip adding these
                    # to the buffer.
                    state,actual,_,_ = experience
                    (cid,pos) = actual
                    if cid is None:
                        null_actions += 1
                        continue
                    # Store original experience
                    self.replay.store([experience])
                    self.step_count += 1

                    # Give model feedback on current estimations
                    if(self.step_count > self.observations):
                        # Let the network predict the next action
                        feed_dict = {self.ddq_net.online_ops["input"]:[state.format_state()],
                                     self.ddq_net.online_ops["valid_actions"]:[state.get_valid_actions()]}
                        q_vals = self.ddq_net.sess.run(self.ddq_net.online_ops["valid_outQ"], feed_dict=feed_dict)
                        sorted_actions = q_vals[0,:].argsort()[::-1]
                        top_actions = sorted_actions[0:4]

                        if(random.random() < self.epsilon):
                            pred_act = random.sample(list(top_actions), 1)
                        else:
                            # Use model's top prediction
                            pred_act = [sorted_actions[0]]

                        for action in pred_act:
                            (cid,pos) = state.format_action(action)
                            if((cid,pos)!=actual):
                                pred_state = deepcopy(state)
                                pred_state.update(cid,pos)
                                r = get_reward(pred_state, blank_match, (cid,pos), actual)
                                new_experience = (state, (cid,pos), r, pred_state)

                                self.replay.store([new_experience])
                                learner_submitted_actions += 1

                    if(self.epsilon > 0.1):
                        # Reduce epsilon over time
                        self.epsilon -= self.eps_decay_rate

                    # Use minibatch sample to update online network
                    if(self.step_count > self.pre_training_steps):
                        self.train_step()

                    if(self.step_count % self.target_update_frequency == 0):
                        # After the online network has been updated, update target network
                        _ = self.ddq_net.sess.run(self.ddq_net.target_ops["target_update"])

        # Get training loss, training_acc, and val_acc to return
        loss, train_acc = self.validate_model(self.training_data)
        _, val_acc = self.validate_model(self.validation_data)
        return (loss, train_acc, val_acc)
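train_step is not shown here; judging by example #6, the minibatch update it performs regresses the online network toward double-DQN targets. The helper below merely summarizes that target computation and is illustrative, not part of the original code:

import numpy as np

def sketch_double_dqn_target(reward, online_q_next, target_q_next, gamma):
    # Double-DQN target (https://arxiv.org/abs/1509.06461):
    #   target = r                                                   for terminal next states
    #   target = r + gamma * Q_target(s', argmax_a Q_online(s', a))  otherwise
    # online_q_next and target_q_next are 1-D arrays of Q-values over all actions for s'.
    best_action = int(np.argmax(online_q_next))
    return reward + gamma * target_q_next[best_action]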
Example #8
def self_train(sess, explore_prob, n_experiences=1):
    """
    Runs model currently held in TF Session sess through one self training loop. Returns
    negative memory if model fails to complete draft.
    Args:
        sess (tf.Session()): TF Session used to run model.
        explore_prob (float): Probability that each pick will explore state space by submitting a random action
        n_experiences (int): Number of experiences desired.
    Returns:
        experiences [(s,a,r,s')]: list of experience tuples from illegal submissions made by either side of the draft.
            Returns an empty list if the network repeatedly completes the draft without illegal actions.
    """
    MAX_DRAFT_ITERATIONS = 100  # Maximum number of drafts to iterate through
    assert n_experiences > 0, "Number of experiences must be positive"
    valid_champ_ids = cinfo.get_champion_ids()
    match = {"winner": None}  # Blank match for rewards processing
    # Two states are maintained: one corresponding to the perception of the draft
    # according to each of the teams.
    blue_state = DraftState(DraftState.BLUE_TEAM, valid_champ_ids)
    red_state = DraftState(DraftState.RED_TEAM, valid_champ_ids)
    # Draft dictionary holds states for each perspective
    draft = {0: blue_state, 1: red_state}

    online_pred = tf.get_default_graph().get_tensor_by_name(
        "online/prediction:0")
    online_input = tf.get_default_graph().get_tensor_by_name("online/inputs:0")
    online_secondary_input = tf.get_default_graph().get_tensor_by_name(
        "online/secondary_inputs:0")

    experiences = []
    successful_draft_count = 0
    while (len(experiences) < n_experiences):
        if (successful_draft_count > MAX_DRAFT_ITERATIONS):
            break
        blue_state.reset()
        red_state.reset()
        submission_count = 0
        while (blue_state.evaluate() != DraftState.DRAFT_COMPLETE
               and red_state.evaluate() != DraftState.DRAFT_COMPLETE):
            active_team = get_active_team(submission_count)
            inactive_team = 0 if active_team else 1

            state = draft[active_team]
            start = deepcopy(state)

            if (random.random() < explore_prob):
                # Explore state space by submitting random action
                pred_act = [random.randint(0, state.num_actions - 1)]
            else:
                pred_act = sess.run(online_pred,
                                    feed_dict={
                                        online_input: [state.format_state()],
                                        online_secondary_input:
                                        [state.format_secondary_inputs()]
                                    })
            action = state.format_action(pred_act[0])
            if (state.is_submission_legal(*action)):
                # Update active state
                state.update(*action)
                # Update inactive state, remembering to mask non-bans submitted by opponent
                (cid, pos) = action
                inactive_pos = pos if pos == -1 else 0
                draft[inactive_team].update(cid, inactive_pos)
                submission_count += 1
            else:
                bad_state = deepcopy(state)
                bad_state.update(*action)
                experiences.append(
                    (start, action, get_reward(bad_state, match, action,
                                               None), bad_state))
                break
        successful_draft_count += 1
    return experiences
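Example #6 shows how the result of self_train is consumed; a condensed usage sketch, assuming an active tf.Session with the "online" graph already built and an experience replay buffer as in that example:

negative_memories = self_train(sess, explore_prob=0.5, n_experiences=20)
if negative_memories:
    experience_replay.store(negative_memories)  # negatively reinforced illegal submissions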