Example #1
def trainNetwork(myAgent, sess):

    # Open up a game state to communicate with emulator.
    game_state = game.GameState()

    # Initialize the state of the game.
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # Initialize the episode history.
    ep_history = []

    # Initialize a saver.
    saver = tf.train.Saver()

    # Initialize all variables.
    sess.run(tf.initialize_all_variables())

    # Restore the checkpoints.
    checkpoint = tf.train.get_checkpoint_state(
        "saved_networks_policy_gradient")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # Initialize the grad_buffer.
    gradBuffer = sess.run(tf.trainable_variables())
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    # Initialize the epsilon value for the exploration phase.
    epsilon = INITIAL_EPSILON

    # Initialize the iteration counter.
    t = 0
    score = []

    # For all episodes.
    while True:

        # Choose an action epsilon-greedily.
        readout_t = myAgent.readout.eval(
            feed_dict={myAgent.state_in: [s_t]})[0]
        action_index = get_action_index(readout_t, epsilon, t)
        a_t = np.zeros([ACTIONS])
        a_t[action_index] = 1

        # Scale down epsilon during the exploitation phase.
        epsilon = scale_down_epsilon(epsilon, t)

        for i in range(0, K):
            # Run the selected action and observe next state and reward.
            s_t1, r_t, terminal = run_selected_action(a_t, s_t, game_state)

            # Store the transition in the episode history.
            ep_history.append([s_t, a_t, r_t, s_t1])
            if terminal:
                score.append(game_state.bar1_pre - game_state.bar2_pre)
                break

        # If the episode is over
        if (terminal):

            s_j = [d[0] for d in ep_history]
            a_j = [d[1] for d in ep_history]
            r_j = [d[2] for d in ep_history]
            s_j1 = [d[3] for d in ep_history]

            # Compute the discounted reward
            r_j = discount_rewards(r_j)

            s_j = np.reshape(np.vstack(s_j), [-1, 80, 80, 4])

            feed_dict = {
                myAgent.reward_holder: r_j,
                myAgent.action_holder: a_j,
                myAgent.state_in: s_j
            }

            grads = sess.run(myAgent.gradients, feed_dict=feed_dict)

            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad

            feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
            _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)

            # Clean the grad buffer
            for ix, grad in enumerate(gradBuffer):
                gradBuffer[ix] = grad * 0

            ep_history = []

        # Update the state.
        s_t = s_t1

        # Update the number of iterations.
        t += 1

        # Save a checkpoint every 10000 iterations.
        if t % 10000 == 0:
            saver.save(sess,
                       'saved_networks_policy_gradient/' + GAME + '-dqn',
                       global_step=t)
        if t % 2000000 == 0:
            plt.plot(np.arange(len(score)), score)
            plt.ylabel('Total reward')
            plt.xlabel('episode')
            plt.savefig("policy_gradient.png")
            plt.show()

        # Print info.
        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", action_index,
              "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))
Example #2
def trainNetwork(s, readout, sess):
    """ Train the artificial agent using Q-learning to play the pong game.
	Args:
		s: the current state formed by 4 frames of the playground.
		readout: the Q value for each passible action in the current state.
		sess: session
	"""

    # Placeholder for the action.
    a = tf.placeholder("float", [None, ACTIONS])

    # Placeholder for the target Q value.
    y = tf.placeholder("float", [None])

    # Compute the loss.
    cost = compute_cost(y, a, readout)

    # Training operation.
    train_step = tf.train.AdamOptimizer(Lr).minimize(cost)

    # Open up a game state to communicate with emulator.
    game_state = game.GameState()

    # Initialize the replay memory.
    D = deque()

    # Initialize the action vector.
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1

    # Initialize the state of the game.
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # Save and load model checkpoints.
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks_q_learning")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # Initialize the epsilon value for the exploration phase.
    epsilon = INITIAL_EPSILON

    # Initialize the iteration counter.
    t = 0
    score = []
    max_score = 0

    while True:

        # Choose an action epsilon-greedily.
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]

        action_index = get_action_index(readout_t, epsilon, t)

        a_t = np.zeros([ACTIONS])

        a_t[action_index] = 1

        # Scale down epsilon during the exploitation phase.
        epsilon = scale_down_epsilon(epsilon, t)

        # Run the selected action and update the replay memory.
        for i in range(0, K):
            # Run the selected action and observe next state and reward.
            s_t1, r_t, terminal = run_selected_action(a_t, s_t, game_state)
            if (terminal):
                score.append(game_state.bar1_pre - game_state.bar2_pre)

            # Store the transition in the replay memory D.
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # Start training once the observation phase is over.
        if (t > OBSERVE):

            # Sample a minibatch to train on.
            minibatch = random.sample(D, BATCH)

            # Get the batch variables.
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            # Compute the target Q-Value
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})

            target_q_batch = compute_target_q(s_j1_batch, r_batch,
                                              readout_j1_batch, minibatch)

            # Perform gradient step.
            train_step.run(feed_dict={
                y: target_q_batch,
                a: a_batch,
                s: s_j_batch
            })

        # Update the state.
        s_t = s_t1

        # Update the number of iterations.
        t += 1

        # Save a checkpoint every 10000 iterations.
        if t % 10000 == 0:
            saver.save(sess,
                       'saved_networks_q_learning/' + GAME + '-dqn',
                       global_step=t)

        # Print info.
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        if t % 1000000 == 0:
            plt.plot(np.arange(len(score)), score)
            plt.ylabel('Total reward')
            plt.xlabel('episode')
            plt.savefig("q_learning.png")
            plt.show()

        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
Example #3
def trainNetwork(s, readout, sess):
    """ Train the artificial agent using Q-learning to play the pong game.
    Args:
        s: the current state formed by 4 frames of the playground.
        readout: the Q value for each possible action in the current state.
        sess: session
    """

    # Placeholder for the action.
    a = tf.placeholder("float", [None, ACTIONS])

    # Placeholder for the target Q value.
    y = tf.placeholder("float", [None])

    # Compute the loss.
    cost = compute_cost(y, a, readout)

    # Training operation.
    train_step = tf.train.AdamOptimizer(Lr).minimize(cost)

    # Open up a game state to communicate with emulator.
    game_state = game.GameState()

    # Initialize the replay memory.
    D = deque()

    # Initialize the action vector.
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1

    # Initialize the state of the game.
    s_t = np.array([0.5, 0.5, 0.03, 0.01, 0.5 - paddle_height / 2])

    # Save and load model checkpoints.
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks_q_learning")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # Initialize the epsilon value for the exploration phase.
    epsilon = INITIAL_EPSILON

    # Initialize the iteration counter.
    t = 0

    while True:
        # Choose an action epsilon-greedily.
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]

        action_index = get_action_index(readout_t, epsilon, t)

        a_t = np.zeros([ACTIONS])

        a_t[action_index] = 1

        # Scale down epsilon during the exploitation phase.
        epsilon = scale_down_epsilon(epsilon, t)

        # Run the selected action and update the replay memory.
        for i in range(0, K):
            # Run the selected action and observe next state and reward.
            s_t1, r_t, terminal = run_selected_action(a_t, s_t, game_state)

            # Store the transition in the replay memory D.
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # Start training once the observation phase is over.
        if (t > OBSERVE):

            # Sample a minibatch to train on.
            minibatch = random.sample(D, BATCH)

            # Get the batch variables.
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]
            terminal_batch = [d[4] for d in minibatch]

            # Compute the target Q-Value
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            target_q_batch = compute_target_q(r_batch, readout_j1_batch,
                                              terminal_batch)

            # Perform gradient step.
            train_step.run(feed_dict={
                y: target_q_batch,
                a: a_batch,
                s: s_j_batch
            })

        # Update the state.
        s_t = s_t1

        # Update the number of iterations.
        t += 1

        # Save a checkpoint every 10000 iterations.
        if t % 10000 == 0:
            saver.save(sess,
                       'saved_networks_q_learning/' + GAME + '-dqn',
                       global_step=t)

        # Print info.
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))