Example #1
def running():

	game_state = game.Main()
	t = 0
	episode = 0
	total_reward = 0
	reward_array = []
	max_q_array = []
	time_line_q = []
	time_line_r = []

	# get the first state by doing nothing
	do_nothing = np.zeros(ACTIONS)
	do_nothing[1] = 1
	x_t, r_0, terminal, ball_x, bat_mid = game_state.frame_step(do_nothing)

	while t <= RUNNING:
		a_t = np.zeros([ACTIONS])
		action_index = 1
		if t % FRAME_PER_ACTION == 0:
			# with probability PROBABILITY, take the scripted "human" action
			if random.random() <= PROBABILITY:
				print("----------Human Action----------")
				if ball_x < bat_mid:
					action_index = 0  # move to left
				elif ball_x > bat_mid:
					action_index = 2  # move to right
				else:
					action_index = 1  # do nothing
				a_t[action_index] = 1
			else:
				print("----------Random Action----------")
				action_index = random.randrange(ACTIONS)
				a_t[action_index] = 1
		else:
			a_t[1] = 1  # do nothing
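The snippet above depends on module-level imports and constants that are not shown in this excerpt. Below is a minimal sketch of what they might look like; ACTIONS = 3 follows from the three-element action vectors, while the other values are illustrative assumptions rather than the project's real settings.

import random

import numpy as np

# hypothetical values -- the original project defines these constants elsewhere
ACTIONS = 3           # one-hot action vector: [move left, do nothing, move right]
FRAME_PER_ACTION = 1  # pick a new action every N frames
PROBABILITY = 0.5     # chance of taking the scripted "human" action
RUNNING = 100000      # total number of frames to run

The game module providing game.Main() and frame_step() is the project's own pygame environment and is not reproduced here.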
Example #2
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), axis=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.Main()

    # store the previous observations in replay memory
    D = deque()

    # printing
    # a_file = open("logs_" + GAME + "/readout.txt", 'w')
    # h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[1] = 1
    x_t, r_0, terminal, ball_x, bat_mid = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("DQN_saved_networks")

    # if checkpoint and checkpoint.model_checkpoint_path:
    #     saver.restore(sess, checkpoint.model_checkpoint_path)
    #     print("Successfully loaded:", checkpoint.model_checkpoint_path)
    # else:
    #     print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    episode = 0
    total_reward = 0
    reward_array = []
    max_q_array = []
    time_line_q = []
    time_line_r = []

    while t <= OBSERVE + EXPLORE + TRAINING:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 1
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[1] = 1  # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal, ball_x, bat_mid = game_state.frame_step(
            a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                            cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        #s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)
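        # the newest frame becomes channel 0; the oldest of the four stacked frames is dropped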
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            # accumulate the reward and log it at the end of each episode
            if not terminal:
                total_reward += r_t
            else:
                episode += 1
                time_line_r.append(episode)
                reward_array.append(total_reward)
                total_reward = 0

            # record the max Q value every 1000 frames
            if t % 1000 == 0:
                max_q_value = np.max(readout_t)
                max_q_array.append(max_q_value)
                time_line_q.append(t // 1000)

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                ith_terminal = minibatch[i][4]
                # if terminal, the target is just the reward
                if ith_terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess,
                       'DQN_saved_networks/' + GAME + '-dqn',
                       global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
        '''
        if t % 10000 <= 100:
            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s:[s_t]})[0]]) + '\n')
            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
        '''
        # write the logging lists to files (overwritten every step)
        with open('.\\results\\lists\\5.5start\\time_line_r.txt', 'w') as f:
            for value in time_line_r:
                f.write(str(value) + '\n')

        with open('.\\results\\lists\\5.5start\\time_line_q.txt', 'w') as f:
            for value in time_line_q:
                f.write(str(value) + '\n')

        with open('.\\results\\lists\\5.5start\\reward_array.txt', 'w') as f:
            for value in reward_array:
                f.write(str(value) + '\n')

        with open('.\\results\\lists\\5.5start\\max_q_array.txt', 'w') as f:
            for value in max_q_array:
                f.write(str(value) + '\n')

    # plot result
    plt.figure()
    plt.xlabel("step")
    plt.ylabel("max Q value")
    plt.title("DQN max Q value change")
    plt.plot(time_line_q, max_q_array)
    plt.savefig('./DQN_max_Q_value.png')

    plt.figure()
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.title("DQN reward per episode change")
    plt.plot(time_line_r, reward_array)
    plt.savefig('./DQN_reward.png')
    plt.show()
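Both Example #2 and Example #3 build their training targets the same way: for a terminal transition the target is just the reward, otherwise it is the reward plus the discounted maximum Q-value of the next state. Below is a small standalone NumPy sketch of that rule; the q_targets helper and the GAMMA value are illustrative, not part of the original code.

import numpy as np

GAMMA = 0.99  # discount factor; the original defines GAMMA as a module constant

def q_targets(r_batch, q_next_batch, terminal_batch):
    # Bellman targets for a DQN minibatch:
    #   y_i = r_i                         if the transition was terminal
    #   y_i = r_i + GAMMA * max_a Q(s', a) otherwise
    targets = []
    for r, q_next, done in zip(r_batch, q_next_batch, terminal_batch):
        if done:
            targets.append(float(r))
        else:
            targets.append(float(r + GAMMA * np.max(q_next)))
    return targets

# two transitions with three actions each
print(q_targets([1.0, -1.0],
                np.array([[0.1, 0.5, 0.2], [0.0, 0.0, 0.0]]),
                [False, True]))  # -> [1.495, -1.0]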
Example #3
def trainNetwork(s, readout, W_fc1, W_fc2, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS], name='action')
    y = tf.placeholder("float", [None], name='q_next')
    tf.summary.histogram('q_next', y)
    with tf.name_scope('q_eval'):
        readout_action = tf.reduce_sum(tf.multiply(readout, a), axis=1)
        tf.summary.histogram('fc2/output', readout_action)
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(tf.square(y - readout_action))
        tf.summary.scalar('loss', cost)
    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # track how much the fully-connected weights change between snapshots
    # (logged to TensorBoard as an L2 penalty on the difference)
    regularize_lambda = 1.0
    regularizer = tf.contrib.layers.l2_regularizer(
        regularize_lambda)  # with scale 1.0 this equals tf.nn.l2_loss

    with tf.name_scope('weight'):
        last_W_fc1 = tf.Variable(tf.constant(0.0, shape=W_fc1.get_shape()))
        diff_W_fc1 = tf.contrib.layers.apply_regularization(
            regularizer, [W_fc1 - last_W_fc1])
        tf.summary.scalar('diff_W_fc1', diff_W_fc1)
        last_W_fc1_update = tf.assign(last_W_fc1, W_fc1)

        last_W_fc2 = tf.Variable(tf.constant(0.0, shape=W_fc2.get_shape()))
        diff_W_fc2 = tf.contrib.layers.apply_regularization(
            regularizer, [W_fc2 - last_W_fc2])
        tf.summary.scalar('diff_W_fc2', diff_W_fc2)
        last_W_fc2_update = tf.assign(last_W_fc2, W_fc2)

    # tensorboard output
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(r"result/Exp10Graph/", sess.graph)

    # record the reward
    with tf.name_scope('reward_per_life'):
        reward = tf.Variable(0.0, name='reward')
        reward_sum = tf.summary.scalar('reward_per_life', reward)

    # record the reward every 10000 time steps
    with tf.name_scope('reward_per_10000_steps'):
        reward_step = tf.Variable(0.0, name='reward_step')
        reward_sum_step = tf.summary.scalar('reward_per_10000_steps',
                                            reward_step)

    # placeholder to record reward
    reward_count = tf.placeholder('float')
    zero = tf.Variable(0.0, name='zero')
    re_count = 0.0
    life_count = 1
    reward_update = tf.assign(reward, reward + reward_count)
    reward_fresh = tf.assign(reward, zero)

    # placeholder to record reward_step
    reward_count_step = tf.placeholder('float')
    re_count_step = 0.0
    reward_update_step = tf.assign(reward_step,
                                   reward_step + reward_count_step)
    reward_fresh_step = tf.assign(reward_step, zero)

    # record average max q value
    with tf.name_scope('50kper_average_qMax'):
        q_max = tf.Variable(0.0, name='q_max_average')
        q_max_sum = tf.summary.scalar('50kper_average_qMax', q_max)
    # placeholder to record q value
    q_max_count = tf.placeholder('float')
    q_count = 0.0
    batch_count = 0
    q_max_update = tf.assign(q_max, q_max_count)

    # open up a game state to communicate with emulator
    game_state = game.Main()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[1] = 1
    x_t_colored, r_0, terminal, ball_x, bat_mid = game_state.frame_step(
        do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # variable to save pygame frame
    # game_frame = x_t_colored

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state(r"result/Exp10_saved_networks")

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    omega = INITIAL_OMEGA
    t = 0
    episode_reward = 0
    while True:
        # choose an action epsilon greedily
        # Q values of the current stacked frames
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 1
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= omega and t > OBSERVE:
                print("----------Human Action----------")
                if ball_x < bat_mid:
                    action_index = 0  # move to left
                elif ball_x > bat_mid:
                    action_index = 2  # move to right
                else:
                    action_index = 1  # do nothing
                a_t[action_index] = 1
            elif random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[1] = 1  # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # scale down omega
        # if omega - epsilon > 0 and t > OBSERVE:
        #     omega -= (INITIAL_OMEGA - FINAL_OMEGA) / EXPLORE
        if t > OBSERVE:
            omega = INITIAL_OMEGA * (DECAY_RATE**(t / DECAY_STEPS))

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal, ball_x1, bat_mid1 = game_state.frame_step(
            a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                            cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # update pygame frame
        # game_frame = x_t1_colored

        episode_reward += r_t

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)  # experience replay.

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(
                feed_dict={s: s_j1_batch})  # Q values of the successor states

            # average max q value
            batch_count += 1
            q_count += np.max(readout_t)
            BATCH_N = 50000
            if batch_count % BATCH_N == 0:
                sess.run(q_max_update,
                         feed_dict={q_max_count: float(q_count / BATCH_N)})
                qm = sess.run(q_max_sum)
                writer.add_summary(qm, batch_count)
                q_count = 0.0

            # total reward
            re_count += r_t
            re_count_step += r_t

            if terminal:
                sess.run(reward_update,
                         feed_dict={reward_count: float(re_count)})
                re = sess.run(reward_sum)
                writer.add_summary(re, life_count)
                sess.run(reward_fresh)
                re_count = 0
                life_count += 1

            if t % 10000 == 0:
                sess.run(reward_update_step,
                         feed_dict={reward_count_step: float(re_count_step)})
                re_step = sess.run(reward_sum_step)
                writer.add_summary(re_step, t)
                sess.run(reward_fresh_step)
                re_count_step = 0

            for i in range(0, len(minibatch)):
                ith_terminal = minibatch[i][4]
                # if terminal, the target is just the reward
                if ith_terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(
                feed_dict={  # feed back to update network.
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch
                })

            # update tensorboard data per 1000 steps
            if t % 1000 == 0:
                result = sess.run(merged,
                                  feed_dict={
                                      y: y_batch,
                                      a: a_batch,
                                      s: s_j_batch
                                  })
                writer.add_summary(result, t)

            # record network weight
            if t % 1000 == 0:
                sess.run([last_W_fc1_update, last_W_fc2_update])

        # update the old values
        s_t = s_t1
        t += 1
        ball_x = ball_x1
        bat_mid = bat_mid1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess,
                       'result/Exp10_saved_networks/' + GAME + '-dqn',
                       global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t, "/ EPISODE_REWARD",
              episode_reward, "/ Q_MAX %e" % np.max(readout_t), "/ TERMINAL",
              terminal)
        if terminal:
            episode_reward = 0
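Example #3 forces the scripted "human" action with probability omega, which is held at INITIAL_OMEGA during the observation phase and then decays exponentially as omega = INITIAL_OMEGA * DECAY_RATE ** (t / DECAY_STEPS). Below is a standalone sketch of that schedule with illustrative constants; the real values are defined elsewhere in the project.

# illustrative constants -- INITIAL_OMEGA, DECAY_RATE, DECAY_STEPS and OBSERVE
# are defined elsewhere in the original project
INITIAL_OMEGA = 0.5
DECAY_RATE = 0.9
DECAY_STEPS = 10000
OBSERVE = 10000

def omega_at(t):
    # probability of taking the scripted "human" action at timestep t
    if t <= OBSERVE:
        return INITIAL_OMEGA  # held constant while observing
    return INITIAL_OMEGA * DECAY_RATE ** (t / DECAY_STEPS)

for t in (0, 10000, 50000, 200000):
    print(t, round(omega_at(t), 4))
# omega_at: 0 -> 0.5, 10000 -> 0.5, 50000 -> 0.2952, 200000 -> 0.0608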