Пример #1
0
def trainDeepModel(load=False):
    """Train the Q-network online, one gradient step per environment step.

    For every step the current Q-values are fetched from the network, an
    epsilon-greedy action is taken, and the Q-value of the chosen action is
    replaced by the Q-learning target (``reward`` on terminal steps, otherwise
    ``reward + gamma * max(Q(next_state))``) before a single optimizer step is
    run on the resulting vector.

    Every ``print_episode`` episodes (and on the final episode) the running
    averages are printed, the weights and biases are dumped to ``.npy`` files
    and a TensorBoard summary is written. A checkpoint is saved to
    ``MODEL_PATH_SAVE`` when training finishes.

    Args:
        load: Forwarded to ``createDeepModel`` as ``load_variables``.

    Returns:
        None.
    """
    # Used to see how long the model takes to initialise / train
    start_time = time.time()

    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = False

    # True - Load model from MODEL_PATH_LOAD; False - keep initialised weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01  # Learning rate
    gamma = 0.99  # Discount factor for future rewards
    epsilon = 0.1  # Probability to choose a random action

    # Linear epsilon decay: epsilon_start -> epsilon_end over the first
    # epsilon_percentage of all episodes
    epsilon_function = True
    epsilon_start = 0.5
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # in decimal

    # Linear alpha decay: alpha_start -> alpha_end over the first
    # alpha_percentage of all episodes
    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # in decimal

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=load)

    # Error / Loss function: mean squared error between the network output
    # and the target vector fed through y
    with tf.name_scope('Error'):
        error = tf.losses.mean_squared_error(labels=y, predictions=Q_values)

    tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes error/loss function.
    # The learning rate is a feedable tensor so that a per-episode alpha
    # schedule actually reaches the optimizer; a plain Python float would be
    # frozen into the graph at construction time.
    with tf.name_scope('Optimizer'):
        learning_rate = tf.placeholder_with_default(
            tf.constant(alpha, dtype=tf.float32),
            shape=[],
            name='learning_rate')
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate).minimize(error)

    # The next state's action-value tensor, reduced to its max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t: index of the max value in the action-value tensor
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    # Running sums over the current reporting window
    avg_time = 0
    avg_score = 0
    avg_error = 0
    window_episodes = 0

    print_episode = 1000
    total_episodes = 100000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    try:
        with tf.Session() as sess:

            # Initialise first, then restore: running the initializer after
            # a restore would overwrite the restored values.
            sess.run(init)

            if USE_SAVED_MODEL_FILE:
                saver.restore(sess, MODEL_PATH_LOAD)
                print("Model restored.")

            # Tensorboard graph
            writer.add_graph(sess.graph)

            print("\nProgram took {0:.4f} seconds to initialise\n".format(
                time.time() - start_time))
            start_time = time.time()

            for episode in range(total_episodes):
                env.reset()
                done = False

                if alpha_function:
                    alpha = _linear_schedule(alpha_start, alpha_end,
                                             alpha_percentage,
                                             total_episodes, episode)

                if epsilon_function:
                    epsilon = _linear_schedule(epsilon_start, epsilon_end,
                                               epsilon_percentage,
                                               total_episodes, episode)

                while not done:
                    if RENDER_TO_SCREEN:
                        env.render()

                    # Network input for the current state
                    state_vector = env.state_vector_3D()

                    # Retrieve the Q values from the NN in vector form
                    Q_vector = sess.run(Q_values,
                                        feed_dict={x: state_vector})

                    # Epsilon-greedy action selection
                    if np.random.rand() <= epsilon:
                        action = env.sample_action()
                    else:
                        action = sess.run(action_t,
                                          feed_dict={y: Q_vector})

                    # Update environment by performing the action
                    _, reward, done, info = env.step(action)

                    if done:
                        # Terminal step: no future reward to bootstrap from
                        Q_vector[:, action] = reward
                    else:
                        # Bootstrap from the best action-value of the state
                        # the environment is now in
                        new_state_vector = env.state_vector_3D()
                        y_prime = sess.run(Q_values,
                                           feed_dict={x: new_state_vector})
                        maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                        # RL Equation
                        Q_vector[:, action] = reward + (gamma * maxq)

                    _, e = sess.run([optimizer, error],
                                    feed_dict={
                                        x: state_vector,
                                        y: Q_vector,
                                        learning_rate: alpha
                                    })

                # Episode finished: accumulate its final time, score and the
                # error of its last training step
                avg_time += info["time"]
                avg_score += info["score"]
                avg_error += e
                window_episodes += 1

                if (episode % print_episode == 0
                        and episode != 0) or (episode == total_episodes - 1):
                    current_time = time.time() - start_time
                    # Divide by the number of episodes actually accumulated;
                    # the first and last windows are not print_episode long.
                    print(
                        "Ep:",
                        episode,
                        "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                        "\tavg score: {0:.3f}".format(
                            avg_score / window_episodes),
                        "\tErr {0:.3f}".format(avg_error / window_episodes),
                        "\tepsilon {0:.3f}".format(epsilon),
                        end="")
                    print("\ttime " + _format_elapsed(current_time))

                    avg_time = 0
                    avg_score = 0
                    avg_error = 0
                    window_episodes = 0

                    # Save the model's weights and biases to .npy files
                    # (can't save 4D array to text file)
                    _save_parameters(sess, weights, biases)

                    s = sess.run(merged_summary,
                                 feed_dict={
                                     x: state_vector,
                                     y: Q_vector
                                 })
                    writer.add_summary(s, episode)

            save_path = saver.save(sess, MODEL_PATH_SAVE)
            print("Model saved in path: %s" % save_path)
    finally:
        # Flush pending summaries to disk and release the event file
        writer.close()


def _linear_schedule(start, end, fraction, total_steps, step):
    """Decay linearly from start to end over fraction * total_steps steps, then hold at end."""
    span = fraction * total_steps
    if span <= 0:
        return end
    value = (-(start - end) / span) * step + start
    return max(value, end)


def _format_elapsed(seconds):
    """Format a duration in seconds as H:MM:SS using whole seconds."""
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return "{0:d}:{1:02d}:{2:02d}".format(hours, minutes, secs)


def _save_parameters(sess, weights, biases):
    """Fetch all weights and biases in one run and write each to its .npy path as float32."""
    targets = [
        (W_conv1_textfile_path_save, weights['W_conv1']),
        (W_conv2_textfile_path_save, weights['W_conv2']),
        (W_fc_textfile_path_save, weights['W_fc']),
        (W_out_textfile_path_save, weights['W_out']),
        (b_conv1_textfile_path_save, biases['b_conv1']),
        (b_conv2_textfile_path_save, biases['b_conv2']),
        (b_fc_textfile_path_save, biases['b_fc']),
        (b_out_textfile_path_save, biases['b_out']),
    ]
    values = sess.run([tensor for _, tensor in targets])
    for (path, _), value in zip(targets, values):
        np.save(path, np.asarray(value).astype(np.float32))
Пример #2
0
def runDeepModel():
    """Play a few rendered episodes with the Q-network, without training.

    The model is built with ``load_variables=True``, actions are chosen
    epsilon-greedily from the network's Q-values, and the final time and
    score of each episode are averaged over ``print_episode`` episodes and
    printed.

    Returns:
        None.
    """
    print("\n ---- Running the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - Load model from MODEL_PATH_LOAD; False - keep initialised weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    epsilon = 0.001  # Probability to choose random action instead of best action

    # Create NN model. No loss or optimizer is built here: this function only
    # evaluates the network, so no training ops are needed.
    with tf.name_scope('Model'):
        Q_values, _, _ = createDeepModel(x, load_variables=True)

    # Action at time t: index of the max value in the action-value tensor
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0

    print_episode = 1
    total_episodes = 10

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        # Initialise first, then restore: running the initializer after a
        # restore would overwrite the restored values.
        sess.run(init)

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        for episode in range(total_episodes):
            env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # Network input for the current state
                state_vector = env.state_vector_3D()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})

                # Epsilon-greedy action selection
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update environment by performing the action
                _, reward, done, info = env.step(action)

            # Episode finished: accumulate its final time and score
            avg_time += info["time"]
            avg_score += info["score"]

            # Report after every print_episode completed episodes so each
            # average covers exactly print_episode episodes (episode 0 is no
            # longer folded into the next window).
            if (episode + 1) % print_episode == 0:
                print("Ep:", episode, "\tavg t:", avg_time / print_episode,
                      "\tavg score:", avg_score / print_episode)
                avg_time = 0
                avg_score = 0
Пример #3
0
					  tail = TAIL, 
					  food_count = FOOD_COUNT,
					  obstacle_count = OBSTACLE_COUNT,
					  multiplier_count = 0,
					  action_space = 5,
					  map_path = MAP_PATH)

	brain = Agent(gamma = 0.99, epsilon = start_eps, alpha = 0.001, maxMemorySize = 5000, replace = None)

	# env.play()

	if RENDER: env.prerender()

	while brain.memCntr < brain.memSize:
		obs, _ = env.reset()
		observation = env.state_vector_3D()
		done = False
		while not done:
			action = env.sample_action()
			observation_, reward, done, info = env.step(action)
			observation_ = env.state_vector_3D()
			if done:
				reward = -1
			brain.storeTransition(observation, action, reward, observation_)

			observation = observation_
	print("Done initialising memory")

	scores = []
	epsHistory = []
	numGames = 100000