Example #1
					cumulative_reward = 0

			if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1):
				
				current_time = math.floor(time.time()-start_time)
				print("Ep:", episode,
					"\tavg t: {0:.3f}".format(avg_time/print_episode),
					"\tavg score: {0:.3f}".format(avg_score/print_episode),
					"\terr {0:.3f}".format(avg_error/print_episode),
					"\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward
					"\tepsilon {0:.3f}".format(brain.EPSILON),
					end="")
				print_readable_time(current_time)

				# Save the model's weights and biases to .npz file
				model.save(sess, name=MODEL_NAME_save)
				# save_path = saver.save(sess, MODEL_PATH_SAVE)

				s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector, score:avg_score/print_episode, avg_t:avg_time/print_episode, epsilon:brain.EPSILON, avg_r:avg_reward/print_episode})
				writer.add_summary(s, episode)

				avg_time = 0
				avg_score = 0
				avg_error = 0
				avg_reward = 0

		model.save(sess, verbose=True, name=MODEL_NAME_save)

		# save_path = saver.save(sess, MODEL_PATH_SAVE)
		# print("Model saved in path: %s" % save_path)
Example #2
def train():

    MODEL_NAME = "diamond9_input5"
    MODEL_NAME_save = "diamond9_input5"

    FOLDER = "Best_Dojos9"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME_save + "/" + MODEL_NAME_save + ".ckpt"

    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save + "_2"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 9
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = False

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=10,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/" + FOLDER + "/")

    brain = Brain(epsilon=0.1, action_space=env.number_of_actions())

    model.setup(brain)

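    # Placeholders for the per-interval training metrics logged to TensorBoard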
    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    cumulative_reward = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 10000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # TensorBoard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Limit this process to roughly 10% of the available GPU memory:
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        # for episode in range(50):
        # 	state, info = env.reset()
        # 	done = False

        # 	if RENDER_TO_SCREEN:
        # 		env.render()

        # 	while not done:
        # 		action = brain.choose_action(state, sess, model)

        # 		new_state, reward, done, info = env.step(action)

        # 		brain.store_transition(state, action, reward, done, new_state)

        # 		state = new_state

        # 		if RENDER_TO_SCREEN:
        # 			env.render()

        # print("\nREPLAY MEMORY INITIALISED")
        # print(brain.memCntr)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            # brain.linear_epsilon_decay(total_episodes, episode, start=0.4, end=0.05, percentage=0.8)

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                action = brain.choose_action(state, sess, model)
                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                # e, Q_vector = brain.train_batch(4, model, sess)

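                # One training step; returns the error and the Q-vector later fed to the summary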
                e, Q_vector = brain.train(model, sess)

                state = new_state

                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

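                # On episode end, accumulate stats for the periodic report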
                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e
                    avg_reward += cumulative_reward
                    cumulative_reward = 0

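            # Every `print_episode` episodes: report averages, checkpoint the model and write summaries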
            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print(
                    "Ep:",
                    episode,
                    "\tavg t: {0:.3f}".format(avg_time / print_episode),
                    "\tavg score: {0:.3f}".format(avg_score / print_episode),
                    "\tErr {0:.3f}".format(avg_error / print_episode),
                    "\tavg_reward {0:.3f}".format(
                        avg_reward / print_episode),  # avg cumulative reward
                    "\tepsilon {0:.3f}".format(brain.EPSILON),
                    end="")
                print_readable_time(current_time)

                # Save the model's weights and biases to .npz file
                model.save(sess, name=MODEL_NAME_save)
                # save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={
                                 model.input: state,
                                 model.actions: Q_vector,
                                 score: avg_score / print_episode,
                                 avg_t: avg_time / print_episode,
                                 epsilon: brain.EPSILON,
                                 avg_r: avg_reward / print_episode
                             })
                writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0

        model.save(sess, verbose=True, name=MODEL_NAME_save)

        # save_path = saver.save(sess, MODEL_PATH_SAVE)
        # print("Model saved in path: %s" % save_path)

        writer.close()
Example #3
def main(env, args):
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        model = load_vae()
        best_params = np.load('best_params.npy', allow_pickle=True)
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # env.render()
                # TODO: Choose an action
                action = decide_action(model, state, best_params)
                state, reward, done, _ = env.step(action)

    elif args.DQN:
        network = Network(env, args)
        if os.path.exists('dqn.model'):
            network.model = tf.keras.models.load_model('dqn.model')
        vae = load_vae()
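        # Replay buffer of transitions, with states stored as VAE latent vectors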
        replay_buffer = collections.deque(maxlen=100000)
        Transition = collections.namedtuple(
            "Transition", ["state", "action", "reward", "done", "next_state"])

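        # Exploration rate and discount factor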
        epsilon = 0.25
        gamma = 1

        for i in tqdm(range(10000)):
            state, done = env.reset(), False
            while not done:
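                # Encode the raw observation into the VAE latent space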
                embedding = vae.get_latent_representation(np.array([state]))

                q_values = network.predict(embedding)[0]

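                # Epsilon-greedy action selection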
                if np.random.uniform() >= epsilon:
                    action = np.argmax(q_values)
                else:
                    action = np.random.randint(0, env.action_space.n)

                next_state, reward, done, _ = env.step(action)

                replay_buffer.append(
                    Transition(
                        embedding, action, reward, done,
                        vae.get_latent_representation(np.array([next_state]))))

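                # Once enough transitions are collected, train on a random minibatch of 32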
                if len(replay_buffer) > 32:
                    minibatch = random.sample(replay_buffer, 32)

                    states = np.array([t.state[0] for t in minibatch])
                    actions = np.array([t.action for t in minibatch])
                    rewards = np.array([t.reward for t in minibatch])
                    dones = np.array([t.done for t in minibatch])
                    next_states = np.array(
                        [t.next_state[0] for t in minibatch])

                    q_values = np.array(network.predict(states))
                    q_values_next = network.predict(next_states)

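                    # Q-learning target: reward + gamma * max_a' Q(s', a'); no bootstrapping on terminal transitions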
                    for Q, action, reward, next_Q, is_done in zip(
                            q_values, actions, rewards, q_values_next, dones):
                        Q[action] = reward + (0 if is_done else gamma *
                                              np.max(next_Q))

                    network.train(states, q_values)

                    # Every 100th episode, refresh the target network weights and save the model
                    if i % 100 == 0:
                        network.update_target_weights()
                        network.save()

                state = next_state

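            # Decay epsilon exponentially from 0.25 to 0.01 over the first 5000 episodes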
            epsilon = np.exp(
                np.interp(env.episode + 1, [0, 5000],
                          [np.log(0.25), np.log(0.01)]))

    elif args.evolution:
        es = train(load_from='saved_model.pkl')
        np.save('best_params', es.best.get()[0])
        best_params = es.best.get()[0]
        play(best_params, render=True)
Example #4
def train():

    MODEL_NAME = "diamond_local15_maps"

    MODEL_PATH_SAVE = "./Models/Tensorflow/Maps/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = True

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    # RENDER_TO_SCREEN = False
    RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=3,
                      obstacle_count=1,
                      lava_count=1,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/Maps/")

    brain = Brain(epsilon=0.05, action_space=env.number_of_actions())

    model.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # TensorBoard capabilities
    # writer = tf.summary.FileWriter(LOGDIR)

    # Assume that you have 12GB of GPU memory and want to allocate ~4GB:
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        # writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Load a random pre-made map (0: lava, 1: obstacle)
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

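            # Linearly decay epsilon from 0.5 towards 0.05 as training progresses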
            brain.linear_epsilon_decay(total_episodes,
                                       episode,
                                       start=0.5,
                                       end=0.05,
                                       percentage=0.6)

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                # Retrieve the Q values from the NN in vector form
                # Q_vector = sess.run(model.q_values, feed_dict={model.input: state})

                action = brain.choose_action(state, sess, model)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                e = brain.train(model, sess)

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print("Ep:",
                      episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npz file
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                # s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector})
                # writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)