cumulative_reward = 0 if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1): current_time = math.floor(time.time()-start_time) print("Ep:", episode, "\tavg t: {0:.3f}".format(avg_time/print_episode), "\tavg score: {0:.3f}".format(avg_score/print_episode), "\terr {0:.3f}".format(avg_error/print_episode), "\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward "\tepsilon {0:.3f}".format(brain.EPSILON), end="") print_readable_time(current_time) # Save the model's weights and biases to .npz file model.save(sess, name=MODEL_NAME_save) # save_path = saver.save(sess, MODEL_PATH_SAVE) s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector, score:avg_score/print_episode, avg_t:avg_time/print_episode, epsilon:brain.EPSILON, avg_r:avg_reward/print_episode}) writer.add_summary(s, episode) avg_time = 0 avg_score = 0 avg_error = 0 avg_reward = 0 model.save(sess, verbose=True, name=MODEL_NAME_save) # save_path = saver.save(sess, MODEL_PATH_SAVE) # print("Model saved in path: %s" % save_path)
def train():
    """Train the Q-network on the grid environment and log progress.

    Builds an ``Environment``, a ``Network`` and a ``Brain`` from the
    constants at the top of the function, then plays ``total_episodes``
    episodes. After every environment step the transition is stored and
    ``brain.train`` is called. Every ``print_episode`` episodes (and on the
    last episode) the running averages are printed, the model is saved with
    ``model.save`` and a merged TensorBoard summary is written to ``LOGDIR``.

    Takes no arguments and returns nothing.
    """
    MODEL_NAME = "diamond9_input5"
    MODEL_NAME_save = "diamond9_input5"

    FOLDER = "Best_Dojos9"

    MODEL_PATH_SAVE = ("./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME_save
                       + "/" + MODEL_NAME_save + ".ckpt")

    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save + "_2"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 9
    RANDOMIZE_MAPS = False

    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=10,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/" + FOLDER + "/")

    brain = Brain(epsilon=0.1, action_space=env.number_of_actions())

    model.setup(brain)

    # Placeholders used only to feed Python-side statistics into TensorBoard.
    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    # Number of episodes accumulated in the sums above. The first window
    # holds print_episode + 1 episodes and the last one print_episode - 1,
    # so dividing by print_episode would skew the reported averages.
    episodes_in_window = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 10000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Merges every summary registered above into a single op
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Let this process use at most 10% of the GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    try:
        # Begin session
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

            if USE_SAVED_MODEL_FILE:
                saver.restore(sess, MODEL_PATH_SAVE)
                print("Model restored.")
            else:
                sess.run(init)

            writer.add_graph(sess.graph)

            start_time = time.time()

            print("")

            for episode in range(total_episodes):

                if RANDOMIZE_MAPS:
                    # NOTE(review): the folder is hard-coded to Grid10 while
                    # GRID_SIZE is 8 -- confirm before enabling RANDOMIZE_MAPS.
                    MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                        np.random.randint(10))
                    env.set_map(MAP_PATH)

                state, info = env.reset()
                done = False
                cumulative_reward = 0

                if RENDER_TO_SCREEN:
                    env.render()

                while not done:
                    action = brain.choose_action(state, sess, model)

                    # Update environment by performing action
                    new_state, reward, done, info = env.step(action)

                    brain.store_transition(state, action, reward, done,
                                           new_state)

                    e, Q_vector = brain.train(model, sess)

                    state = new_state
                    cumulative_reward += reward

                    if RENDER_TO_SCREEN:
                        env.render()

                # The loop only exits once the episode is done, so `info` and
                # `e` hold the values from the final step of the episode.
                avg_time += info["time"]
                avg_score += info["score"]
                avg_error += e
                avg_reward += cumulative_reward
                episodes_in_window += 1

                if (episode % print_episode == 0 and episode != 0) or \
                        (episode == total_episodes - 1):

                    current_time = math.floor(time.time() - start_time)
                    print("Ep:", episode,
                          "\tavg t: {0:.3f}".format(
                              avg_time / episodes_in_window),
                          "\tavg score: {0:.3f}".format(
                              avg_score / episodes_in_window),
                          "\tErr {0:.3f}".format(
                              avg_error / episodes_in_window),
                          # avg cumulative reward
                          "\tavg_reward {0:.3f}".format(
                              avg_reward / episodes_in_window),
                          "\tepsilon {0:.3f}".format(brain.EPSILON),
                          end="")
                    print_readable_time(current_time)

                    # Save the model's weights and biases
                    model.save(sess, name=MODEL_NAME_save)

                    s = sess.run(merged_summary,
                                 feed_dict={
                                     model.input: state,
                                     model.actions: Q_vector,
                                     score: avg_score / episodes_in_window,
                                     avg_t: avg_time / episodes_in_window,
                                     epsilon: brain.EPSILON,
                                     avg_r: avg_reward / episodes_in_window
                                 })
                    writer.add_summary(s, episode)

                    # Start a fresh reporting window.
                    avg_time = 0
                    avg_score = 0
                    avg_error = 0
                    avg_reward = 0
                    episodes_in_window = 0

            model.save(sess, verbose=True, name=MODEL_NAME_save)
    finally:
        # Close the event file even if training is interrupted or fails.
        writer.close()
def main(env, args):
    """Run one of three modes selected by ``args``.

    * ``args.recodex``: load the VAE and ``best_params.npy`` and evaluate
      forever, choosing each action with ``decide_action``.
    * ``args.DQN``: train ``Network`` with epsilon-greedy exploration and a
      replay buffer, using VAE latent representations as states.
    * ``args.evolution``: call ``train(load_from='saved_model.pkl')``, store
      the best parameters in ``best_params`` and replay them with rendering.

    The modes are checked in that order and only the first match runs.
    """
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        model = load_vae()
        best_params = np.load('best_params.npy', allow_pickle=True)

        # Evaluation of a trained model; this loop has no exit of its own.
        while True:
            state = env.reset(start_evaluation=True)
            done = False
            while not done:
                chosen = decide_action(model, state, best_params)
                state, _, done, _ = env.step(chosen)

    if args.DQN:
        network = Network(env, args)
        if os.path.exists('dqn.model'):
            network.model = tf.keras.models.load_model('dqn.model')
        vae = load_vae()

        batch_size = 32
        replay_buffer = collections.deque(maxlen=100000)
        Transition = collections.namedtuple(
            "Transition",
            ["state", "action", "reward", "done", "next_state"])

        epsilon = 0.25
        gamma = 1

        for i in tqdm(range(10000)):
            state = env.reset()
            done = False
            periodic = i % 100 == 0

            while not done:
                embedding = vae.get_latent_representation(np.array([state]))
                greedy_q = network.predict(embedding)[0]

                # Epsilon-greedy: exploit unless the draw falls below epsilon.
                if np.random.uniform() < epsilon:
                    action = np.random.randint(0, env.action_space.n)
                else:
                    action = np.argmax(greedy_q)

                next_state, reward, done, _ = env.step(action)

                next_embedding = vae.get_latent_representation(
                    np.array([next_state]))
                replay_buffer.append(
                    Transition(embedding, action, reward, done,
                               next_embedding))

                if len(replay_buffer) > batch_size:
                    batch = random.sample(replay_buffer, batch_size)

                    states = np.array([item.state[0] for item in batch])
                    batch_actions = np.array([item.action for item in batch])
                    batch_rewards = np.array([item.reward for item in batch])
                    batch_dones = np.array([item.done for item in batch])
                    next_states = np.array(
                        [item.next_state[0] for item in batch])

                    targets = np.array(network.predict(states))
                    next_q = network.predict(next_states)

                    # Overwrite only the taken action's value with its target.
                    for row, act, rew, next_row, terminal in zip(
                            targets, batch_actions, batch_rewards, next_q,
                            batch_dones):
                        if terminal:
                            bootstrap = 0
                        else:
                            bootstrap = gamma * np.max(next_row)
                        row[act] = rew + bootstrap

                    network.train(states, targets)

                    if periodic:
                        network.update_target_weights()

                if periodic:
                    network.save()

                state = next_state

            # Interpolate log(epsilon) linearly from 0.25 to 0.01 over the
            # first 5000 episodes.
            log_epsilon = np.interp(env.episode + 1, [0, 5000],
                                    [np.log(0.25), np.log(0.01)])
            epsilon = np.exp(log_epsilon)
        return

    if args.evolution:
        es = train(load_from='saved_model.pkl')
        np.save('best_params', es.best.get()[0])
        best_params = es.best.get()[0]
        play(best_params, render=True)
def train():
    """Train the Q-network on randomly chosen maps and checkpoint it.

    Builds an ``Environment``, a ``Network`` and a ``Brain`` from the
    constants at the top of the function, then plays ``total_episodes``
    episodes. Epsilon is updated through ``brain.linear_epsilon_decay`` at
    the start of each episode, and ``brain.train`` is called after every
    environment step. Every ``print_episode`` episodes (and on the last
    episode) the running averages are printed and the model is saved both
    with ``model.save`` and as a TensorFlow checkpoint at ``MODEL_PATH_SAVE``.

    Takes no arguments and returns nothing.
    """
    MODEL_NAME = "diamond_local15_maps"

    MODEL_PATH_SAVE = ("./Models/Tensorflow/Maps/" + MODEL_NAME + "/"
                       + MODEL_NAME + ".ckpt")

    # TensorBoard writing is currently disabled in this function; LOGDIR and
    # merged_summary are kept so it can be switched back on easily.
    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    RANDOMIZE_MAPS = True

    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=3,
                      obstacle_count=1,
                      lava_count=1,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/Maps/")

    brain = Brain(epsilon=0.05, action_space=env.number_of_actions())

    model.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_error = 0
    # Number of episodes accumulated in the sums above. The first window
    # holds print_episode + 1 episodes and the last one print_episode - 1,
    # so dividing by print_episode would skew the reported averages.
    episodes_in_window = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Merges every registered summary into a single op (unused while the
    # TensorBoard writer is disabled).
    merged_summary = tf.summary.merge_all()

    # Let this process use at most 30% of the GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            # Only initialise when nothing was restored: running the
            # initialiser after a restore would overwrite the loaded weights.
            sess.run(init)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Pick one of the map files map0.txt .. map9.txt at random.
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes,
                                       episode,
                                       start=0.5,
                                       end=0.05,
                                       percentage=0.6)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                action = brain.choose_action(state, sess, model)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                brain.store_transition(state, action, reward, done, new_state)

                e = brain.train(model, sess)

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

            # The loop only exits once the episode is done, so `info` and `e`
            # hold the values from the final step of the episode.
            avg_time += info["time"]
            avg_score += info["score"]
            avg_error += e
            episodes_in_window += 1

            if (episode % print_episode == 0 and episode != 0) or \
                    (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(
                          avg_time / episodes_in_window),
                      "\tavg score: {0:.3f}".format(
                          avg_score / episodes_in_window),
                      "\tErr {0:.3f}".format(
                          avg_error / episodes_in_window),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                # Start a fresh reporting window.
                avg_time = 0
                avg_score = 0
                avg_error = 0
                episodes_in_window = 0

                # Save the model's weights and biases
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)