def play():

    print()
    print("PLAYING THE MINECRAFT SIMULATION")
    print()

    GRID_SIZE = 7
    LOCAL_GRID_SIZE = 9  # Has to be an odd number (I think...)
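    # (presumably odd so the agent sits in the centre cell of its local view)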
    SEED = 1
    WRAP = False
    FOOD_COUNT = 1
    OBSTACLE_COUNT = 0
    # MAP_PATH = "./Maps/Grid{}/map2.txt".format(GRID_SIZE)
    MAP_PATH = None

    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=100,
                      food_count=FOOD_COUNT,
                      obstacle_count=OBSTACLE_COUNT,
                      lava_count=0,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    env.play()
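
# NOTE: these functions assume module-level imports (numpy as np, tensorflow as tf,
# time, math) plus the project's own Environment class (e.g. from Environment_for_DQN.py),
# Agent, Brain, Network, MetaNetwork and Histogram, and helpers such as print_readable_time.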
Example #2
def play():
	print("\n ----- Playing the game -----\n")

	GRID_SIZE = 16
	LOCAL_GRID_SIZE = 15 # for printing out the state

	MAP_NUMBER = 4
	# MAP_NUMBER = np.random.randint(5)
	# MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
	# MAP_PATH = None
	MAP_PATH = "./Maps/Grid{}/impossible_map{}.txt".format(GRID_SIZE, MAP_NUMBER)

	env = Environment(wrap = False, 
					  grid_size = GRID_SIZE, 
					  local_size = LOCAL_GRID_SIZE,
					  rate = 100,
					  food_count = 0,
					  stick_count = 0,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 0,
					  history = 100,
					  action_space = 5,
					  map_path = MAP_PATH)

	env.play()
def play():
    print("\n ----- Playing the game -----\n")

    GRID_SIZE = 16

    MAP_PATH = "./Maps/Grid{}/map4.txt".format(GRID_SIZE)
    # MAP_PATH = None

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      rate=100,
                      food_count=13,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    env.play()
def train():
    print()
    print("RUNNING THE MINECRAFT SIMULATION")
    print()

    RENDER = False
    # RENDER = True
    LOAD_MODEL = False
    # LOAD_MODEL = True
    start_eps = 0.8

    WRAP = False
    GRID_SIZE = 7
    LOCAL_GRID_SIZE = 9  # Has to be an odd number (I think...)
    SEED = 1
    FOOD_COUNT = 1
    OBSTACLE_COUNT = 0
    # MAP_PATH = "./Maps/Grid{}/map2.txt".format(GRID_SIZE)
    MAP_PATH = None

    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=30,
                      food_count=FOOD_COUNT,
                      obstacle_count=OBSTACLE_COUNT,
                      lava_count=0,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    brain = Agent(gamma=0.99,
                  epsilon=start_eps,
                  alpha=0.01,
                  maxMemorySize=10000,
                  replace=10)
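    # (assumed from the argument names: gamma is the discount factor, epsilon the
    #  initial exploration rate, alpha the learning rate, maxMemorySize the replay
    #  buffer capacity and replace the target-network update interval)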

    if LOAD_MODEL:
        try:
            path = "./Models/Torch/my_model.pth"
            brain.load_model(path)
            print("Model loaded from path:", path)
            print()
            brain.EPSILON = 0.05
        except Exception:
            print('Could not load model')
            print('Press <ENTER> to continue with random initialisation')
            print()
            input()
            # quit()

    if RENDER: env.prerender()

    games_played = 0

    print("INITIALISING REPLAY MEMORY")

    while brain.memCntr < brain.memSize:
        observation, _ = env.reset()
        # print(observation)
        # observation = env.local_state_vector_3D()
        done = False

        if RENDER: env.render()  # Render first screen
        while not done:

            action = brain.chooseAction(observation)

            observation_, reward, done, info = env.step(action)
            # observation_ = env.local_state_vector_3D()
            # print(observation_)
            if done:
                # reward = -1
                games_played += 1
            brain.storeTransition(observation, action, reward, done,
                                  observation_)

            observation = observation_
            if RENDER: env.render()

    print("Done initialising replay memory. Played {} games".format(
        games_played))

    scores = []
    epsHistory = []
    numGames = 100000
    print_episode = 100
    batch_size = 16

    avg_score = 0
    avg_time = 0
    avg_loss = 0

    print()
    print("TRAINING MODEL")
    print()

    for i in range(numGames):
        epsHistory.append(brain.EPSILON)
        done = False
        observation, _ = env.reset()
        # observation = env.local_state_vector_3D()
        score = 0
        lastAction = 0

        if RENDER: env.render()  # Render first screen
        while not done:
            action = brain.chooseAction(observation)

            observation_, reward, done, info = env.step(action)

            # observation_ = env.local_state_vector_3D()
            # score += reward

            # print(observation_)

            brain.storeTransition(observation, action, reward, done,
                                  observation_)

            observation = observation_
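            # learn() presumably samples a minibatch of batch_size transitions from
            # the replay buffer and performs one DQN update, returning the loss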
            loss = brain.learn(batch_size)
            lastAction = action
            if RENDER: env.render()

        avg_score += info["score"]
        avg_time += info["time"]
        avg_loss += loss.item()

        if (i % print_episode == 0 and i != 0) or i == numGames - 1:
            print("Episode", i,
                  "\tavg time: {0:.3f}".format(avg_time / print_episode),
                  "\tavg score: {0:.3f}".format(avg_score / print_episode),
                  "\tavg loss: {0:.3f}".format(avg_loss / print_episode),
                  "\tepsilon: %.4f" % brain.EPSILON)
            brain.save_model("./Models/Torch/my_model{}.pth".format(i))
            avg_loss = 0
            avg_score = 0
            avg_time = 0

        # scores.append(score)
        # print("score:", score)

    brain.save_model("./Models/Torch/my_model.pth")
Example #5
def run_MetaNetwork():

	print("\n ---- Running the Meta Network ----- \n")

	MODEL_NAME = "meta15_input6_4_unfrozen"
	DIAMOND_MODEL_NAME = "diamond15_input6_best_unfrozen4_300k"
	ZOMBIE_MODEL_NAME = "zombie15_input6_best_unfrozen4_300k"
	EXPLORE_MODEL_NAME = "explore15_input6_best_unfrozen4_300k"

	MODEL_PATH_SAVE = "./Models/Tensorflow/Meta/"+MODEL_NAME+"/"+MODEL_NAME+".ckpt"

	LOGDIR = "./Logs/"+MODEL_NAME

	USE_SAVED_MODEL_FILE = False

	GRID_SIZE = 10
	LOCAL_GRID_SIZE = 15
	MAP_PATH = None

	RANDOMIZE_MAPS = True

	RENDER_TO_SCREEN = False
	RENDER_TO_SCREEN = True

	env = Environment(wrap = False, 
					  grid_size = GRID_SIZE,
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80, 
					  max_time = 100,
					  food_count = 10,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 2,
					  history = 40, 
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, path="./Models/Tensorflow/Best_Meta/", load=True,  trainable = False)

	diamond_net = Network(local_size=LOCAL_GRID_SIZE, name=DIAMOND_MODEL_NAME, path="./Models/Tensorflow/Best_Dojos/", load=True, trainable = False)

	zombie_net = Network(local_size=LOCAL_GRID_SIZE, name=ZOMBIE_MODEL_NAME, path="./Models/Tensorflow/Best_Dojos/", load=True, trainable = False)

	explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME, path="./Models/Tensorflow/Best_Dojos/", load=True, trainable = False)

	brain = Brain(epsilon=0.0, action_space=3)
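	# The meta network's three actions select which dojo (sub-policy) acts this
	# step: 0 uses diamond_net, 1 uses zombie_net, 2 uses explore_net (see below)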

	model.setup(brain)
	diamond_net.setup(brain)
	zombie_net.setup(brain)
	explore_net.setup(brain)

	avg_time = 0
	avg_score = 0
	avg_reward = 0
	cumulative_reward = 0

	# Number of episodes
	print_episode = 100
	total_episodes = 100

	saver = tf.train.Saver()

	# Initialising all variables (weights and biases)
	init = tf.global_variables_initializer()

	# GPU capabilities
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

	# Begin session
	with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

		if USE_SAVED_MODEL_FILE:
			saver.restore(sess, MODEL_PATH_SAVE)
			print("Model restored.")
		else:
			sess.run(init)

		start_time = time.time()

		print("")

		for episode in range(total_episodes):

			if RANDOMIZE_MAPS:
				# Make a random map 0: lava, 1: obstacle
				MAP_PATH = "./Maps/Grid10/map{}.txt".format(np.random.randint(10))
				env.set_map(MAP_PATH)

			state, info = env.reset()
			done = False

			if RENDER_TO_SCREEN:
				env.render()

			while not done:

				dojo = brain.choose_action(state, sess, model)
				# print(dojo)

				if dojo == 0:
					dojo_state = state
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the zombie layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, diamond_net, env.number_of_actions(), 0.0)
				elif dojo == 1:
					dojo_state = state
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, zombie_net, env.number_of_actions(), 0.0)
				elif dojo == 2:
					dojo_state = state
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the zombie layer
					action = brain.choose_dojo(dojo_state, sess, explore_net, env.number_of_actions(), 0.0)

				# print(action)

				# Update environment by performing action
				new_state, reward, done, info = env.step(action)
				# print(new_state)

				state = new_state

				cumulative_reward += reward

				if RENDER_TO_SCREEN:
					env.render()

				if done:
					avg_time += info["time"]
					avg_score += info["score"]
					avg_reward += cumulative_reward 
					cumulative_reward = 0

			if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1):
				
				current_time = math.floor(time.time()-start_time)
				print("Ep:", episode,
					"\tavg t: {0:.3f}".format(avg_time/print_episode),
					"\tavg score: {0:.3f}".format(avg_score/print_episode),
					"\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward
					"\tepsilon {0:.3f}".format(brain.EPSILON),
					end="")
				print_readable_time(current_time)

				avg_time = 0
				avg_score = 0
				avg_reward = 0
Example #6
def run():

	MODEL_NAME = "explore15_input6"

	FOLDER = "Best_Dojos"

	MODEL_PATH_SAVE = "./Models/Tensorflow/"+FOLDER+"/"+MODEL_NAME+"/"+MODEL_NAME+".ckpt"

	USE_SAVED_MODEL_FILE = False

	GRID_SIZE = 32
	LOCAL_GRID_SIZE = 15
	MAP_NUMBER = 0
	RANDOMIZE_MAPS = False

	# MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
	MAP_PATH = None
	MAP_PATH = "./Maps/Grid{}/impossible_map1.txt".format(GRID_SIZE, MAP_NUMBER)

	print("\n ---- Running the Deep Q Network ----- \n")

	RENDER_TO_SCREEN = False
	RENDER_TO_SCREEN = True

	env = Environment(wrap = False, 
					  grid_size = GRID_SIZE, 
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80, 
					  max_time = 60,
					  food_count = 0,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 0,
					  history = 40,
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = Network(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True, path="./Models/Tensorflow/"+FOLDER+"/", trainable = False)

	brain = Brain(epsilon=0.0, action_space = env.number_of_actions())

	model.setup(brain)

	avg_time = 0
	avg_score = 0
	avg_reward = 0
	cumulative_reward = 0

	# Number of episodes
	print_episode = 100
	total_episodes = 100

	saver = tf.train.Saver()

	# Initialising all variables (weights and biases)
	init = tf.global_variables_initializer()

	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

	# Begin session
	with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

		if USE_SAVED_MODEL_FILE:
			saver.restore(sess, MODEL_PATH_SAVE)
			print("Model restored.")
		else:
			sess.run(init)

		print("")

		for episode in range(total_episodes):
			
			if RANDOMIZE_MAPS:
				MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, np.random.randint(10))
				env.set_map(MAP_PATH)

			# state, info = env.reset()
			state, info = env.quick_reset()
			done = False

			if RENDER_TO_SCREEN:
				env.render()

			while not done:

				action = brain.choose_action(state, sess, model)
				# print(action)

				# Update environment by performing action
				new_state, reward, done, info = env.step(action)
				# print(new_state)

				state = new_state

				cumulative_reward += reward

				if RENDER_TO_SCREEN:
					env.render()

				if done:
					avg_time += info["time"]
					avg_score += info["score"]
					avg_reward += cumulative_reward 
					cumulative_reward = 0

			if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1):
				
				print("Ep:", episode,
					"\tavg t: {0:.3f}".format(avg_time/print_episode),
					"\tavg score: {0:.3f}".format(avg_score/print_episode),
					"\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward
					"\tepsilon {0:.3f}".format(brain.EPSILON),
					end="\n")

				avg_time = 0
				avg_score = 0
				avg_reward = 0
Example #7
def train_MetaNetwork():

	print("\n ---- Training the Meta Network ----- \n")

	MODEL_NAME = "meta_grid16_zero_2"
	MODEL_NAME_save = "meta_grid16_zero_2"

	DIAMOND_MODEL_NAME = "diamond_grid16_4"
	ZOMBIE_MODEL_NAME = "zombie_grid16_2"
	EXPLORE_MODEL_NAME = "explore_grid16_2"
	# EXTRA_MODEL_NAME = "extra15_input6_2"

	# MODEL_NAME = "meta15_input6_1M_unfrozen_dojos"
	# DIAMOND_MODEL_NAME = "diamond15_input4_best_unfrozen_at_1M"
	# ZOMBIE_MODEL_NAME = "zombie15_input4_best_unfrozen_at_1M"
	# EXPLORE_MODEL_NAME = "explore15_input4_best_unfrozen_at_1M"

	# MODEL_NAME = "meta15_input6_1M_random_unfrozen_cointoss"
	# DIAMOND_MODEL_NAME = "diamond15_input4_1M_random_unfrozen_cointoss"
	# ZOMBIE_MODEL_NAME = "zombie15_input4_1M_random_unfrozen_cointoss"
	# EXPLORE_MODEL_NAME = "explore15_input4_1M_random_unfrozen_cointoss"k

	FOLDER = "Impossible"
	DOJO_FOLDER = "Impossible"

	MODEL_PATH_SAVE = "./Models/Tensorflow/"+FOLDER+"/"+MODEL_NAME+"/"+MODEL_NAME+".ckpt"

	LOGDIR = "./Logs/"+FOLDER+"/"+MODEL_NAME_save+""

	USE_SAVED_MODEL_FILE = False

	GRID_SIZE = 16
	LOCAL_GRID_SIZE = 15
	MAP_PATH = None

	RANDOMIZE_MAPS = True

	RENDER_TO_SCREEN = False
	# RENDER_TO_SCREEN = True

	env = Environment(wrap = False,
					  grid_size = GRID_SIZE,
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80,
					  max_time = 120,
					  food_count = 0,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 0,
					  history = 100,
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, path="./Models/Tensorflow/"+FOLDER+"/", load=False,  trainable=True)
 
	diamond_net = Network(local_size=LOCAL_GRID_SIZE, name=DIAMOND_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=True, trainable=False)

	zombie_net = Network(local_size=LOCAL_GRID_SIZE, name=ZOMBIE_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=True, trainable=False)

	explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=True, trainable=False)

	# extra_net = Network(local_size=LOCAL_GRID_SIZE, name=EXTRA_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=False, trainable=True)

	brain = Brain(epsilon=0.05, action_space=3)

	model.setup(brain)
	diamond_net.setup(brain)
	zombie_net.setup(brain)
	explore_net.setup(brain)
	# extra_net.setup(brain)

	score = tf.placeholder(tf.float32, [])
	avg_t = tf.placeholder(tf.float32, [])
	epsilon = tf.placeholder(tf.float32, [])
	avg_r = tf.placeholder(tf.float32, [])

	tf.summary.scalar('error', tf.squeeze(model.error))
	tf.summary.scalar('score', score)
	tf.summary.scalar('average time', avg_t)
	tf.summary.scalar('epsilon', epsilon)
	tf.summary.scalar('avg reward', avg_r)

	avg_time = 0
	avg_score = 0
	avg_error = 0
	avg_reward = 0
	cumulative_reward = 0

	# Number of episodes
	print_episode = 1000
	total_episodes = 100000

	saver = tf.train.Saver()

	# Initialising all variables (weights and biases)
	init = tf.global_variables_initializer()

	# Adds a summary graph of the error over time
	merged_summary = tf.summary.merge_all()

	# Tensorboard capabilties
	writer = tf.summary.FileWriter(LOGDIR)

	# Histogram
	histogram = Histogram(3, 10, total_episodes)

	# GPU capabilities
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

	# Begin session
	with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

		if USE_SAVED_MODEL_FILE:
			saver.restore(sess, MODEL_PATH_SAVE)
			print("Model restored.")
		else:
			sess.run(init)

		writer.add_graph(sess.graph)

		start_time = time.time()

		print("")

		for episode in range(total_episodes):

			if RANDOMIZE_MAPS:
				# Make a random map 0: lava, 1: obstacle
				MAP_PATH = "./Maps/Grid{}/impossible_map{}.txt".format(GRID_SIZE, np.random.randint(5))
				env.set_map(MAP_PATH)

			# state, info = env.reset()
			state, info = env.quick_reset()
			done = False

			# brain.linear_epsilon_decay(total_episodes, episode, start=1.0, end=0.05, percentage=0.5)

			# brain.linear_alpha_decay(total_episodes, episode)

			if RENDER_TO_SCREEN:
				env.render()

			while not done:

				# Retrieve the Q values from the NN in vector form
				Dojo_vector = sess.run(model.q_values, feed_dict={model.input: state})

				dojo = brain.choose_action(state, sess, model)
				
				histogram.check_section(episode)
				histogram.add(dojo)

				# dojo = np.random.randint(3)
				# dojo = 0

				# print(dojo)

				if dojo == 0:
					dojo_state = state
					# dojo_state[2]=0
					# dojo_state[3]=0
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the zombie layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, diamond_net, env.number_of_actions(), 0.05)

				elif dojo == 1:
					dojo_state = state
					# dojo_state[1]=0
					# dojo_state[3]=0
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, zombie_net, env.number_of_actions(), 0.05)

				elif dojo == 2:
					dojo_state = state
					# dojo_state[1]=0
					# dojo_state[2]=0
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the zombie layer
					action = brain.choose_dojo(dojo_state, sess, explore_net, env.number_of_actions(), 0.05)

				# elif dojo == 3:
				# 	dojo_state = state
				# 	action = brain.choose_dojo(dojo_state, sess, extra_net, env.number_of_actions(), 0.05)

				# print(action)

				# Update environment by performing action
				new_state, reward, done, info = env.step(action)

				# print(new_state)

				brain.store_transition_dojo(state, action, reward, done, new_state, dojo)

				# print(tf.trainable_variables(scope=None))

				# if dojo == 0:
				# 	e, Q_vector = brain.train_3_dojos(diamond_net, sess, dojo)

				# elif dojo == 1:
				# 	e, Q_vector = brain.train_3_dojos(zombie_net, sess, dojo)

				# elif dojo == 2:
				# 	e, Q_vector = brain.train_3_dojos(explore_net, sess, dojo)

				# e, Q_vector = brain.train_3(sess, diamond_net, zombie_net, explore_net)

				# e, Q_vector = brain.train(extra_net, sess)

				if done:
					Dojo_vector[:,dojo] = reward
					# print("Reward:", reward)
				else:
					# Gathering the now current state's action-value vector
					y_prime = sess.run(model.q_values, feed_dict={model.input: new_state})

					# Equation for training
					maxq = sess.run(model.y_prime_max, feed_dict={model.actions: y_prime})

					# RL Equation
					Dojo_vector[:,dojo] = reward + (brain.GAMMA * maxq)
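					# i.e. the one-step Q-learning target: Q(s, dojo) <- reward + GAMMA * max_a' Q(s', a')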

				_, e = sess.run([model.optimizer, model.error], feed_dict={model.input: state, model.actions: Dojo_vector})

				state = new_state

				cumulative_reward += reward

				if RENDER_TO_SCREEN:
					env.render()

				if done:
					avg_time += info["time"]
					avg_score += info["score"]
					avg_error += e
					avg_reward += cumulative_reward 
					cumulative_reward = 0

			if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1):
				
				current_time = math.floor(time.time()-start_time)
				print("Ep:", episode,
					"\tavg t: {0:.3f}".format(avg_time/print_episode),
					"\tavg score: {0:.3f}".format(avg_score/print_episode),
					"\tErr {0:.3f}".format(avg_error/print_episode),
					"\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward
					"\tepsilon {0:.3f}".format(brain.EPSILON),
					end="")
				print_readable_time(current_time)

				# Save the model's weights and biases to .npz file
				model.save(sess, name=MODEL_NAME_save)
				# diamond_net.save(sess, name=DIAMOND_MODEL_NAME+"")
				# zombie_net.save(sess, name=ZOMBIE_MODEL_NAME+"")
				# explore_net.save(sess, name=EXPLORE_MODEL_NAME+"")
				# extra_net.save(sess, name=EXTRA_MODEL_NAME+"")

				# save_path = saver.save(sess, MODEL_PATH_SAVE)

				s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Dojo_vector, score:avg_score/print_episode, avg_t:avg_time/print_episode, epsilon:brain.EPSILON, avg_r:avg_reward/print_episode})
				writer.add_summary(s, episode)

				avg_time = 0
				avg_score = 0
				avg_error = 0
				avg_reward = 0

		model.save(sess, verbose=True, name=MODEL_NAME_save)
		# diamond_net.save(sess, verbose=True, name=DIAMOND_MODEL_NAME+"")
		# zombie_net.save(sess, verbose=True, name=ZOMBIE_MODEL_NAME+"")
		# explore_net.save(sess, verbose=True, name=EXPLORE_MODEL_NAME+"")
		# extra_net.save(sess, verbose=True, name=EXTRA_MODEL_NAME+"")

		# save_path = saver.save(sess, MODEL_PATH_SAVE)
		# print("Model saved in path: %s" % save_path)

		writer.close()
		histogram.plot()
Example #8
	# MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
	MAP_PATH = None
	# MAP_PATH = "./Maps/Grid{}/impossible_map0.txt".format(GRID_SIZE, MAP_NUMBER)

	print("\n ---- Training the Deep Neural Network ----- \n")

	RENDER_TO_SCREEN = False
	# RENDER_TO_SCREEN = True 

	env = Environment(wrap = False,
					  grid_size = GRID_SIZE,
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80,
					  max_time = 120,
					  food_count = 20,
					  stick_count = 0,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 0,
					  history = 0,
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = Network(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True, path="./Models/Tensorflow/"+FOLDER+"/")

	brain = Brain(epsilon=0.1, action_space = env.number_of_actions())

	model.setup(brain)
Example #9
def train():

    MODEL_NAME = "diamond9_input5"
    MODEL_NAME_save = "diamond9_input5"

    FOLDER = "Best_Dojos9"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME_save + "/" + MODEL_NAME_save + ".ckpt"

    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save + "_2"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 9
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = False

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=10,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/" + FOLDER + "/")

    brain = Brain(epsilon=0.1, action_space=env.number_of_actions())

    model.setup(brain)

    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    cumulative_reward = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 10000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # Limit this TensorFlow process to a fraction of the available GPU memory
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        # for episode in range(50):
        # 	state, info = env.reset()
        # 	done = False

        # 	if RENDER_TO_SCREEN:
        # 		env.render()

        # 	while not done:
        # 		action = brain.choose_action(state, sess, model)

        # 		new_state, reward, done, info = env.step(action)

        # 		brain.store_transition(state, action, reward, done, new_state)

        # 		state = new_state

        # 		if RENDER_TO_SCREEN:
        # 			env.render()

        # print("\nREPLAY MEMORY INITIALISED")
        # print(brain.memCntr)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            # brain.linear_epsilon_decay(total_episodes, episode, start=0.4, end=0.05, percentage=0.8)

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                action = brain.choose_action(state, sess, model)
                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                # e, Q_vector = brain.train_batch(4, model, sess)

                e, Q_vector = brain.train(model, sess)

                state = new_state

                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e
                    avg_reward += cumulative_reward
                    cumulative_reward = 0

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print(
                    "Ep:",
                    episode,
                    "\tavg t: {0:.3f}".format(avg_time / print_episode),
                    "\tavg score: {0:.3f}".format(avg_score / print_episode),
                    "\tErr {0:.3f}".format(avg_error / print_episode),
                    "\tavg_reward {0:.3f}".format(
                        avg_reward / print_episode),  # avg cumulative reward
                    "\tepsilon {0:.3f}".format(brain.EPSILON),
                    end="")
                print_readable_time(current_time)

                # Save the model's weights and biases to .npz file
                model.save(sess, name=MODEL_NAME_save)
                # save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={
                                 model.input: state,
                                 model.actions: Q_vector,
                                 score: avg_score / print_episode,
                                 avg_t: avg_time / print_episode,
                                 epsilon: brain.EPSILON,
                                 avg_r: avg_reward / print_episode
                             })
                writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0

        model.save(sess, verbose=True, name=MODEL_NAME_save)

        # save_path = saver.save(sess, MODEL_PATH_SAVE)
        # print("Model saved in path: %s" % save_path)

        writer.close()
Example #10
def train():

    MODEL_NAME = "diamond_local15_maps"

    MODEL_PATH_SAVE = "./Models/Tensorflow/Maps/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = True

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False
    RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=3,
                      obstacle_count=1,
                      lava_count=1,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/Maps/")

    brain = Brain(epsilon=0.05, action_space=env.number_of_actions())

    model.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    # writer = tf.summary.FileWriter(LOGDIR)

    # Assume that you have 12GB of GPU memory and want to allocate ~4GB:
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")

        sess.run(init)

        # writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Make a random map 0: lava, 1: obstacle
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes,
                                       episode,
                                       start=0.5,
                                       end=0.05,
                                       percentage=0.6)
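            # (assumed: anneals brain.EPSILON linearly from start=0.5 to end=0.05 over
            #  the first 60% of total_episodes, mirroring the manual decay further down)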

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                # Retrieve the Q values from the NN in vector form
                # Q_vector = sess.run(model.q_values, feed_dict={model.input: state})

                action = brain.choose_action(state, sess, model)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                e = brain.train(model, sess)

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print("Ep:",
                      episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npz file
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                # s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector})
                # writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)
Example #11
def train_MetaNetwork():

    print("\n ---- Training the Meta Network ----- \n")

    MODEL_NAME = "meta_network_local15"
    DIAMOND_MODEL_NAME = "diamond_dojo_local15"
    ZOMBIE_MODEL_NAME = "zombie_dojo_local15"
    # EXPLORE_MODEL_NAME = "explore_dojo_local15"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=200,
                      food_count=3,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=1,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True)

    diamond_net = Network(local_size=LOCAL_GRID_SIZE,
                          name=DIAMOND_MODEL_NAME,
                          load=True,
                          trainable=False)

    zombie_net = Network(local_size=LOCAL_GRID_SIZE,
                         name=ZOMBIE_MODEL_NAME,
                         load=True,
                         trainable=False)

    # explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME, load=True, trainable = False)

    brain = Brain(epsilon=0.01, action_space=2)
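    # With action_space=2 only the diamond (0) and zombie (1) dojos can be chosen;
    # the dojo == 2 branch below would raise a NameError (explore_net is commented
    # out), but it is unreachable here.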

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)
    # explore_net.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes,
                                       episode,
                                       start=0.3,
                                       end=0.02,
                                       percentage=0.5)

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                # Retrieve the Q values from the NN in vector form
                Dojo_vector = sess.run(model.q_values,
                                       feed_dict={model.input: state})

                dojo = brain.choose_action(state, sess, model)

                # print(dojo)

                if dojo == 0:
                    # state[2] = 0 # Zero out the zombies layer
                    state = np.delete(state, 2, 0)  # Take out the zombie layer
                    state = np.delete(state, 5,
                                      0)  # Take out the history layer
                    action = brain.choose_dojo(state, sess, diamond_net,
                                               env.number_of_actions(), 0.01)
                elif dojo == 1:
                    # state[1] = 0 # Zero out the diamond layer
                    state = np.delete(state, 1,
                                      0)  # Take out the diamond layer
                    state = np.delete(state, 5,
                                      0)  # Take out the history layer
                    action = brain.choose_dojo(state, sess, zombie_net,
                                               env.number_of_actions(), 0.01)
                elif dojo == 2:
                    state = np.delete(state, 1,
                                      0)  # Take out the diamond layer
                    state = np.delete(state, 2, 0)  # Take out the zombie layer
                    action = brain.choose_dojo(state, sess, explore_net,
                                               env.number_of_actions(), 0.01)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                # print(new_state)

                brain.store_transition(state, dojo, reward, done, new_state)

                ## Standard training with learning after every step

                # print(tf.trainable_variables(scope=None))

                if done:
                    Dojo_vector[:, dojo] = reward
                    # print("Reward:", reward)
                else:
                    # Gathering the now current state's action-value vector
                    y_prime = sess.run(model.q_values,
                                       feed_dict={model.input: new_state})

                    # Equation for training
                    maxq = sess.run(model.y_prime_max,
                                    feed_dict={model.actions: y_prime})

                    # RL Equation
                    Dojo_vector[:, dojo] = reward + (brain.GAMMA * maxq)

                _, e = sess.run([model.optimizer, model.error],
                                feed_dict={
                                    model.input: state,
                                    model.actions: Dojo_vector
                                })

                ## Training using replay memory

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print("Ep:",
                      episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npz file
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={
                                 model.input: state,
                                 model.actions: Dojo_vector
                             })
                writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)

        writer.close()
def runDeepModel():

    # Testing
    print("\n ---- Running the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - Load model from modelpath_load; False - Initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=200,
                      food_count=FOOD_COUNT,
                      obstacle_count=OBSTACLE_COUNT,
                      zombie_count=0,
                      action_space=5,
                      map_path=None)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01  # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99  # Discount factor, i.e. to which extent the algorithm considers possible future rewards

    epsilon = 0.001  # Probability to choose random action instead of best action

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=True)

    # Error / Loss function
    # Not sure why it's reduce_mean; it reduces the [1,4] tensor to a scalar of the mean value
    with tf.name_scope('Error'):
        # e1 = tf.subtract(y, Q_values)
        # e2 = tf.square(e1)
        # error = tf.reduce_mean(e2, axis=1)

        # test
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

        # error = tf.reduce_max(tf.sqrt(tf.square(tf.subtract(Q_values, y))), axis=1)
        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    # Gradient descent optimizer - minimizes error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1,4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (Made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 1
    total_episodes = 10

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Session can start running
    with tf.Session() as sess:

        # Restore the model, to keep training
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        sess.run(init)

        # Testing my DQN model with random values
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One Hot representation of the current state
                state_vector = env.local_state_vector_3D()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Qvector",Q_vector) # DEBUGGING

                # Deciding on which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the max value of the Q values (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(reward)

                state = new_state

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if episode % print_episode == 0 and episode != 0:
                print("Ep:", episode, "\tavg t:", avg_time / print_episode,
                      "\tavg score:", avg_score / print_episode)
                avg_time = 0
                avg_score = 0
def trainDeepModel(load=False):

    # Used to see how long model takes to train - model needs to be optimized!
    start_time = time.time()

    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    # True - Load model from modelpath_load; False - Initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=5,
                      rate=80,
                      max_time=30,
                      food_count=3,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      action_space=5,
                      map_path=None)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.001  # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99  # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.01  # Probability to choose random action instead of best action

    epsilon_function = True
    epsilon_start = 0.5
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # in decimal

    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # in decimal

    # Trajectory
    tau = []

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=load)

    # Error / Loss function
    # reduce_max -> it reduces the [1,4] tensor to a scalar of the max value
    with tf.name_scope('Error'):

        # test
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1,4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (Made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # error plot
    # errors = []

    print_episode = 1000
    total_episodes = 100000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # Session can start running
    with tf.Session() as sess:

        # Restore the model, to keep training
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        # Initialize global variables
        sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        print("\nProgram took {0:.4f} seconds to initialise\n".format(
            time.time() - start_time))
        start_time = time.time()

        # Training
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            # Linear function for alpha
            if alpha_function:
                alpha = (-alpha_start /
                         (alpha_percentage * total_episodes)) * episode + (
                             alpha_start + alpha_end)
                if alpha < alpha_end:
                    alpha = alpha_end

            # Linear function for epsilon
            if epsilon_function:
                epsilon = (-(epsilon_start - epsilon_end) /
                           (epsilon_percentage * total_episodes)) * episode + (
                               epsilon_start)
                if epsilon < epsilon_end:
                    epsilon = epsilon_end
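            # i.e. epsilon falls linearly from epsilon_start to epsilon_end over the
            # first epsilon_percentage of total_episodes, then stays at epsilon_end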

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                # if RENDER_TO_SCREEN:
                # 	env.render()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state})
                # print("Qvector", Q_vector) # DEBUGGING

                # Deciding on which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the max value of the Q values (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                #'''
                ## Standard training with learning after every step

                # Q_vector = sess.run(Q_values, feed_dict={x: state})
                # if final state of the episode
                # print("Q_vector:", Q_vector)
                if done:
                    Q_vector[:, action] = reward
                    # print("Reward:", reward)
                else:
                    # Gathering the now current state's action-value vector
                    y_prime = sess.run(Q_values, feed_dict={x: new_state})

                    # Equation for training
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                    # RL Equation
                    Q_vector[:, action] = reward + (gamma * maxq)
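                # Only the chosen action's entry differs from the network's own output,
                # so the mean squared error below effectively trains just that Q-value
                # towards reward + gamma * max_a' Q(s', a')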

                _, e = sess.run([optimizer, error],
                                feed_dict={
                                    x: state,
                                    y: Q_vector
                                })

                #'''
                '''
				## Training using replay memory

				# Update trajectory (Update replay memory)
				if len(tau) < REPLAY_MEMORY:
					tau.append(Trajectory(state, action, reward, new_state, done))
				else:
					# print("tau is now full")
					tau.pop(0)
					tau.append(Trajectory(state, action, reward, new_state, done))
				
				# Choose a random step from the replay memory
				random_tau = np.random.randint(0, len(tau))

				# Get the Q vector of the training step
				Q_vector = sess.run(Q_values, feed_dict={x: tau[random_tau].state})
				
				# If terminating state of episode
				if tau[random_tau].done:
					# Set the chosen action's current value to the reward value
					Q_vector[:,tau[random_tau].action] = tau[random_tau].reward
				else:
					# Gets the Q vector of the new state
					y_prime = sess.run(Q_values, feed_dict={x: tau[random_tau].new_state})

					# Getting the best action value
					maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

					# RL DQN Training Equation
					Q_vector[:,tau[random_tau].action] = tau[random_tau].reward + (gamma * maxq)

				_, e = sess.run([optimizer, error], feed_dict={x: tau[random_tau].state, y: Q_vector})
				
				'''

                # Add to the error list, to show the plot at the end of training - RAM OVERLOAD!!!
                # errors.append(e)

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print(
                    "Ep:",
                    episode,
                    "\tavg t: {0:.3f}".format(avg_time / print_episode),
                    "\tavg score: {0:.3f}".format(avg_score / print_episode),
                    "\tErr {0:.3f}".format(avg_error / print_episode),
                    "\tepsilon {0:.3f}".format(epsilon),
                    #"\ttime {0:.0f}:{1:.0f}".format(current_time/60, current_time%60),
                    end="")
                print("\ttime {0:.0f}:{1:02.0f}:{2:02.0f}".format(
                    math.floor((current_time / 60) / 60),
                    math.floor((current_time / 60) % 60),
                    current_time % 60))
                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npy files (can't save 4D array to text file)
                W_conv1 = np.array(sess.run(weights['W_conv1']))
                W_conv2 = np.array(sess.run(weights['W_conv2']))
                W_fc = np.array(sess.run(weights['W_fc']))
                W_out = np.array(sess.run(weights['W_out']))

                b_conv1 = np.array(sess.run(biases['b_conv1']))
                b_conv2 = np.array(sess.run(biases['b_conv2']))
                b_fc = np.array(sess.run(biases['b_fc']))
                b_out = np.array(sess.run(biases['b_out']))

                np.save(W_conv1_textfile_path_save, W_conv1.astype(np.float32))
                np.save(W_conv2_textfile_path_save, W_conv2.astype(np.float32))
                np.save(W_fc_textfile_path_save, W_fc.astype(np.float32))
                np.save(W_out_textfile_path_save, W_out.astype(np.float32))

                np.save(b_conv1_textfile_path_save, b_conv1.astype(np.float32))
                np.save(b_conv2_textfile_path_save, b_conv2.astype(np.float32))
                np.save(b_fc_textfile_path_save, b_fc.astype(np.float32))
                np.save(b_out_textfile_path_save, b_out.astype(np.float32))

                s = sess.run(merged_summary, feed_dict={x: state, y: Q_vector})
                writer.add_summary(s, episode)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)