def eval(model_type=model_type, model_path=model_path):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    env = LunarLander()

    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0

        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1

            action = model(
                torch.tensor(state, dtype=torch.float32,
                             device=device).unsqueeze(0)).argmax()

            state, reward, done = env.step(action)

            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
        env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")

        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
Example #3
class DQNAgent:
    def __init__(
            self,
            env,
            memory_size,
            batch_size,
            target_update=100,
            gamma=0.99,
            # replay parameters
            alpha=0.2,
            beta=0.6,
            prior_eps=1e-6,
            # Categorical DQN parameters
            v_min=0,
            v_max=200,
            atom_size=51,
            # N-step Learning
            n_step=3,
            start_train=32,
            save_weights=True,
            log=True,
            lr=0.001,
            seed=0,
            episodes=200):

        self.env = env

        obs_dim = self.env.observation_dim
        action_dim = self.env.action_dim

        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        self.lr = lr
        self.memory_size = memory_size
        self.seed = seed

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(obs_dim,
                                              memory_size,
                                              batch_size,
                                              alpha=alpha)

        # memory for N-step Learning
        self.use_n_step = n_step > 1
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(obs_dim,
                                         memory_size,
                                         batch_size,
                                         n_step=n_step,
                                         gamma=gamma)

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atom_size).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim, self.atom_size,
                           self.support).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim, self.atom_size,
                                  self.support).to(self.device)

        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

        # transition to store in memory
        self.transition = list()

        self.fig, (self.ax1, self.ax2) = plt.subplots(2, figsize=(10, 10))

        self.start_train = start_train

        self.save_weights = save_weights

        self.time = datetime.datetime.now().timetuple()
        self.path = f"weights/{self.time[2]}-{self.time[1]}-{self.time[0]}_{self.time[3]}-{self.time[4]}"

        self.log = log
        self.episode_cnt = 0
        self.episodes = episodes

        if self.save_weights is True:
            self.create_save_directory()

        plt.ion()

    def create_save_directory(self):
        try:
            os.mkdir(self.path)
        except OSError:
            print("Creation of the directory %s failed" % self.path)
        else:
            print("Successfully created the directory %s " % self.path)

    def select_action(self, state):
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
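        # Exploration comes from the learned parameter noise of the noisy
        # linear layers, so no epsilon schedule is needed here.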
        selected_action = self.dqn(torch.FloatTensor(state).to(
            self.device)).argmax()
        selected_action = selected_action.detach().cpu().numpy()

        self.transition = [state, selected_action]

        return selected_action

    def step(self, action):
        """Take an action and return the response of the env."""
        next_state, reward, done = self.env.step(action)

        self.transition += [reward, next_state, done]

        # N-step transition
        if self.use_n_step:
            one_step_transition = self.memory_n.store(*self.transition)
        # 1-step transition
        else:
            one_step_transition = self.transition

        # add a single step transition
        if one_step_transition:
            self.memory.store(*one_step_transition)

        return next_state, reward, done

    def update_model(self):
        """Update the model by gradient descent."""
        # PER needs beta to calculate weights
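        # (the importance-sampling weights offset the bias introduced by
        #  prioritized, non-uniform sampling)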
        samples = self.memory.sample_batch(self.beta)
        weights = torch.FloatTensor(samples["weights"].reshape(-1, 1)).to(
            self.device)
        indices = samples["indices"]

        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)

        # N-step Learning loss
        # Combine the 1-step and n-step losses to reduce variance;
        # the original Rainbow uses the n-step loss only.
        if self.use_n_step:
            gamma = self.gamma**self.n_step
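            # gamma^n matches the n-step returns already accumulated in memory_n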
            samples = self.memory_n.sample_batch_from_idxs(indices)
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        self.optimizer.zero_grad()
        loss.backward()
        # print(loss)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()

        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.item()

    def train(self, num_frames, plotting_interval=100):
        """Train the agent."""

        if self.log:
            pass
            # config = {'gamma': self.gamma, 'log_interval': plotting_interval, 'learning_rate': self.lr,
            #           'directory': self.path, 'type': 'dqn', 'replay_memory': self.memory_size, 'environment': 'normal', 'seed': self.seed}
            # wandb.init(project='is_os', entity='pydqn', config=config, notes=self.env.reward_function, reinit=True, tags=['report'])
            # wandb.watch(self.dqn)

        self.env.reset()
        state = self.env.get_state()
        won = False
        update_cnt = 0
        losses = []
        scores = []
        score = 0
        frame_cnt = 0
        self.episode_cnt = 0

        for frame_idx in range(1, num_frames + 1):
            frame_cnt += 1
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

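            # PER: anneal the importance-sampling exponent beta toward 1.0 so the
            # bias correction reaches full strength over the course of training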
            fraction = min(frame_cnt / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # truncate the episode after 500 frames
            if frame_cnt == 500:
                done = True

            # if episode ends
            if done:
                if reward > 0:
                    won = True
                self.env.reset()
                state = self.env.get_state()
                self.episode_cnt += 1
                scores.append(score)
                score = 0
                frame_cnt = 0

            # if training is ready
            if len(self.memory) >= self.batch_size:
                loss = self.update_model()
                losses.append(loss)
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()

            # plotting
            if frame_idx % plotting_interval == 0:
                self._plot(frame_idx, scores, losses)

            if frame_idx % 1000 == 0:
                torch.save(self.dqn.state_dict(),
                           f'{self.path}/{frame_idx}.tar')
                print(f"model saved at:\n {self.path}/{frame_idx}.tar")

        # wandb.run.summary['won'] = won
        self.env.close()

    def _compute_dqn_loss(self, samples, gamma):
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["acts"]).to(device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device)

        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
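            # (the online network selects the next action; the target network's
            #  value distribution evaluates it)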
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

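            # Distributional Bellman update: shift/scale the support to
            # r + gamma*(1-done)*z, clamp it to [v_min, v_max], and split each
            # atom's probability mass between the neighbouring bins l and u below.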
            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (torch.linspace(
                0, (self.batch_size - 1) * self.atom_size,
                self.batch_size).long().unsqueeze(1).expand(
                    self.batch_size, self.atom_size).to(self.device))

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                          (next_dist *
                                           (u.float() - b)).view(-1))
            proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                          (next_dist *
                                           (b - l.float())).view(-1))

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)

        return elementwise_loss

    def _target_hard_update(self):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())

    def _plot(self, frame_cnt, scores, losses):
        self.ax1.cla()
        self.ax1.set_title(
            f'frames: {frame_cnt} score: {np.mean(scores[-10:])}')
        self.ax1.plot(scores[-999:], color='red')
        self.ax2.cla()
        self.ax2.set_title(f'loss: {np.mean(losses[-10:])}')
        self.ax2.plot(losses[-999:], color='blue')
        plt.show()
        plt.pause(0.1)

        # needed for wandb to not log nans
        # if frame_cnt < self.start_train + 11:
        #     loss = 0
        # else:
        #     loss = np.mean(losses[-10:])

        if self.log:
            pass
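
# Usage sketch (hypothetical hyperparameters; assumes an env object exposing
# observation_dim / action_dim as used in __init__ above):
# agent = DQNAgent(env, memory_size=10000, batch_size=32)
# agent.train(num_frames=100000)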
Example #4
def runMission(train=False, load_model=False):
    '''
    Run or train the deep Q-network.
    The training method still needs to be added.
    '''

    # Global timer - multi purpose
    start_time = time.time()

    print("\n ---- Running the Deep Q Network ----- \n")

    USE_SAVED_MODEL_FILE = False

    # agent_host, my_mission, my_mission_record, action_space = setupMinecraft()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    path="./Models/Tensorflow/" + FOLDER + "/",
                    load=load_model,
                    trainable=train)

    brain = Brain(epsilon=0.1, action_space=5)

    model.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    cumulative_reward = 0

    print_episode = 1
    total_episodes = 10

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # TensorBoard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Limit TensorFlow to a fraction of the available GPU memory (10% here)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin Session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Restore the model, to keep training
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_CHECKPOINT)
            print("Model restored.")
        else:
            # Initialize global variables
            sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        print("\nProgram took {0:.4f} seconds to initialise\n".format(
            time.time() - start_time))
        start_time = time.time()

        # Running mission
        for episode in range(total_episodes):
            agent_host, my_mission, my_mission_record, action_space = setupMinecraft(
            )
            world_state = reset(agent_host, my_mission, my_mission_record)
            score = 0
            done = False
            craft_sword = False

            # Getting first observation
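            # (busy-wait until the mission reports its first observation)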
            while True:
                world_state = agent_host.getWorldState()
                if world_state.number_of_observations_since_last_state > 0:
                    break

            msg = world_state.observations[-1].text
            observations = json.loads(msg)
            # grid = observations.get(u'floor9x9', 0)
            grid = observations.get(u'floor15x15', 0)
            score = observations.get(u'Hotbar_8_size', 0)
            nearby_entites = observations.get(u'nearby_entites', 0)
            diamonds = []
            zombies = []
            steve_pos = (0, 0)
            steve_life = 20

            for entity in nearby_entites:
                if entity["name"] == "diamond":
                    diamonds.append((entity["x"], entity["z"]))
                if entity["name"] == "steve":
                    steve_pos = ((entity["x"], entity["z"]))
                    steve_life = entity["life"]
                if entity["name"] == "Zombie":
                    zombies.append((entity["x"], entity["z"]))

            state = get_state(steve_pos, diamonds, zombies, grid)

            # brain.linear_epsilon_decay(total_episodes, episode, start=0.3, end=0.05, percentage=0.5)

            world_state = agent_host.getWorldState()
            while world_state.is_mission_running and not done:
                print("-", end="")
                time.sleep(0.01)

                action = brain.choose_action(state, sess, model)
                # print("action:", action_space[action])

                if craft_sword:
                    agent_host.sendCommand("craft diamond_sword")
                    done = True
                else:
                    agent_host.sendCommand(action_space[action])

                time.sleep(0.2)

                world_state = agent_host.getWorldState()

                for error in world_state.errors:
                    print("Error:", error.text)

                # Have we received any observations?
                if world_state.number_of_observations_since_last_state > 0:
                    # if world_state.number_of_observations_since_last_state > 0 and world_state.number_of_rewards_since_last_state > 0:

                    msg = world_state.observations[-1].text
                    observations = json.loads(msg)
                    # print("\n\n", observations, "\n\n")

                    grid = observations.get(u'floor15x15', 0)
                    score = observations.get(u'Hotbar_8_size', 0)
                    nearby_entites = observations.get(u'nearby_entites', 0)
                    diamonds = []
                    zombies = []

                    for entity in nearby_entites:
                        if entity["name"] == "diamond":
                            diamonds.append((entity["x"], entity["z"]))
                        if entity["name"] == "Steve":
                            steve_pos = ((entity["x"], entity["z"]))
                            steve_life = entity["life"]
                        if entity["name"] == "Zombie":
                            zombies.append((entity["x"], entity["z"]))

                    # Debugging - print the state
                    for i in range(6):
                        print(state[i])
                        print()

                    new_state = get_state(steve_pos, diamonds, zombies, grid)

                    # reward = world_state.rewards[-1].getValue()
                    # score += reward

                    # brain.store_transition(state, action, reward, done, new_state)

                    # e, Q_vector = brain.train(model, sess)

                    state = new_state

                    # cumulative_reward += reward

                    # print(score)
                    if score >= 6:
                        craft_sword = True

                    if steve_life != 20:
                        done = True

                    # if done:
                    #     avg_time += info["time"]
                    #     avg_score += info["score"]
                    #     avg_error += e
                    #     avg_reward += cumulative_reward
                    #     cumulative_reward = 0

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print(
                    "Ep:",
                    episode,
                    "\tavg t: {0:.3f}".format(avg_time / print_episode),
                    "\tavg score: {0:.3f}".format(avg_score / print_episode),
                    "\terr {0:.3f}".format(avg_error / print_episode),
                    "\tavg_reward {0:.3f}".format(
                        avg_reward / print_episode),  # avg cumulative reward
                    "\tepsilon {0:.3f}".format(brain.EPSILON),
                    end="")
                print_readable_time(current_time)

                # Save the model's weights and biases to .npz file
                # model.save(sess)
                # save_path = saver.save(sess, MODEL_PATH_SAVE)

                # s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector, score:avg_score/print_episode, avg_t:avg_time/print_episode, epsilon:brain.EPSILON, avg_r:avg_reward/print_episode})
                # writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0

        # model.save(sess, verbose=True)

        # save_path = saver.save(sess, MODEL_CHECKPOINT)
        # print("Model saved in path: %s" % save_path)

        writer.close()
Example #5
def run_MetaNetwork():

	print("\n ---- Running the Meta Network ----- \n")

	MODEL_NAME = "meta15_input6_4_unfrozen"
	DIAMOND_MODEL_NAME = "diamond15_input6_best_unfrozen4_300k"
	ZOMBIE_MODEL_NAME = "zombie15_input6_best_unfrozen4_300k"
	EXPLORE_MODEL_NAME = "explore15_input6_best_unfrozen4_300k"

	MODEL_PATH_SAVE = "./Models/Tensorflow/Meta/"+MODEL_NAME+"/"+MODEL_NAME+".ckpt"

	LOGDIR = "./Logs/"+MODEL_NAME

	USE_SAVED_MODEL_FILE = False

	GRID_SIZE = 10
	LOCAL_GRID_SIZE = 15
	MAP_PATH = None

	RANDOMIZE_MAPS = True

	RENDER_TO_SCREEN = False
	RENDER_TO_SCREEN = True

	env = Environment(wrap = False, 
					  grid_size = GRID_SIZE,
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80, 
					  max_time = 100,
					  food_count = 10,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 2,
					  history = 40, 
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, path="./Models/Tensorflow/Best_Meta/", load=True,  trainable = False)

	diamond_net = Network(local_size=LOCAL_GRID_SIZE, name=DIAMOND_MODEL_NAME, path="./Models/Tensorflow/Best_Dojos/", load=True, trainable = False)

	zombie_net = Network(local_size=LOCAL_GRID_SIZE, name=ZOMBIE_MODEL_NAME, path="./Models/Tensorflow/Best_Dojos/", load=True, trainable = False)

	explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME, path="./Models/Tensorflow/Best_Dojos/", load=True, trainable = False)

	brain = Brain(epsilon=0.0, action_space=3)

	model.setup(brain)
	diamond_net.setup(brain)
	zombie_net.setup(brain)
	explore_net.setup(brain)

	avg_time = 0
	avg_score = 0
	avg_reward = 0
	cumulative_reward = 0

	# Number of episodes
	print_episode = 100
	total_episodes = 100

	saver = tf.train.Saver()

	# Initialising all variables (weights and biases)
	init = tf.global_variables_initializer()

 	# GPU capabilities
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

	# Begin session
	with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

		if USE_SAVED_MODEL_FILE:
			saver.restore(sess, MODEL_PATH_SAVE)
			print("Model restored.")
		else:
			sess.run(init)

		start_time = time.time()

		print("")

		for episode in range(total_episodes):

			if RANDOMIZE_MAPS:
				# Pick a random map file (0: lava, 1: obstacle)
				MAP_PATH = "./Maps/Grid10/map{}.txt".format(np.random.randint(10))
				env.set_map(MAP_PATH)

			state, info = env.reset()
			done = False

			if RENDER_TO_SCREEN:
				env.render()

			while not done:

				dojo = brain.choose_action(state, sess, model)
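				# The meta network picks which specialised sub-network acts
				# (0: diamond, 1: zombie, 2: explore); choose_dojo then queries
				# that sub-network for the low-level action.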
				# print(dojo)

				if dojo == 0:
					dojo_state = state
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the zombie layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, diamond_net, env.number_of_actions(), 0.0)
				elif dojo == 1:
					dojo_state = state
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, zombie_net, env.number_of_actions(), 0.0)
				elif dojo == 2:
					dojo_state = state
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the zombie layer
					action = brain.choose_dojo(dojo_state, sess, explore_net, env.number_of_actions(), 0.0)

				# print(action)

				# Update the environment by performing the chosen action
				new_state, reward, done, info = env.step(action)
				# print(new_state)

				state = new_state

				cumulative_reward += reward

				if RENDER_TO_SCREEN:
					env.render()

				if done:
					avg_time += info["time"]
					avg_score += info["score"]
					avg_reward += cumulative_reward 
					cumulative_reward = 0

			if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1):
				
				current_time = math.floor(time.time()-start_time)
				print("Ep:", episode,
					"\tavg t: {0:.3f}".format(avg_time/print_episode),
					"\tavg score: {0:.3f}".format(avg_score/print_episode),
					"\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward
					"\tepsilon {0:.3f}".format(brain.EPSILON),
					end="")
				print_readable_time(current_time)

				avg_time = 0
				avg_score = 0
				avg_reward = 0
Example #6
def run():

	MODEL_NAME = "explore15_input6"

	FOLDER = "Best_Dojos"

	MODEL_PATH_SAVE = "./Models/Tensorflow/"+FOLDER+"/"+MODEL_NAME+"/"+MODEL_NAME+".ckpt"

	USE_SAVED_MODEL_FILE = False

	GRID_SIZE = 32
	LOCAL_GRID_SIZE = 15
	MAP_NUMBER = 0
	RANDOMIZE_MAPS = False

	# MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
	MAP_PATH = None
	MAP_PATH = "./Maps/Grid{}/impossible_map1.txt".format(GRID_SIZE, MAP_NUMBER)

	print("\n ---- Running the Deep Q Network ----- \n")

	RENDER_TO_SCREEN = False
	RENDER_TO_SCREEN = True

	env = Environment(wrap = False, 
					  grid_size = GRID_SIZE, 
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80, 
					  max_time = 60,
					  food_count = 0,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 0,
					  history = 40,
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = Network(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True, path="./Models/Tensorflow/"+FOLDER+"/", trainable = False)

	brain = Brain(epsilon=0.0, action_space = env.number_of_actions())

	model.setup(brain)

	avg_time = 0
	avg_score = 0
	avg_reward = 0
	cumulative_reward = 0

	# Number of episodes
	print_episode = 100
	total_episodes = 100

	saver = tf.train.Saver()

	# Initialising all variables (weights and biases)
	init = tf.global_variables_initializer()

	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

	# Begin session
	with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

		if USE_SAVED_MODEL_FILE:
			saver.restore(sess, MODEL_PATH_SAVE)
			print("Model restored.")
		else:
			sess.run(init)

		print("")

		for episode in range(total_episodes):
			
			if RANDOMIZE_MAPS:
				MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, np.random.randint(10))
				env.set_map(MAP_PATH)

			# state, info = env.reset()
			state, info = env.quick_reset()
			done = False

			if RENDER_TO_SCREEN:
				env.render()

			while not done:

				action = brain.choose_action(state, sess, model)
				# print(action)

				# Update the environment by performing the chosen action
				new_state, reward, done, info = env.step(action)
				# print(new_state)

				state = new_state

				cumulative_reward += reward

				if RENDER_TO_SCREEN:
					env.render()

				if done:
					avg_time += info["time"]
					avg_score += info["score"]
					avg_reward += cumulative_reward 
					cumulative_reward = 0

			if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1):
				
				print("Ep:", episode,
					"\tavg t: {0:.3f}".format(avg_time/print_episode),
					"\tavg score: {0:.3f}".format(avg_score/print_episode),
					"\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward
					"\tepsilon {0:.3f}".format(brain.EPSILON),
					end="\n")

				avg_time = 0
				avg_score = 0
				avg_reward = 0
Example #7
def train_MetaNetwork():

	print("\n ---- Training the Meta Network ----- \n")

	MODEL_NAME = "meta_grid16_zero_2"
	MODEL_NAME_save = "meta_grid16_zero_2"

	DIAMOND_MODEL_NAME = "diamond_grid16_4"
	ZOMBIE_MODEL_NAME = "zombie_grid16_2"
	EXPLORE_MODEL_NAME = "explore_grid16_2"
	# EXTRA_MODEL_NAME = "extra15_input6_2"

	# MODEL_NAME = "meta15_input6_1M_unfrozen_dojos"
	# DIAMOND_MODEL_NAME = "diamond15_input4_best_unfrozen_at_1M"
	# ZOMBIE_MODEL_NAME = "zombie15_input4_best_unfrozen_at_1M"
	# EXPLORE_MODEL_NAME = "explore15_input4_best_unfrozen_at_1M"

	# MODEL_NAME = "meta15_input6_1M_random_unfrozen_cointoss"
	# DIAMOND_MODEL_NAME = "diamond15_input4_1M_random_unfrozen_cointoss"
	# ZOMBIE_MODEL_NAME = "zombie15_input4_1M_random_unfrozen_cointoss"
	# EXPLORE_MODEL_NAME = "explore15_input4_1M_random_unfrozen_cointoss"k

	FOLDER = "Impossible"
	DOJO_FOLDER = "Impossible"

	MODEL_PATH_SAVE = "./Models/Tensorflow/"+FOLDER+"/"+MODEL_NAME+"/"+MODEL_NAME+".ckpt"

	LOGDIR = "./Logs/"+FOLDER+"/"+MODEL_NAME_save+""

	USE_SAVED_MODEL_FILE = False

	GRID_SIZE = 16
	LOCAL_GRID_SIZE = 15
	MAP_PATH = None

	RANDOMIZE_MAPS = True

	RENDER_TO_SCREEN = False
	# RENDER_TO_SCREEN = True

	env = Environment(wrap = False,
					  grid_size = GRID_SIZE,
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80,
					  max_time = 120,
					  food_count = 0,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 0,
					  history = 100,
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, path="./Models/Tensorflow/"+FOLDER+"/", load=False,  trainable=True)
 
	diamond_net = Network(local_size=LOCAL_GRID_SIZE, name=DIAMOND_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=True, trainable=False)

	zombie_net = Network(local_size=LOCAL_GRID_SIZE, name=ZOMBIE_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=True, trainable=False)

	explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=True, trainable=False)

	# extra_net = Network(local_size=LOCAL_GRID_SIZE, name=EXTRA_MODEL_NAME, path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=False, trainable=True)

	brain = Brain(epsilon=0.05, action_space=3)

	model.setup(brain)
	diamond_net.setup(brain)
	zombie_net.setup(brain)
	explore_net.setup(brain)
	# extra_net.setup(brain)

	score = tf.placeholder(tf.float32, [])
	avg_t = tf.placeholder(tf.float32, [])
	epsilon = tf.placeholder(tf.float32, [])
	avg_r = tf.placeholder(tf.float32, [])

	tf.summary.scalar('error', tf.squeeze(model.error))
	tf.summary.scalar('score', score)
	tf.summary.scalar('average time', avg_t)
	tf.summary.scalar('epsilon', epsilon)
	tf.summary.scalar('avg reward', avg_r)

	avg_time = 0
	avg_score = 0
	avg_error = 0
	avg_reward = 0
	cumulative_reward = 0

	# Number of episodes
	print_episode = 1000
	total_episodes = 100000

	saver = tf.train.Saver()

	# Initialising all variables (weights and biases)
	init = tf.global_variables_initializer()

	# Adds a summary graph of the error over time
	merged_summary = tf.summary.merge_all()

	# TensorBoard capabilities
	writer = tf.summary.FileWriter(LOGDIR)

	# Histogram
	histogram = Histogram(3, 10, total_episodes)

 	# GPU capabilities
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

	# Begin session
	with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

		if USE_SAVED_MODEL_FILE:
			saver.restore(sess, MODEL_PATH_SAVE)
			print("Model restored.")
		else:
			sess.run(init)

		writer.add_graph(sess.graph)

		start_time = time.time()

		print("")

		for episode in range(total_episodes):

			if RANDOMIZE_MAPS:
				# Pick a random map file (0: lava, 1: obstacle)
				MAP_PATH = "./Maps/Grid{}/impossible_map{}.txt".format(GRID_SIZE, np.random.randint(5))
				env.set_map(MAP_PATH)

			# state, info = env.reset()
			state, info = env.quick_reset()
			done = False

			# brain.linear_epsilon_decay(total_episodes, episode, start=1.0, end=0.05, percentage=0.5)

			# brain.linear_alpha_decay(total_episodes, episode)

			if RENDER_TO_SCREEN:
				env.render()

			while not done:

				# Retrieve the Q values from the NN in vector form
				Dojo_vector = sess.run(model.q_values, feed_dict={model.input: state})

				dojo = brain.choose_action(state, sess, model)
				
				histogram.check_section(episode)
				histogram.add(dojo)

				# dojo = np.random.randint(3)
				# dojo = 0

				# print(dojo)

				if dojo == 0:
					dojo_state = state
					# dojo_state[2]=0
					# dojo_state[3]=0
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the zombie layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, diamond_net, env.number_of_actions(), 0.05)

				elif dojo == 1:
					dojo_state = state
					# dojo_state[1]=0
					# dojo_state[3]=0
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 2, 0)# Take out the history layer
					action = brain.choose_dojo(dojo_state, sess, zombie_net, env.number_of_actions(), 0.05)

				elif dojo == 2:
					dojo_state = state
					# dojo_state[1]=0
					# dojo_state[2]=0
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the diamond layer
					# dojo_state = np.delete(dojo_state, 1, 0)# Take out the zombie layer
					action = brain.choose_dojo(dojo_state, sess, explore_net, env.number_of_actions(), 0.05)

				# elif dojo == 3:
				# 	dojo_state = state
				# 	action = brain.choose_dojo(dojo_state, sess, extra_net, env.number_of_actions(), 0.05)

				# print(action)

				# Update the environment by performing the chosen action
				new_state, reward, done, info = env.step(action)

				# print(new_state)

				brain.store_transition_dojo(state, action, reward, done, new_state, dojo)

				# print(tf.trainable_variables(scope=None))

				# if dojo == 0:
				# 	e, Q_vector = brain.train_3_dojos(diamond_net, sess, dojo)

				# elif dojo == 1:
				# 	e, Q_vector = brain.train_3_dojos(zombie_net, sess, dojo)

				# elif dojo == 2:
				# 	e, Q_vector = brain.train_3_dojos(explore_net, sess, dojo)

				# e, Q_vector = brain.train_3(sess, diamond_net, zombie_net, explore_net)

				# e, Q_vector = brain.train(extra_net, sess)

				if done:
					Dojo_vector[:,dojo] = reward
					# print("Reward:", reward)
				else:
					# Q-values of the new state from the meta network
					y_prime = sess.run(model.q_values, feed_dict={model.input: new_state})

					# Equation for training
					maxq = sess.run(model.y_prime_max, feed_dict={model.actions: y_prime})

					# Q-learning target: reward + gamma * max_a' Q(new_state, a')
					Dojo_vector[:,dojo] = reward + (brain.GAMMA * maxq)

				_, e = sess.run([model.optimizer, model.error], feed_dict={model.input: state, model.actions: Dojo_vector})

				state = new_state

				cumulative_reward += reward

				if RENDER_TO_SCREEN:
					env.render()

				if done:
					avg_time += info["time"]
					avg_score += info["score"]
					avg_error += e
					avg_reward += cumulative_reward 
					cumulative_reward = 0

			if (episode%print_episode == 0 and episode != 0) or (episode == total_episodes-1):
				
				current_time = math.floor(time.time()-start_time)
				print("Ep:", episode,
					"\tavg t: {0:.3f}".format(avg_time/print_episode),
					"\tavg score: {0:.3f}".format(avg_score/print_episode),
					"\tErr {0:.3f}".format(avg_error/print_episode),
					"\tavg_reward {0:.3f}".format(avg_reward/print_episode), # avg cumulative reward
					"\tepsilon {0:.3f}".format(brain.EPSILON),
					end="")
				print_readable_time(current_time)

				# Save the model's weights and biases to .npz file
				model.save(sess, name=MODEL_NAME_save)
				# diamond_net.save(sess, name=DIAMOND_MODEL_NAME+"")
				# zombie_net.save(sess, name=ZOMBIE_MODEL_NAME+"")
				# explore_net.save(sess, name=EXPLORE_MODEL_NAME+"")
				# extra_net.save(sess, name=EXTRA_MODEL_NAME+"")

				# save_path = saver.save(sess, MODEL_PATH_SAVE)

				s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Dojo_vector, score:avg_score/print_episode, avg_t:avg_time/print_episode, epsilon:brain.EPSILON, avg_r:avg_reward/print_episode})
				writer.add_summary(s, episode)

				avg_time = 0
				avg_score = 0
				avg_error = 0
				avg_reward = 0

		model.save(sess, verbose=True, name=MODEL_NAME_save)
		# diamond_net.save(sess, verbose=True, name=DIAMOND_MODEL_NAME+"")
		# zombie_net.save(sess, verbose=True, name=ZOMBIE_MODEL_NAME+"")
		# explore_net.save(sess, verbose=True, name=EXPLORE_MODEL_NAME+"")
		# extra_net.save(sess, verbose=True, name=EXTRA_MODEL_NAME+"")

		# save_path = saver.save(sess, MODEL_PATH_SAVE)
		# print("Model saved in path: %s" % save_path)

		writer.close()
		histogram.plot()
Example #8
					  local_size = LOCAL_GRID_SIZE,
					  rate = 80,
					  max_time = 120,
					  food_count = 20,
					  stick_count = 0,
					  obstacle_count = 0,
					  lava_count = 0,
					  zombie_count = 0,
					  history = 0,
					  action_space = 5,
					  map_path = MAP_PATH)

	if RENDER_TO_SCREEN:
		env.prerender()

	model = Network(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True, path="./Models/Tensorflow/"+FOLDER+"/")

	brain = Brain(epsilon=0.1, action_space = env.number_of_actions())

	model.setup(brain)

	score = tf.placeholder(tf.float32, [])
	avg_t = tf.placeholder(tf.float32, [])
	epsilon = tf.placeholder(tf.float32, [])
	avg_r = tf.placeholder(tf.float32, [])

	tf.summary.scalar('error', tf.squeeze(model.error))
	tf.summary.scalar('score', score)
	tf.summary.scalar('average time', avg_t)
	tf.summary.scalar('epsilon', epsilon)
	tf.summary.scalar('avg reward', avg_r)
Example #9
# model_type = 'dqn'
model_path = "policies/22-1-2021_13-44/policy0.tar"

env = LunarLander()
env.reset()
exit_program = False

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()
state = env.get_state()

while not exit_program:
    env.render()
    action = model(
        torch.tensor(state, dtype=torch.float32,
                     device=device).unsqueeze(0)).argmax()

    state, reward, done = env.step(action)

    # Process game events
    for event in pygame.event.get():
        pass  # (event-handling body truncated in the original listing)
Example #10
def main(env, args):
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        model = load_vae()
        best_params = np.load('best_params.npy', allow_pickle=True)
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # env.render()
                # TODO: Choose an action
                action = decide_action(model, state, best_params)
                state, reward, done, _ = env.step(action)

    elif args.DQN:
        network = Network(env, args)
        if os.path.exists('dqn.model'):
            network.model = tf.keras.models.load_model('dqn.model')
        vae = load_vae()
        replay_buffer = collections.deque(maxlen=100000)
        Transition = collections.namedtuple(
            "Transition", ["state", "action", "reward", "done", "next_state"])

        epsilon = 0.25
        gamma = 1

        for i in tqdm(range(10000)):
            state, done = env.reset(), False
            while not done:
                embedding = vae.get_latent_representation(np.array([state]))

                q_values = network.predict(embedding)[0]

                if np.random.uniform() >= epsilon:
                    action = np.argmax(q_values)
                else:
                    action = np.random.randint(0, env.action_space.n)

                next_state, reward, done, _ = env.step(action)

                replay_buffer.append(
                    Transition(
                        embedding, action, reward, done,
                        vae.get_latent_representation(np.array([next_state]))))

                if len(replay_buffer) > 32:
                    minibatch = random.sample(replay_buffer, 32)

                    states = np.array([t.state[0] for t in minibatch])
                    actions = np.array([t.action for t in minibatch])
                    rewards = np.array([t.reward for t in minibatch])
                    dones = np.array([t.done for t in minibatch])
                    next_states = np.array(
                        [t.next_state[0] for t in minibatch])

                    q_values = np.array(network.predict(states))
                    q_values_next = network.predict(next_states)

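                    # Q-learning targets: overwrite each taken action's value with
                    # r + gamma * max_a' Q(s', a'); terminal steps bootstrap with 0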
                    for Q, action, reward, next_Q, is_done in zip(
                            q_values, actions, rewards, q_values_next, dones):
                        Q[action] = reward + (0 if is_done else gamma *
                                              np.max(next_Q))

                    network.train(states, q_values)

                    if i % 100 == 0:
                        network.update_target_weights()

                    if i % 100 == 0:
                        network.save()

                state = next_state

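            # Decay epsilon exponentially from 0.25 to 0.01 over the first 5000
            # episodes (linear interpolation of log(epsilon) against the episode index)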
            epsilon = np.exp(
                np.interp(env.episode + 1, [0, 5000],
                          [np.log(0.25), np.log(0.01)]))

    elif args.evolution:
        es = train(load_from='saved_model.pkl')
        np.save('best_params', es.best.get()[0])
        best_params = es.best.get()[0]
        play(best_params, render=True)
Example #11
def train():

    MODEL_NAME = "diamond9_input5"
    MODEL_NAME_save = "diamond9_input5"

    FOLDER = "Best_Dojos9"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME_save + "/" + MODEL_NAME_save + ".ckpt"

    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save + "_2"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 9
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = False

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=10,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/" + FOLDER + "/")

    brain = Brain(epsilon=0.1, action_space=env.number_of_actions())

    model.setup(brain)

    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    cumulative_reward = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 10000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # TensorBoard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Limit TensorFlow to a fraction of the available GPU memory (10% here)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        # for episode in range(50):
        # 	state, info = env.reset()
        # 	done = False

        # 	if RENDER_TO_SCREEN:
        # 		env.render()

        # 	while not done:
        # 		action = brain.choose_action(state, sess, model)

        # 		new_state, reward, done, info = env.step(action)

        # 		brain.store_transition(state, action, reward, done, new_state)

        # 		state = new_state

        # 		if RENDER_TO_SCREEN:
        # 			env.render()

        # print("\nREPLAY MEMORY INITIALISED")
        # print(brain.memCntr)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            # brain.linear_epsilon_decay(total_episodes, episode, start=0.4, end=0.05, percentage=0.8)

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                action = brain.choose_action(state, sess, model)
                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                # e, Q_vector = brain.train_batch(4, model, sess)

                e, Q_vector = brain.train(model, sess)

                state = new_state

                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e
                    avg_reward += cumulative_reward
                    cumulative_reward = 0

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print(
                    "Ep:",
                    episode,
                    "\tavg t: {0:.3f}".format(avg_time / print_episode),
                    "\tavg score: {0:.3f}".format(avg_score / print_episode),
                    "\tErr {0:.3f}".format(avg_error / print_episode),
                    "\tavg_reward {0:.3f}".format(
                        avg_reward / print_episode),  # avg cumulative reward
                    "\tepsilon {0:.3f}".format(brain.EPSILON),
                    end="")
                print_readable_time(current_time)

                # Save the model's weights and biases to .npz file
                model.save(sess, name=MODEL_NAME_save)
                # save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={
                                 model.input: state,
                                 model.actions: Q_vector,
                                 score: avg_score / print_episode,
                                 avg_t: avg_time / print_episode,
                                 epsilon: brain.EPSILON,
                                 avg_r: avg_reward / print_episode
                             })
                writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0

        model.save(sess, verbose=True, name=MODEL_NAME_save)

        # save_path = saver.save(sess, MODEL_PATH_SAVE)
        # print("Model saved in path: %s" % save_path)

        writer.close()
Example #12
def train():

    MODEL_NAME = "diamond_local15_maps"

    MODEL_PATH_SAVE = "./Models/Tensorflow/Maps/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = True

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False
    RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=3,
                      obstacle_count=1,
                      lava_count=1,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/Maps/")

    brain = Brain(epsilon=0.05, action_space=env.number_of_actions())

    model.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # TensorBoard capabilities
    # writer = tf.summary.FileWriter(LOGDIR)

    # Limit TensorFlow to a fraction of the available GPU memory (30% here)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")

        sess.run(init)

        # writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Pick a random map file (0: lava, 1: obstacle)
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes,
                                       episode,
                                       start=0.5,
                                       end=0.05,
                                       percentage=0.6)
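            # (linearly anneals epsilon from 0.5 to 0.05, presumably over the
            #  first 60% of episodes, as the `percentage` argument suggests)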

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                # Retrieve the Q values from the NN in vector form
                # Q_vector = sess.run(model.q_values, feed_dict={model.input: state})

                action = brain.choose_action(state, sess, model)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                e = brain.train(model, sess)

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print("Ep:",
                      episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npz file
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                # s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector})
                # writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)
Example #13
def train_MetaNetwork():

    print("\n ---- Training the Meta Network ----- \n")

    MODEL_NAME = "meta_network_local15"
    DIAMOND_MODEL_NAME = "diamond_dojo_local15"
    ZOMBIE_MODEL_NAME = "zombie_dojo_local15"
    # EXPLORE_MODEL_NAME = "explore_dojo_local15"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=200,
                      food_count=3,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=1,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True)

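    # Pre-trained specialist networks, loaded and frozen (trainable=False) while the meta network is trained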
    diamond_net = Network(local_size=LOCAL_GRID_SIZE,
                          name=DIAMOND_MODEL_NAME,
                          load=True,
                          trainable=False)

    zombie_net = Network(local_size=LOCAL_GRID_SIZE,
                         name=ZOMBIE_MODEL_NAME,
                         load=True,
                         trainable=False)

    # explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME, load=True, trainable = False)

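    # The meta controller picks between 2 dojos: 0 = diamond net, 1 = zombie net (the explore dojo is disabled above)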
    brain = Brain(epsilon=0.01, action_space=2)

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)
    # explore_net.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # Print interval and total number of training episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # TensorBoard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes,
                                       episode,
                                       start=0.3,
                                       end=0.02,
                                       percentage=0.5)

            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:

                # Retrieve the Q values from the NN in vector form
                Dojo_vector = sess.run(model.q_values,
                                       feed_dict={model.input: state})

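                # The meta network chooses which specialist dojo (sub-network) acts on this step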
                dojo = brain.choose_action(state, sess, model)

                # print(dojo)

                if dojo == 0:
                    # state[2] = 0 # Zero out the zombies layer
                    # Work on a copy so the full state is kept for the meta network update below
                    dojo_state = np.delete(state, 2, 0)  # Take out the zombie layer
                    dojo_state = np.delete(dojo_state, 5, 0)  # Take out the history layer
                    action = brain.choose_dojo(dojo_state, sess, diamond_net,
                                               env.number_of_actions(), 0.01)
                elif dojo == 1:
                    # state[1] = 0 # Zero out the diamond layer
                    dojo_state = np.delete(dojo_state := np.delete(state, 1, 0), 5, 0)  # Take out the diamond and history layers
                    action = brain.choose_dojo(dojo_state, sess, zombie_net,
                                               env.number_of_actions(), 0.01)
                # elif dojo == 2:
                #     # Unreachable with action_space=2 and needs explore_net, which is commented out above
                #     dojo_state = np.delete(state, 1, 0)  # Take out the diamond layer
                #     dojo_state = np.delete(dojo_state, 2, 0)  # Take out the zombie layer
                #     action = brain.choose_dojo(dojo_state, sess, explore_net,
                #                                env.number_of_actions(), 0.01)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                # print(new_state)

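                # Store the meta-level transition: the chosen dojo is the meta network's action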
                brain.store_transition(state, dojo, reward, done, new_state)

                ## Standard training with learning after every step

                # print(tf.trainable_variables(scope=None))

                if done:
                    Dojo_vector[:, dojo] = reward
                    # print("Reward:", reward)
                else:
                    # Q values of the next state
                    y_prime = sess.run(model.q_values,
                                       feed_dict={model.input: new_state})

                    # Maximum Q value over the next state's actions
                    maxq = sess.run(model.y_prime_max,
                                    feed_dict={model.actions: y_prime})

                    # Q-learning target: r + gamma * max_a' Q(s', a')
                    Dojo_vector[:, dojo] = reward + (brain.GAMMA * maxq)

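                # Single optimisation step on the meta network using the updated target vector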
                _, e = sess.run([model.optimizer, model.error],
                                feed_dict={
                                    model.input: state,
                                    model.actions: Dojo_vector
                                })

                ## Training using replay memory
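                # (left as a placeholder: the meta network is trained online above, without sampling from a replay buffer)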

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0
                    and episode != 0) or (episode == total_episodes - 1):

                current_time = math.floor(time.time() - start_time)
                print("Ep:",
                      episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npz file
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={
                                 model.input: state,
                                 model.actions: Dojo_vector
                             })
                writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)

        writer.close()