def play(sess,
         agent,
         no_plays,
         log_dir=None,
         show_ui=False,
         show_action=False):
    """
    Use a trained agent to play a required number of games
    :param sess: op, session instance from tensorflow
    :param agent: tensor, trained agent structure/graph
    :param no_plays: int, you get it
    :param log_dir: string, place to store the log files during gameplay
    :param show_ui: bool, True  -> Show game screen
                          False -> Should I explain this?
    :param show_action: bool, True  -> Show the actions taken by the trained agent
                              False -> Hmm, what can this be?
    :return: just prints the results with nothing being returned
    """
    rewards = []
    for p in range(no_plays):
        observation = env.reset()
        observation = ops.convert_to_gray_n_resize(observation)
        observation = np.expand_dims(observation, axis=2)
        state = np.repeat(observation, 4, axis=2)
        state = np.expand_dims(state, axis=0)
        done = False
        reward = 0
        while not done:
            if show_ui:
                env.render()
            if np.random.rand() < 0.05:
                action = env.action_space.sample()
            else:
                action = np.argmax(sess.run(agent, feed_dict={X_input: state}))
            if show_action:
                print(action)
            new_state, r, done, _ = env.step(action)
            next_state = ops.convert_to_gray_n_resize(new_state)
            next_state = np.expand_dims(next_state, axis=2)
            next_state = np.expand_dims(next_state, axis=0)
            state = np.append(next_state, state[:, :, :, :3], axis=3)
            reward += r
        rewards.append(reward)
        print("Game: {}/{}".format(p + 1, no_plays))
        print("Reward: {}".format(reward))
        if log_dir is not None:
            with open(log_dir + "/log.txt", "a") as log_file:
                log_file.write("Game: {}/{}\n".format(p + 1, no_plays))
                log_file.write("Reward: {}\n".format(reward))
    print(
        "------------------------------------------------------------------------------------------------------"
    )
    print("Best reward: {}".format(np.amax(rewards)))
    print("Average reward: {}".format(np.mean(rewards)))
    if log_dir is not None:
        with open(log_dir + "/log.txt", "a") as log_file:
            log_file.write("Best reward: {}\n".format(np.amax(rewards)))
            log_file.write("Average reward: {}\n".format(np.mean(rewards)))
def collect_rand_observations(replay_memory, sess=None, agent=None):
    """
    Collects mc.rand_observation_time number of random observations and stores them in deque
    :param replay_memory: deque, deque instance
    :param agent: Tensor op, the agent architecture
    :param sess: op, the restored session to restore
    :return: ndarray, stored as follows:
                      (state, action, reward, next_states, done, life_lost)
    """
    print("Collecting Random Observations")
    observation = env.reset()
    observation = ops.convert_to_gray_n_resize(observation)
    observation = np.expand_dims(observation, axis=2)
    state = np.repeat(observation, 4, axis=2)
    state = np.expand_dims(state, axis=0)
    lives_left = 5
    if len(replay_memory) < mc.rand_observation_time:
        for i in range(int(mc.rand_observation_time)):
            if sess is None:
                action = env.action_space.sample()
            else:
                q_prediction = sess.run(agent, feed_dict={X_input: state})
                action = np.argmax(q_prediction)
            next_state, reward, done, info = env.step(action)
            next_state = ops.convert_to_gray_n_resize(next_state)
            next_state = np.expand_dims(next_state, axis=2)
            next_state = np.expand_dims(next_state, axis=0)
            next_states = np.append(next_state, state[:, :, :, :3], axis=3)
            life_lost = 0
            if lives_left - info['ale.lives'] > 0:
                life_lost = 1
                lives_left -= 1
            replay_memory.append(
                (state, action, reward, next_states, done, life_lost))
            state = next_states
            if done:
                lives_left = 5
                observation = env.reset()
                observation = ops.convert_to_gray_n_resize(observation)
                observation = np.expand_dims(observation, axis=2)
                state = np.repeat(observation, 4, axis=2)
                state = np.expand_dims(state, axis=0)
            print("\rRandom Observation: {}/{}".format(
                i + 1, mc.rand_observation_time),
                  end="")
            sys.stdout.flush()
    return replay_memory
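
# A tiny self-contained check of the frame-stacking update used above: the
# newest preprocessed frame becomes channel 0 and the oldest channel is
# dropped, so the state always holds the 4 most recent frames (newest first).
# The 84x84 shape follows the preprocessing assumed throughout these examples.
import numpy as np  # already imported by the original module; repeated so this check stands alone

_demo_state = np.zeros((1, 84, 84, 4))   # stand-in for the current 4-frame stack
_demo_frame = np.ones((1, 84, 84, 1))    # stand-in for the newly observed frame
_demo_next = np.append(_demo_frame, _demo_state[:, :, :, :3], axis=3)
assert _demo_next.shape == (1, 84, 84, 4)
assert _demo_next[0, 0, 0, 0] == 1.0     # the newest frame occupies channel 0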
Example #3
    def training_data(self):
        # TODO: Remove the first 5 to 10 frames from each episode?
        train_input = []
        train_action = []
        train_target = []

        episode_dir = sorted([
            self.data_dir + "/train/" + p
            for p in os.listdir(self.data_dir + "/train/")
        ])
        n_episodes = len(episode_dir)
        print("Reading training images!")
        for e_i, episode in enumerate(episode_dir):
            print("Reading training image from episode: {}/{}".format(
                e_i + 1, n_episodes))
            frames = sorted(
                [f for f in os.listdir(episode) if f.endswith(".png")])
            with open(episode + "/action.txt") as action_file:
                action_log = action_file.read()

            train_action.extend(
                [int(a) for a in action_log.split("\n")[3:-1]])

            # TODO: This currently handles grayscale images only
            for f_indx in range(len(frames)):
                frames_to_use = frames[f_indx:f_indx + 5]
                if len(frames_to_use) < 5:
                    continue
                for i, f in enumerate(frames_to_use):
                    img = ops.convert_to_gray_n_resize(
                        np.array(Image.open(episode + "/" + f)))
                    img = np.expand_dims(img, axis=2)
                    if i == 0:
                        train_frames = img.copy()
                    elif i < 4:
                        train_frames = np.append(train_frames, img, axis=2)
                    else:
                        train_target.append(img)
                train_input.append(train_frames)
        print("Input dataset constructed")
        train_input = np.array(train_input).reshape(
            [-1, 84, 84, 4])  # each sample stacks 4 consecutive grayscale frames
        train_action = np.array(train_action).reshape([-1, 1])
        train_target = np.array(train_target).reshape([-1, 84, 84, 1])

        return train_input, train_action, train_target
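
    # Usage sketch (the variable names here are illustrative, not part of the
    # class): the three arrays returned above line up one-to-one.
    #   inputs, actions, targets = dataset.training_data()
    #   inputs.shape  -> (N, 84, 84, 4)  four consecutive grayscale frames
    #   actions.shape -> (N, 1)          action logged for that frame window
    #   targets.shape -> (N, 84, 84, 1)  the frame that follows the window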
Example #4
def play_n_collect(sess, agent, no_plays, log_dir=None, show_ui=False, show_action=False):
    """
    Use a trained agent to play a required number of games
    :param sess: op, session instance from tensorflow
    :param agent: tensor, trained agent structure/graph
    :param no_plays: int, you get it
    :param log_dir: string, place to store the log files during gameplay
    :param show_ui: bool, True  -> Show game screen
                          False -> Should I explain this?
    :param show_action: bool, True  -> Show the actions taken by the trained agent
                              False -> Hmm, what can this be?
    :return: just prints the results with nothing being returned
    """
    rewards = []
    main_dir, train_dir, test_dir = make_directories()
    step = 0
    for p in range(no_plays):
        frame = 0
        observation = env.reset()
        # Episodes 0-999 go to the training set, the rest to the test set
        if p < 1000:
            episode_path = train_dir + "/{:05d}".format(p)
        else:
            episode_path = test_dir + "/{:05d}".format(p % 1000)
        os.mkdir(episode_path)
        # Save the first raw frame of the episode
        plt.imsave(arr=observation, fname=episode_path + "/{:06d}.png".format(frame))

        observation = ops.convert_to_gray_n_resize(observation)
        observation = np.expand_dims(observation, axis=2)
        state = np.repeat(observation, 4, axis=2)
        state = np.expand_dims(state, axis=0)
        done = False
        reward = 0
        while not done:
            if show_ui:
                env.render()
            if np.random.rand() < 0.07:
                action = env.action_space.sample()
            else:
                action = np.argmax(sess.run(agent, feed_dict={X_input: state}))

            # Save the action taken
            with open(episode_path + "/action.txt", "a") as log:
                log.write("{}\n".format(action))

            if show_action:
                print(action)
            frame += 1
            step += 1
            new_state, r, done, _ = env.step(action)
            plt.imsave(arr=new_state, fname=episode_path + "/{:06d}.png".format(frame))
            next_state = ops.convert_to_gray_n_resize(new_state)
            next_state = np.expand_dims(next_state, axis=2)
            next_state = np.expand_dims(next_state, axis=0)
            state = np.append(next_state, state[:, :, :, :3], axis=3)
            reward += r
        rewards.append(reward)
        print("Step: {}/500e3".format(step))
        print("Game: {}/{}".format(p + 1, no_plays))
        print("Reward: {}\n".format(reward))
        if log_dir is not None:
            with open(log_dir + "/log.txt", "a") as log_file:
                log_file.write("Game: {}/{}\n".format(p + 1, no_plays))
                log_file.write("Reward: {}\n".format(reward))
    print("------------------------------------------------------------------------------------------------------")
    print("Best reward: {}".format(np.amax(rewards)))
    print("Average reward: {}".format(np.mean(rewards)))
    if log_dir is not None:
        with open(log_dir + "/log.txt", "a") as log_file:
            log_file.write("Best reward: {}\n".format(np.amax(rewards)))
            log_file.write("Average reward: {}\n".format(np.mean(rewards)))
def train(train_model=True):
    """
    Trains the agent with hyperparameters and other info loaded from mission_control_<game>.py file
    :param train_model: bool, True  -> Trains the agent
                              False -> Loads the LATEST trained agent and plays
    :return: None
    """
    with tf.variable_scope("Action_agent"):
        agent = get_agent(X_input)

    with tf.variable_scope("Target_agent"):
        target_agent = get_agent(X_input)

    loss = tf.losses.mean_squared_error(labels=Y_target, predictions=agent)

    var_list = tf.trainable_variables()
    agent_vars = [t for t in var_list if t.name.startswith("Action_agent")]

    optimizer = tf.train.RMSPropOptimizer(learning_rate=mc.learning_rate,
                                          momentum=mc.momentum,
                                          epsilon=mc.epsilon).minimize(
                                              loss, var_list=agent_vars)

    # Create the summary for tensorboard
    # TODO: Plot the rewards per episode
    tf.summary.scalar(name='loss', tensor=loss)
    tf.summary.scalar(name='max_q_value', tensor=tf.reduce_max(
        agent))  # TODO: Replace this with the op used in the paper
    tf.summary.histogram(name='q_values_hist', values=agent)

    # TODO: Plot the length of each episode
    # TODO: Plot the argmax of the action taken for each play

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        if train_model:
            print("Training agent!")
            print("Preparing required directories")

            # Initialize global variables
            sess.run(init)

            # Used to measure time taken
            t1 = time.time()

            # Acts as the global step counter, but is a plain Python int rather than a tensor
            step = 0

            # Get the initial epsilon
            prob_rand = mc.prob_random

            # TODO: Change this ASAP
            # Add epsilon to Tensorboard (prob_rand is a Python float here, so
            # this summary only records its value at graph-construction time)
            tf.summary.scalar('epsilon', tensor=prob_rand)
            summary_op = tf.summary.merge_all()

            replay_memory = deque()

            if mc.load_trained_model:
                saved_models = os.listdir(mc.logdir)
                latest_saved_model = sorted(saved_models)[-1]
                saver.restore(
                    sess,
                    tf.train.latest_checkpoint(mc.logdir + latest_saved_model +
                                               "/saved_models/"))
                with open(
                        mc.logdir + latest_saved_model +
                        "/saved_models/checkpoint", 'r') as checkpoint_file:
                    line_1 = checkpoint_file.readline()
                    step = int(line_1[30:-2])
                tensorboard_dir = mc.logdir + latest_saved_model + "/Tensorboard/"
                saved_model_dir = mc.logdir + latest_saved_model + "/saved_models/"
                log_dir = mc.logdir + latest_saved_model + "/logs/"

                replay_memory = collect_rand_observations(
                    replay_memory, sess, agent)
            else:
                replay_memory = collect_rand_observations(
                    replay_memory)  # Seed with mc.rand_observation_time random observations

            if not mc.load_trained_model:
                tensorboard_dir, saved_model_dir, log_dir = make_directories(
                    mc.logdir)

            print("Tensorboard files stores in: {}".format(tensorboard_dir))
            print("Saved models stored in: {}".format(saved_model_dir))
            print("Log files stores in: {}".format(log_dir))

            # File writer for tensorboard
            writer = tf.summary.FileWriter(logdir=tensorboard_dir,
                                           graph=sess.graph)

            game_rewards = []

            # Save current mission control file
            with open("mission_control_breakout.py", "r") as mc_file:
                mission_control_file = mc_file.read()
                with open(log_dir + "/mission_control.txt", "w") as mc_writer:
                    mc_writer.write(mission_control_file)

            for e in range(mc.n_episodes):
                with open(log_dir + "/log.txt", "a") as log_file:
                    log_file.write(
                        "--------------------------Episode: {}/{}------------------------------\n"
                        .format(e + 1, mc.n_episodes))
                print(
                    "--------------------------Episode: {}/{}------------------------------\n"
                    .format(e + 1, mc.n_episodes))
                # Prepare first observation
                observation = env.reset()
                observation = ops.convert_to_gray_n_resize(observation)
                observation = np.expand_dims(observation, axis=2)
                state = np.repeat(observation, 4, axis=2)
                state = np.expand_dims(state, axis=0)

                # TODO: Only for breakout
                lives_left = 5
                log_q_values = []
                episode_rewards = []
                for t in itertools.count():
                    mini_batch = random.sample(replay_memory, mc.batch_size)

                    agent_input = []
                    agent_target = []
                    for s in range(len(mini_batch)):
                        state_ = mini_batch[s][0]
                        action_ = mini_batch[s][1]
                        reward_ = mini_batch[s][2]
                        next_state_ = mini_batch[s][3]
                        done_ = mini_batch[s][4]
                        life_lost = mini_batch[s][5]

                        agent_input.append(state_[0])
                        target = sess.run(target_agent,
                                          feed_dict={X_input: state_})
                        if done_ or life_lost == 1:
                            target[0, action_] = reward_
                            agent_target.append(target[0])
                        else:
                            agent_output = sess.run(
                                target_agent, feed_dict={X_input: next_state_})
                            target[0, action_] = reward_ + mc.gamma * (
                                np.amax(agent_output))
                            agent_target.append(target[0])
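
                    # The loop above builds the standard DQN regression target
                    # (using the target network): y = r + gamma * max_a' Q_target(s', a')
                    # for ordinary transitions, and y = r when the episode ends
                    # or a life is lost.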

                    # Train the agent for mc.fit_epochs iterations on this mini-batch
                    for i in range(mc.fit_epochs):
                        sess.run(optimizer,
                                 feed_dict={
                                     X_input: agent_input,
                                     Y_target: agent_target
                                 })

                    # Copy trained parameters from the agent to the target network
                    if (step + 1) % mc.target_network_update == 0:
                        copy_parameters(sess)

                    l, summary = sess.run([loss, summary_op],
                                          feed_dict={
                                              X_input: agent_input,
                                              Y_target: agent_target
                                          })
                    writer.add_summary(summary, global_step=step)

                    print("\rStep: {} ({}), Episode: {}/{}, Loss: {}".format(
                        t, step, e + 1, mc.n_episodes, l),
                          end="")
                    sys.stdout.flush()

                    # Collect the next observation
                    if np.random.rand() < prob_rand:
                        action = env.action_space.sample()
                    else:
                        q_prediction = sess.run(agent,
                                                feed_dict={X_input: state})
                        action = np.argmax(q_prediction)
                        log_q_values.extend(q_prediction)
                    next_state, reward, done, info = env.step(action)
                    next_state = ops.convert_to_gray_n_resize(next_state)
                    next_state = np.expand_dims(next_state, axis=2)
                    next_state = np.expand_dims(next_state, axis=0)
                    next_states = np.append(next_state,
                                            state[:, :, :, :3],
                                            axis=3)

                    life_lost = 0
                    if lives_left - info['ale.lives'] > 0:
                        life_lost = 1
                        lives_left -= 1

                    # Remove old samples from replay memory if it's full
                    if len(replay_memory) > mc.observation_time:
                        replay_memory.popleft()

                    replay_memory.append(
                        (state, action, reward, next_states, done, life_lost))
                    state = next_states
                    episode_rewards.append(reward)
                    step += 1

                    if (step + 1) % 10000 == 0:
                        # Save the agent
                        saved_path = saver.save(sess,
                                                saved_model_dir + '/model',
                                                global_step=step)

                    prob_rand = anneal_epsilon(step)

                    if mc.show_ui:
                        env.render()

                    if done:
                        break

                with open(log_dir + "/log.txt", "a") as log_file:
                    log_file.write(
                        "Step: {} ({}), Play: {}/{}, Loss: {}\n".format(
                            t, step, e + 1, mc.n_episodes, l))
                    log_file.write("Reward Obtained: {}\n".format(
                        np.sum(episode_rewards)))
                    game_rewards.append(np.sum(episode_rewards))
                    x_val = np.arange(e + 1)
                    plt.plot(x_val, game_rewards)
                    plt.xlabel("Episode")
                    plt.ylabel("Reward Obtained")
                    plt.savefig("{}/Rewards.png".format(log_dir))
                    plt.close()

                    if log_q_values:
                        log_file.write("Average Q Value: {}\n".format(
                            np.mean(log_q_values)))
                    else:
                        log_file.write("All of the actions were random\n")

                print("\nReward Obtained: {}".format(np.sum(episode_rewards)))

                if log_q_values:
                    print("Average Q Value: {}".format(np.mean(log_q_values)))
                else:
                    print("All of the actions were random")

            print("Time taken of {} Plays on your potato: {:.4f}s".format(
                mc.n_episodes,
                time.time() - t1))
            print("Average time for each Play: {:.4f}s".format(
                (time.time() - t1) / mc.n_episodes))
            print("Tensorboard files saved in: {}".format(tensorboard_dir))
            print("Model saved in: {}".format(saved_path))
            print(
                "Model parameters stored in: {}".format(log_dir +
                                                        "mission_control.txt"))
            print("Agent get to roll!")
            with open(log_dir + "/log.txt", "a") as log_file:
                log_file.write(
                    "Time taken for {} episodes on your potato: {:.4f}s\n".
                    format(mc.n_episodes,
                           time.time() - t1))
                log_file.write(
                    "Average time for each episode: {:.4f}s\n".format(
                        (time.time() - t1) / mc.n_episodes))
        else:
            # Get the latest trained model
            saved_models = os.listdir(mc.logdir)
            latest_saved_model = sorted(saved_models)[-1]
            saver.restore(
                sess,
                tf.train.latest_checkpoint(mc.logdir + latest_saved_model +
                                           "/saved_models/"))
            print("Getting model from: {}".format(mc.logdir +
                                                  latest_saved_model +
                                                  "/saved_models/"))
            print(
                "------------------------Playing----------------------------")
            play(sess=sess,
                 agent=agent,
                 no_plays=mc.n_episodes,
                 log_dir=None,
                 show_ui=mc.show_ui,
                 show_action=mc.show_action)
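
# train() delegates two pieces to helpers defined elsewhere in the repo:
# anneal_epsilon() for the exploration schedule and copy_parameters() for the
# target-network sync. The sketches below only illustrate how such helpers are
# commonly written for DQN; the fields mc.final_epsilon and mc.anneal_steps are
# hypothetical and not taken from the actual mission_control file.
def _anneal_epsilon_sketch(step):
    """Linearly decay epsilon from mc.prob_random towards a final value (illustrative)."""
    frac = min(step / float(mc.anneal_steps), 1.0)
    return mc.prob_random + frac * (mc.final_epsilon - mc.prob_random)


def _copy_parameters_sketch(sess):
    """Copy Action_agent weights into Target_agent via tf.assign (illustrative)."""
    action_vars = sorted([v for v in tf.trainable_variables()
                          if v.name.startswith("Action_agent")],
                         key=lambda v: v.name)
    target_vars = sorted([v for v in tf.trainable_variables()
                          if v.name.startswith("Target_agent")],
                         key=lambda v: v.name)
    for a_var, t_var in zip(action_vars, target_vars):
        sess.run(tf.assign(t_var, a_var))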