Example #1
    def train():
        """Contains the training and evaluation loops"""
        my_replay_memory = ReplayMemory(size=Trainer.MEMORY_SIZE,
                                        batch_size=Trainer.BATCH_SIZE)  # (★)

        eps_sched = EpsScheduler(
            replay_memory_start_size=Trainer.REPLAY_MEMORY_START_SIZE,
            max_frames=Trainer.MAX_FRAMES)

        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)

            global_frame_index = 0
            rewards = []
            loss_list = []

            while global_frame_index < Trainer.MAX_FRAMES:
                epoch_frame_index = 0
                while epoch_frame_index < Trainer.EPOCH_FRAME_COUNT:
                    episode_done = game.reset(sess)
                    episode_reward_sum = 0
                    for _ in range(Trainer.MAX_EPISODE_LENGTH):
                        action = networks.main_dqn.get_action(
                            sess,
                            global_frame_index,
                            game.state,
                            evaluation=False)
                        processed_new_frame, reward, terminal, episode_done, _ = game.step(
                            sess, action)
                        global_frame_index += 1
                        epoch_frame_index += 1
                        episode_reward_sum += reward

                        # Clip the reward
                        clipped_reward = Trainer.clip_reward(reward)

                        # (7★) Store transition in the replay memory
                        my_replay_memory.add_experience(
                            action=action,
                            frame=processed_new_frame[:, :, 0],
                            reward=clipped_reward,
                            terminal=episode_done)

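                        # Train and sync the target network only after the replay
                        # memory holds more than REPLAY_MEMORY_START_SIZE transitions.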
                        if global_frame_index % Trainer.UPDATE_FREQ == 0 and global_frame_index > Trainer.REPLAY_MEMORY_START_SIZE:
                            loss = Trainer.learn(
                                sess,
                                my_replay_memory,
                                networks,
                                Trainer.BATCH_SIZE,
                                gamma=Trainer.DISCOUNT_FACTOR)  # (8★)
                            loss_list.append(loss)
                        if global_frame_index % Trainer.NETW_UPDATE_FREQ == 0 and global_frame_index > Trainer.REPLAY_MEMORY_START_SIZE:
                            networks.update_target_network(sess)  # (9★)

                        if terminal:
                            terminal = False
                            break

                    rewards.append(episode_reward_sum)

                    # Output the progress:
                    if len(rewards) % 10 == 0:
                        # Scalar summaries for tensorboard
                        if global_frame_index > Trainer.REPLAY_MEMORY_START_SIZE:
                            summ = sess.run(performance_summaries,
                                            feed_dict={
                                                loss_ph: np.mean(loss_list),
                                                reward_ph:
                                                np.mean(rewards[-100:])
                                            })

                            SUMM_WRITER.add_summary(summ, global_frame_index)
                            loss_list = []
                        # Histogram summaries for tensorboard
                        summ_param = sess.run(param_summaries)
                        SUMM_WRITER.add_summary(summ_param, global_frame_index)

                        print(len(rewards), global_frame_index,
                              np.mean(rewards[-100:]))
                        with open('rewards.dat', 'a') as reward_file:
                            print(len(rewards),
                                  global_frame_index,
                                  np.mean(rewards[-100:]),
                                  file=reward_file)

                ########################
                ###### Evaluation ######
                ########################
                terminal = True
                gif = True
                frames_for_gif = []
                eval_rewards = []
                evaluate_frame_number = 0

                for _ in range(EVAL_STEPS):
                    if terminal:
                        episode_done = game.reset(sess, evaluation=True)
                        episode_reward_sum = 0
                        terminal = False

                    # Fire (action 1) when a life was lost or the game has just
                    # started, so that the agent does not stand around doing nothing.
                    # When playing with other environments, you might want to change this.
                    action = 1 if episode_done else \
                        networks.main_dqn.get_action(sess, global_frame_index, game.state, evaluation=True)

                    processed_new_frame, reward, terminal, episode_done, new_frame = game.step(
                        sess, action)
                    evaluate_frame_number += 1
                    episode_reward_sum += reward

                    if gif:
                        frames_for_gif.append(new_frame)
                    if terminal:
                        eval_rewards.append(episode_reward_sum)
                        gif = False  # Save only the first game of the evaluation as a gif

                print("Evaluation score:\n", np.mean(eval_rewards))
                try:
                    generate_gif(global_frame_index, frames_for_gif,
                                 eval_rewards[0], PATH)
                except IndexError:
                    print("No evaluation game finished")

                # Save the network parameters
                saver.save(sess,
                           PATH + '/my_model',
                           global_step=global_frame_index)
                frames_for_gif = []

                # Show the evaluation score in tensorboard
                summ = sess.run(
                    eval_scope_summary,
                    feed_dict={eval_scope_ph: np.mean(eval_rewards)})
                SUMM_WRITER.add_summary(summ, global_frame_index)
                with open('rewardsEval.dat', 'a') as eval_reward_file:
                    print(global_frame_index,
                          np.mean(eval_rewards),
                          file=eval_reward_file)
Example #2
# Imports used by this example (TF 1.x API); ReplayMemory and the epsilon
# schedule / OBSERVE / TRAIN_FREQ / UPDATE_TIME constants come from the
# surrounding project.
import datetime
import random
from collections import deque

import numpy as np
import tensorflow as tf


class QAgent:
    def __init__(self, session):
        self.time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.session = session
        self.action_size = 4
        self.gamma = 0.99
        self.epsilon = INITIAL_EPSILON
        self.batch_size = 32
        self.learning_rate = 0.00001

        self.replay_mem = ReplayMemory(size=1000000,
                                       frame_height=84,
                                       frame_width=84,
                                       agent_history_length=4,
                                       batch_size=32)
        self.tick = 0
        self.episode = 0
        self.total_reward = 0
        self.last_n_game_reward = deque(maxlen=100)

        # create q network
        self.state_input, self.q_values, self.best_action = self.create_model(
            'main')

        # create target q network
        self.state_input_t, self.q_values_t, self.best_action_t = self.create_model(
            'target')

        # update dqn vars
        self.main_dqn_vars = tf.trainable_variables(scope='main')
        self.target_dqn_vars = tf.trainable_variables(scope='target')

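        # ops that copy each main-network variable into the target network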
        self.update_ops = []
        for i, var in enumerate(self.main_dqn_vars):
            copy_op = self.target_dqn_vars[i].assign(var.value())
            self.update_ops.append(copy_op)

        # create training
        self.create_training()

        # init
        self.session.run(tf.global_variables_initializer())
        #self.update_target_network()

        # saver
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print(f'Successfully loaded: {checkpoint.model_checkpoint_path}')
        else:
            print('Could not find old network weights')

    def create_training(self):
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
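        # Q-value of the action actually taken: mask the per-action outputs with
        # a one-hot of the chosen action and sum over the action axis.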
        self.Q = tf.reduce_sum(tf.multiply(
            self.q_values,
            tf.one_hot(self.action, self.action_size, dtype=tf.float32)),
                               axis=1)

        # loss
        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.losses.huber_loss(labels=self.target_q, predictions=self.Q))

        # train
        with tf.variable_scope("training"):
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

    def train(self):
        states, actions, rewards, new_states, terminal_flags = self.replay_mem.get_minibatch(
        )


        # main dqn: predict best action for new state
        arg_q_max = self.session.run(self.best_action,
                                     feed_dict={self.state_input: new_states})

        # target dqn: predict Q(s', a) for all actions in the new states
        q_vals = self.session.run(self.q_values_t,
                                  feed_dict={self.state_input_t: new_states})

        # Double DQN: evaluate the main network's chosen actions with the target network
        double_q = q_vals[range(self.batch_size), arg_q_max]

        # Bellman target: r + gamma * Q_target(s', argmax_a Q_main(s', a)), zeroed for terminal transitions
        target_q = rewards + (self.gamma * double_q * (1 - terminal_flags))

        # train main dqn
        loss, _ = self.session.run(
            [self.loss, self.train_op],
            feed_dict={
                self.state_input: states,
                self.target_q: target_q,
                self.action: actions
            })

    def process(self, next_frame, action, reward, done, terminal_life_lost):
        self.replay_mem.add_experience(action=action,
                                       frame=next_frame[:, :, 0],
                                       reward=reward,
                                       terminal=terminal_life_lost)

        # train the main network every TRAIN_FREQ ticks once the observation phase is over
        if self.tick > OBSERVE and self.tick % TRAIN_FREQ == 0:
            self.train()

        # update target network
        if self.tick > OBSERVE and self.tick % UPDATE_TIME == 0:
            print('update target network')
            self.update_target_network()

        # save the network every 100000 iterations
        if self.tick > 0 and self.tick % 100000 == 0:
            self.saver.save(self.session,
                            f'saved_networks/dqn-{self.time}',
                            global_step=self.tick)

        self.total_reward += reward

        if done:
            self.last_n_game_reward.append(self.total_reward)
            # print
            print(
                f'Episode: {self.episode}, Reward: {self.total_reward}, Avg. Reward: {np.mean(self.last_n_game_reward)}, Epsilon: {self.epsilon}, Step: {self.tick}'
            )
            self.episode += 1
            self.total_reward = 0

        self.tick += 1

    def get_action(self, state, training=True):
        if training:
            if random.random() < self.epsilon:
                action = np.random.randint(0, 4)
            else:
                action = self.session.run(
                    self.best_action, feed_dict={self.state_input: [state]})[0]

            if self.epsilon > FINAL_EPSILON and self.tick > OBSERVE:
                self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
        else:
            action = self.session.run(self.best_action,
                                      feed_dict={self.state_input: [state]})[0]

        return action

    def update_target_network(self):
        for copy_op in self.update_ops:
            self.session.run(copy_op)

    def create_model(self, name='main'):
        with tf.variable_scope(name):
            stateInput = tf.placeholder(shape=[None, 84, 84, 4],
                                        dtype=tf.float32)

            # Normalizing the input
            inputscaled = stateInput / 255

            # Convolutional layers
            conv1 = tf.layers.conv2d(
                inputs=inputscaled,
                filters=32,
                kernel_size=[8, 8],
                strides=4,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid",
                activation=tf.nn.relu,
                use_bias=False)
            conv2 = tf.layers.conv2d(
                inputs=conv1,
                filters=64,
                kernel_size=[4, 4],
                strides=2,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid",
                activation=tf.nn.relu,
                use_bias=False)
            conv3 = tf.layers.conv2d(
                inputs=conv2,
                filters=64,
                kernel_size=[3, 3],
                strides=1,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid",
                activation=tf.nn.relu,
                use_bias=False)
            conv4 = tf.layers.conv2d(
                inputs=conv3,
                filters=1024,
                kernel_size=[7, 7],
                strides=1,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding="valid",
                activation=tf.nn.relu,
                use_bias=False)

            # Splitting into value and advantage stream
            valuestream, advantagestream = tf.split(conv4, 2, 3)
            valuestream = tf.layers.flatten(valuestream)
            advantagestream = tf.layers.flatten(advantagestream)
            advantage = tf.layers.dense(
                inputs=advantagestream,
                units=4,
                kernel_initializer=tf.variance_scaling_initializer(scale=2))
            value = tf.layers.dense(
                inputs=valuestream,
                units=1,
                kernel_initializer=tf.variance_scaling_initializer(scale=2))

            # Combining value and advantage into Q-values
            q_values = value + tf.subtract(
                advantage, tf.reduce_mean(advantage, axis=1, keepdims=True))
            best_action = tf.argmax(q_values, 1)

        return stateInput, q_values, best_action
Example #3
def train():
    environment = AtariEnvironment(env_name=ENV_NAME,
                                   frame_stack_length=FRAME_STACK_LENGTH)
    main_dqn = DQN(num_actions=environment.action_number,
                   frame_height=FRAME_HEIGHT,
                   frame_width=FRAME_WIDTH,
                   frame_stack_length=FRAME_STACK_LENGTH,
                   hidden=HIDDEN,
                   batch_size=BATCH_SIZE,
                   path=PATH_READ,
                   path2=PATH_WRITE)
    target_dqn = DQN(num_actions=environment.action_number,
                     frame_height=FRAME_HEIGHT,
                     frame_width=FRAME_WIDTH,
                     frame_stack_length=FRAME_STACK_LENGTH,
                     hidden=HIDDEN,
                     batch_size=BATCH_SIZE,
                     path=PATH_READ,
                     path2=PATH_WRITE)
    replay_memory = ReplayMemory(size=MEMORY_SIZE,
                                 frame_height=FRAME_HEIGHT,
                                 frame_width=FRAME_WIDTH,
                                 frame_stack_length=FRAME_STACK_LENGTH,
                                 batch_size=BATCH_SIZE)
    action_selector = ActionSelector(
        dqn=main_dqn,
        num_actions=environment.action_number,
        initial_epsilon=EPSILON_INITIAL,
        middle_epsilon=EPSILON_SECOND,
        finish_epsilon=EPSILON_FINAL,
        minimum_replay_size=REPLAY_MEMORY_START_SIZE,
        maximum_replay_size=MEMORY_SIZE,
        final_frame_number=MAX_FRAMES)
    target_dqn_updater = TargetDqnUpdater(main_dqn=main_dqn,
                                          target_dqn=target_dqn)

    total_frame_number = 0
    rewards_per_episode = {}
    frames_per_episode = {}
    episode = 0

    average_last_100_frames = 0
    average_last_100_reward = 0
    best_score = 0
    open('scores/best_scores.txt', 'w').close()
    open('scores/averages.txt', 'w').close()

    #main_dqn.load_model(400)
    #target_dqn.load_model(400)
    while total_frame_number < MAX_FRAMES:
        episode += 1

        rewards_per_episode[episode] = 0
        frames_per_episode[episode] = 0

        terminal_life_lost = environment.reset_environment(hard_reset=True)

        while frames_per_episode[episode] < MAX_EPISODE_LENGTH:
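            # Fire (action 1) right after a life is lost so the game actually
            # restarts; otherwise let the epsilon-greedy selector choose the action.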
            action = 1 if terminal_life_lost else action_selector.act(
                environment.current_state, total_frame_number)
            processed_next_frame, reward, terminal, terminal_life_lost = environment.commit_action(
                action)
            replay_memory.add_experience(action, processed_next_frame[:, :, 0],
                                         clip(reward), terminal_life_lost)
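            # learn only once the replay memory holds more than REPLAY_MEMORY_START_SIZE transitions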
            if REPLAY_MEMORY_START_SIZE < total_frame_number:
                if total_frame_number % UPDATE_FREQ == 0:
                    states, actions, rewards, next_states, terminals = replay_memory.sample_minibatch(
                    )

                    # calculate best actions in next states based on main dqn!
                    best_actions = main_dqn.get_best_actions_batch(next_states)

                    # calculate q_values of these actions in next states based on the target network:
                    # first, one-hot encode the best found actions
                    ohe_best_actions_next_states = to_categorical(
                        best_actions, num_classes=environment.action_number)
                    ohe_best_actions_current_states = to_categorical(
                        actions, num_classes=environment.action_number)
                    next_states_q_values = target_dqn.predict_batch(
                        next_states, ohe_best_actions_next_states)
                    next_states_best_q_value = np.sum(next_states_q_values,
                                                      axis=1)

                    # the Bellman (Double DQN) target: r + gamma * Q_target(s', argmax_a Q_main(s', a)) for non-terminal transitions
                    target_q_values = rewards + (
                        1 -
                        terminals) * DISCOUNT_FACTOR * next_states_best_q_value

                    # gradient descent on the main network
                    main_dqn.fit_batch(
                        states, ohe_best_actions_current_states,
                        ohe_best_actions_current_states *
                        np.expand_dims(target_q_values, axis=1))

                if total_frame_number % NETW_UPDATE_FREQ == 0:
                    target_dqn_updater.update_target_network()

            total_frame_number += 1
            frames_per_episode[episode] += 1
            rewards_per_episode[episode] += reward
            if terminal:
                break
            if terminal_life_lost:
                # after every lost life, fire randomly so the agent spawns into a new situation
                terminal_life_lost = environment.reset_environment(
                    hard_reset=False)

        print("\nEpisode %d ended." % episode)
        print("Reward: %d" % rewards_per_episode[episode])
        print("Frames: %d" % frames_per_episode[episode])
        print("Replay memory size: %d" % replay_memory.get_size())
        print("Current epsilon: %5f\n" % action_selector.eps_debug)

        average_last_100_reward += rewards_per_episode[episode]
        average_last_100_frames += frames_per_episode[episode]

        if best_score < rewards_per_episode[episode]:
            best_score = rewards_per_episode[episode]
            file = open("scores/best_scores.txt", 'a')
            file.write("Episode: " + str(episode) + " | New best score: " +
                       str(best_score) + "\n")
            file.close()

        if episode % 100 == 0:
            file = open("scores/averages.txt", 'a')
            file.write("\nEpisodes %d - %d results:" %
                       (episode - 100, episode))
            average_last_100_reward /= 100
            average_last_100_frames /= 100
            file.write("\nAverage reward per episode: %.5f" %
                       average_last_100_reward)
            file.write("\nAverage frames per episode: %.2f\n" %
                       average_last_100_frames)
            file.close()

            average_last_100_reward = 0
            average_last_100_frames = 0

            if total_frame_number > REPLAY_MEMORY_START_SIZE:
                main_dqn.save_model(episode)