class PseudoCountQLearner(ValueBasedLearner, DensityModelMixin):
    """
    Based on the DQN+CTS model from the paper 'Unifying Count-Based Exploration and Intrinsic Motivation' (https://arxiv.org/abs/1606.01868).
    The implementation currently differs from the paper in that novelty bonuses are computed online, rather than from prediction gains
    measured after the density model has been updated with all frames from the episode. Asynchronous training with different final epsilon
    values tends to produce better results than using a single actor-learner.
    """
    def __init__(self, args):
        self.args = args
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size,
                                          self.local_network.get_input_shape(),
                                          self.num_actions)

        self._init_density_model(args)
        self._double_dqn_op()

    def generate_final_epsilon(self):
        if self.num_actor_learners == 1:
            return self.args.final_epsilon
        else:
            return super(PseudoCountQLearner, self).generate_final_epsilon()

    def _get_summary_vars(self):
        q_vars = super(PseudoCountQLearner, self)._get_summary_vars()

        bonus_q05 = tf.Variable(0., name='novelty_bonus_q05')
        s1 = tf.summary.scalar('Novelty_Bonus_q05_{}'.format(self.actor_id),
                               bonus_q05)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q95 = tf.Variable(0., name='novelty_bonus_q95')
        s3 = tf.summary.scalar('Novelty_Bonus_q95_{}'.format(self.actor_id),
                               bonus_q95)

        augmented_reward = tf.Variable(0., name='augmented_episode_reward')
        s4 = tf.summary.scalar(
            'Augmented_Episode_Reward_{}'.format(self.actor_id),
            augmented_reward)

        return q_vars + [bonus_q05, bonus_q50, bonus_q95, augmented_reward]

    #TODO: refactor to make this cleaner
    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses,
                      total_augmented_reward):
        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value() * self.max_local_steps
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()

            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))

            self.log_summary(
                total_episode_reward,
                episode_ave_max_q,
                self.epsilon,
                np.percentile(bonuses, 5),
                np.percentile(bonuses, 50),
                np.percentile(bonuses, 95),
                total_augmented_reward,
            )

            state = self.emulator.get_initial_state()
            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return (state, total_episode_reward, steps_at_last_reward, ep_t,
                episode_ave_max_q, episode_over)

    def _double_dqn_op(self):
        q_local_action = tf.cast(
            tf.argmax(self.local_network.output_layer, axis=1), tf.int32)
        q_target_max = utils.ops.slice_2d(
            self.target_network.output_layer,
            tf.range(0, self.batch_size),
            q_local_action,
        )
        self.one_step_reward = tf.placeholder(tf.float32,
                                              self.batch_size,
                                              name='one_step_reward')
        self.is_terminal = tf.placeholder(tf.bool,
                                          self.batch_size,
                                          name='is_terminal')

        self.y_target = self.one_step_reward + self.cts_eta*self.gamma*q_target_max \
            * (1 - tf.cast(self.is_terminal, tf.float32))

        self.double_dqn_loss = self.local_network._value_function_loss(
            self.local_network.q_selected_action -
            tf.stop_gradient(self.y_target))

        self.double_dqn_grads = tf.gradients(self.double_dqn_loss,
                                             self.local_network.params)
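
    # _double_dqn_op builds the Double DQN target
    #     y = r + cts_eta * gamma * Q_target(s', argmax_a Q_local(s', a)) * (1 - terminal)
    # The bootstrap term is scaled by cts_eta because the returns stored in the
    # replay memory are eta-mixed Monte Carlo returns (see train/batch_update
    # below), so only the eta-weighted part of the future return still needs to
    # be bootstrapped from the target network.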

    # def batch_update(self):
    #     if len(self.replay_memory) < self.replay_memory.maxlen//10:
    #         return

    #     s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size)

    #     feed_dict={
    #         self.one_step_reward: r_i,
    #         self.target_network.input_ph: s_f,
    #         self.local_network.input_ph: np.vstack([s_i, s_f]),
    #         self.local_network.selected_action_ph: np.vstack([a_i, a_i]),
    #         self.is_terminal: is_terminal
    #     }
    #     grads = self.session.run(self.double_dqn_grads, feed_dict=feed_dict)
    #     self.apply_gradients_to_shared_memory_vars(grads)

    def batch_update(self):
        if len(self.replay_memory) < self.replay_memory.maxlen // 10:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        feed_dict = {
            self.local_network.input_ph: s_f,
            self.target_network.input_ph: s_f,
            self.is_terminal: is_terminal,
            self.one_step_reward: r_i,
        }
        y_target = self.session.run(self.y_target, feed_dict=feed_dict)

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)
        self.apply_gradients_to_shared_memory_vars(grads)

    def train(self):
        """ Main actor learner loop for n-step Q learning. """
        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = list()
        a_batch = list()
        y_batch = list()
        bonuses = deque(maxlen=1000)
        episode_over = False

        t0 = time.time()
        global_steps_at_last_record = self.global_step.value()
        while (self.global_step.value() < self.max_global_steps):
            # # Sync local learning net with shared mem
            # self.sync_net_with_shared_memory(self.local_network, self.learning_vars)
            # self.save_vars()
            rewards = list()
            states = list()
            actions = list()
            max_q_values = list()
            local_step_start = self.local_step
            total_episode_reward = 0
            total_augmented_reward = 0
            episode_ave_max_q = 0
            ep_t = 0

            while not episode_over:
                # Sync local learning net with shared mem
                self.sync_net_with_shared_memory(self.local_network,
                                                 self.learning_vars)
                self.save_vars()

                # Choose next action and execute it
                a, q_values = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward
                max_q = np.max(q_values)

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                # Rescale or clip immediate reward
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)
                total_augmented_reward += reward
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)
                max_q_values.append(max_q)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += max_q

                global_step, _ = self.global_step.increment()

                if global_step % self.q_target_update_steps == 0:
                    self.update_target()
                if global_step % self.density_model_update_steps == 0:
                    self.write_density_model()

                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0
                if self.density_model_update_flags.updated[self.actor_id] == 1:
                    self.read_density_model()
                    self.density_model_update_flags.updated[self.actor_id] = 0

                if self.local_step % self.q_update_interval == 0:
                    self.batch_update()

                if self.is_master() and (self.local_step % 500 == 0):
                    bonus_array = np.array(bonuses)
                    steps = global_step - global_steps_at_last_record
                    global_steps_at_last_record = global_step

                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               steps / float(time.time() - t0)))
                    t0 = time.time()

            else:
                # episode finished (the while-else always runs since the loop has no break):
                # compute the discounted Monte Carlo return
                mc_returns = np.zeros((len(rewards), ), dtype=np.float32)
                running_total = 0.0
                for i, r in enumerate(reversed(rewards)):
                    running_total = r + self.gamma * running_total
                    mc_returns[len(rewards) - i - 1] = running_total

                mixed_returns = self.cts_eta * np.asarray(rewards) + (
                    1 - self.cts_eta) * mc_returns

                #update replay memory
                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(states[i], actions[i],
                                              mixed_returns[i],
                                              i + 1 == episode_length)

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses, total_augmented_reward)
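
# A minimal, self-contained sketch (not part of the classes in this file) of how a
# CTS density model typically turns a prediction gain into the pseudo-count bonus
# referenced in the class docstring above (Bellemare et al., 2016). The function
# name and constants are illustrative assumptions, not this repo's API.
def example_pseudo_count_bonus(log_prob_before, log_prob_after, beta=0.05):
    import math  # local import to keep the sketch self-contained
    # Prediction gain: how much more likely the frame became after the model update.
    prediction_gain = max(log_prob_after - log_prob_before, 0.0)
    # Pseudo-count implied by the prediction gain; the small epsilon avoids a
    # division by zero when the model did not change at all.
    pseudo_count = 1.0 / (math.expm1(prediction_gain) + 1e-8)
    # Bonus decays as the pseudo-count of the frame grows.
    return beta / math.sqrt(pseudo_count + 0.01)
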
class PseudoCountQLearner(ValueBasedLearner):
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        # more cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def generate_final_epsilon(self):
        return 0.1

    def batch_update(self):
        if len(self.replay_memory) < self.batch_size:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        q_target_values = self.session.run(
            self.target_network.output_layer,
            feed_dict={self.target_network.input_ph: s_f})
        y_target = r_i + self.cts_eta * self.gamma * q_target_values.max(
            axis=1) * (1 - is_terminal.astype(np.float32))

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)

        self.apply_gradients_to_shared_memory_vars(grads)

    def _run(self):
        """ Main actor learner loop for n-step Q learning. """
        if not self.is_train:
            return self.test()

        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = []
        a_batch = []
        y_batch = []
        bonuses = deque(maxlen=100)

        exec_update_target = False
        total_episode_reward = 0
        episode_ave_max_q = 0
        episode_over = False
        qmax_down = 0
        qmax_up = 0
        prev_qmax = -10 ** 6
        low_qmax = 0
        ep_t = 0

        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            rewards = []
            states = []
            actions = []
            local_step_start = self.local_step

            while not episode_over:
                # Choose next action and execute it
                a, readout_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug('Mean Bonus={:.4f} / Max Bonus={:.4f}'.format(
                        bonus_array.mean(), bonus_array.max()))

                # Rescale or clip immediate reward
                # reward = self.rescale_reward(self.rescale_reward(reward) + bonus)
                reward = self.rescale_reward(reward)
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += np.max(readout_t)

                global_step, update_target = self.global_step.increment(
                    self.q_target_update_steps)

                if update_target:
                    update_target = False
                    exec_update_target = True

                if self.local_step % 4 == 0:
                    self.batch_update()

                self.local_network.global_step = global_step

            else:
                mc_returns = list()
                running_total = 0.0
                for r in reversed(rewards):
                    running_total = r + self.gamma * running_total
                    mc_returns.insert(0, running_total)

                mixed_returns = self.cts_eta * np.array(rewards) + (
                    1 - self.cts_eta) * np.array(mc_returns)

                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(
                        (states[i], actions[i], mixed_returns[i],
                         states[i + 1], i + 1 == episode_length))

            if exec_update_target:
                self.update_target()
                exec_update_target = False
                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over)
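
# Minimal sketch (illustrative, not this repo's API) of the eta-mixed return that
# the episode loops above store in replay memory:
#     R_mix(t) = eta * r_t + (1 - eta) * G_t,
# where G_t is the discounted Monte Carlo return from step t to the episode end.
def example_mixed_returns(rewards, gamma=0.99, eta=0.9):
    import numpy as np  # local import to keep the sketch self-contained
    mc_returns = np.zeros(len(rewards), dtype=np.float32)
    running_total = 0.0
    # Accumulate the discounted return backwards through the episode.
    for i, r in enumerate(reversed(rewards)):
        running_total = r + gamma * running_total
        mc_returns[len(rewards) - i - 1] = running_total
    return eta * np.asarray(rewards, dtype=np.float32) + (1.0 - eta) * mc_returns
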
class PseudoCountQLearner(ValueBasedLearner):
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        # more cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def generate_final_epsilon(self):
        return 0.1

    def _get_summary_vars(self):
        q_vars = super(PseudoCountQLearner, self)._get_summary_vars()

        bonus_q25 = tf.Variable(0., name='novelty_bonus_q25')
        s1 = tf.summary.scalar('Novelty_Bonus_q25_{}'.format(self.actor_id),
                               bonus_q25)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q75 = tf.Variable(0., name='novelty_bonus_q75')
        s3 = tf.summary.scalar('Novelty_Bonus_q75_{}'.format(self.actor_id),
                               bonus_q75)

        return q_vars + [bonus_q25, bonus_q50, bonus_q75]

    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses):
        # prevent the agent from getting stuck
        reset_game = False
        if (self.local_step - steps_at_last_reward > 5000
                or (self.emulator.get_lives() == 0
                    and self.emulator.game not in ONE_LIFE_GAMES)):

            steps_at_last_reward = self.local_step
            episode_over = True
            reset_game = True

        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value()
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()

            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))

            if self.is_master() and self.is_train:
                stats = [
                    total_episode_reward,
                    episode_ave_max_q,
                    self.epsilon,
                    np.percentile(bonuses, 25),
                    np.percentile(bonuses, 50),
                    np.percentile(bonuses, 75),
                ]
                feed_dict = {
                    self.summary_ph[i]: stats[i]
                    for i in range(len(stats))
                }
                res = self.session.run(self.update_ops + [self.summary_op],
                                       feed_dict=feed_dict)
                self.summary_writer.add_summary(res[-1],
                                                self.global_step.value())

            if reset_game or self.emulator.game in ONE_LIFE_GAMES:
                state = self.emulator.get_initial_state()

            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over

    def batch_update(self):
        if len(self.replay_memory) < self.batch_size:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        q_target_values = self.session.run(
            self.target_network.output_layer,
            feed_dict={self.target_network.input_ph: s_f})
        y_target = r_i + self.cts_eta * self.gamma * q_target_values.max(
            axis=1) * (1 - is_terminal.astype(np.float32))

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)

        self.apply_gradients_to_shared_memory_vars(grads)

    def _run(self):
        """ Main actor learner loop for n-step Q learning. """
        if not self.is_train:
            return self.test()

        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = []
        a_batch = []
        y_batch = []
        bonuses = deque(maxlen=100)

        exec_update_target = False
        total_episode_reward = 0
        episode_ave_max_q = 0
        episode_over = False
        qmax_down = 0
        qmax_up = 0
        prev_qmax = -10 ** 6
        low_qmax = 0
        ep_t = 0

        t0 = time.time()
        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            rewards = []
            states = []
            actions = []
            local_step_start = self.local_step

            while not episode_over:
                # Choose next action and execute it
                a, readout_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               200. / (time.time() - t0)))
                    t0 = time.time()

                # Rescale or clip immediate reward
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += np.max(readout_t)

                global_step, update_target = self.global_step.increment(
                    self.q_target_update_steps)

                if update_target:
                    update_target = False
                    exec_update_target = True

                if self.local_step % 4 == 0:
                    self.batch_update()

                self.local_network.global_step = global_step

            else:
                mc_returns = list()
                running_total = 0.0
                for r in reversed(rewards):
                    running_total = r + self.gamma * running_total
                    mc_returns.insert(0, running_total)

                mixed_returns = self.cts_eta * np.array(rewards) + (
                    1 - self.cts_eta) * np.array(mc_returns)

                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(
                        (states[i], actions[i], mixed_returns[i],
                         states[i + 1], i + 1 == episode_length))

            if exec_update_target:
                self.update_target()
                exec_update_target = False
                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses)
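
# Sketch (numpy, illustrative only) of the one-step target computed in the
# batch_update methods above: y = r + eta * gamma * max_a Q_target(s', a),
# zeroed on terminal transitions. The eta factor matches the eta-mixed returns
# stored in replay memory.
def example_one_step_target(r, q_target_values, is_terminal, gamma=0.99, eta=0.9):
    import numpy as np  # local import to keep the sketch self-contained
    continues = 1.0 - np.asarray(is_terminal, dtype=np.float32)
    return r + eta * gamma * q_target_values.max(axis=1) * continues
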
class AElearner(ValueBasedLearner, DensityModelMixinAE):
    def __init__(self, args):
        self.args = args

        super(AElearner, self).__init__(args)
        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.ae_delta = args.ae_delta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(
            args.replay_size,
            self.local_network_upper.get_input_shape(),
            # self.local_network.get_input_shape(),
            self.num_actions)
        # initializes the density model (and sets how many steps between updates:
        # 20 * q target update steps)
        self._init_density_model(args)
        #computes loss
        self._double_dqn_op()
        self.which_net_to_update_counter = 0
        self.ae_counter = 0
        self.epsilon_greedy_counter = 0
        self.total_ae_counter = 0
        self.total_epsilon_greedy_counter = 0
        self.q_values_upper_max = []
        self.q_values_lower_max = []
        self.ae_valid_actions = True
        self.action_meanings = self.emulator.env.unwrapped.get_action_meanings(
        )
        self.minimized_actions_counter = {
            value: 0
            for value in self.action_meanings
        }
        print(self.minimized_actions_counter)
        # print("In AE class")

    def beta_function(self, A, S, delta, k, Vmax, c):
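        """
        Width of the confidence interval used to form the optimism/pessimism
        bonus. Up to the constants supplied by the caller this computes

            beta(k) = k * Vmax * ( sqrt(k * log(c * k^2 * S * A / delta))
                      - (1 - 1/k) * sqrt((k - 1) * log(c * (k - 1)^2 * S * A / delta)) )

        with k clamped to at least 1 and the second square root dropped when
        k == 1.
        """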
        #print (utils.fast_cts.__name__)
        #print("This is temp {}".format(temp))
        #print("This is k")
        #print(k)
        if k < 1:
            k = 1
        # print("c is : {}".format(c))
        # print("k is : {}".format(k))
        # print("S is : {}".format(S))
        # print("A is : {}".format(A))
        # print("delta is : {}".format(delta))
        # # print("c*(k-1)*(k-1)*S*A is : {}".format(c*(k-1)*(k-1)*S*A))
        # print("c*(k1)*(k1)*S*A/delta is : {}".format(c*(k)*(k)*S*A/delta))
        # print("math.log(c*(k-1)*(k-1)*S*A/delta is : {}".format(math.log(c*(k)*(k)*S*A/delta)))
        # #k = math.maximum(k,1)
        # z = 5
        # assert(math.isnan(5))
        assert (not math.isnan(math.log(
            c * k * k * S * A / delta))), "log of left is nan"
        left = math.sqrt(k * math.log(c * k * k * S * A / delta))
        assert (not math.isnan(left)), " left side of beta is Nan"

        if k == 1:
            right = 0
        else:
            right = math.sqrt(
                (k - 1) *
                math.log(c * (k - 1) *
                         (k - 1) * S * A / delta))  #the error is here
        assert (not math.isnan(right)), " right side of beta is Nan"

        beta = k * Vmax * (left - (1 - 1 / k) * right)
        assert (not math.isnan(beta)), "beta is NaN"
        return beta

    ### Note: call this with the upper Q-network.
    ### Returns the minimized action pool after action elimination (AE), per the paper (actions whose Q upper is at least V lower).
    def minimize_action_pool(self, state):
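        """
        Action-elimination step: returns a 0/1 mask over actions (an action is
        kept when its upper-bound Q-value is at least V_lower = max_a
        Q_lower(state, a)), together with the lower and upper Q-value vectors.
        Eliminated actions are tallied in self.minimized_actions_counter.
        """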
        new_actions = np.zeros([self.num_actions])
        #TODO get q upperbound values
        #TODO: target or local ???
        # q_values_upper = self.session.run(
        #         self.target_network_upper.output_layer,
        #         feed_dict={self.target_network_upper.input_ph: [state]})[0]
        # q_values_lower = self.session.run(
        #         self.target_network_lower.output_layer,
        #         feed_dict={self.target_network_lower.input_ph: [state]})[0]
        q_values_upper = self.session.run(
            self.local_network_upper.output_layer,
            feed_dict={self.local_network_upper.input_ph: [state]})[0]
        q_values_lower = self.session.run(
            self.local_network_lower.output_layer,
            feed_dict={self.local_network_lower.input_ph: [state]})[0]
        #TODO V lower upperbound
        Vlow = max(q_values_lower)
        Vhigh = max(q_values_upper)
        #print("q_values_lower: {} / q_values_upper: {}".format(q_values_lower, q_values_upper))

        # print("The value of Vlow is {}".format(Vlow))
        for index, action in enumerate(new_actions):
            new_actions[index] = q_values_upper[index] >= Vlow
            if q_values_upper[index] < Vlow:
                self.minimized_actions_counter[
                    self.action_meanings[index]] += 1
            # print("The value of q_values_upper on index: {} is :{}".format(index,q_values_upper[index]))
        #print("new actions are:  {}".format(new_actions))
        #print("new actions array: {}".format(new_actions))
        return new_actions, q_values_lower, q_values_upper

    def choose_next_action(self, state):
        #print("we use our AE new algorithm choose next action")
        new_action = np.zeros([self.num_actions])
        q_values = self.session.run(
            self.local_network_upper.output_layer,
            feed_dict={self.local_network_upper.input_ph: [state]})[0]
        # q_values_upper = self.session.run(
        #         self.target_network_upper.output_layer,
        #         feed_dict={self.target_network_upper.input_ph: [state]})[0]
        # q_values_lower = self.session.run(
        #         self.target_network_lower.output_layer,
        #         feed_dict={self.target_network_lower.input_ph: [state]})[0]
        # Vlow = max(q_values_lower)
        # Vhigh = max(q_values_upper)
        # print("Vlow is: {}".format(Vlow))
        # print("q_upper values: {}".format(q_values_upper))

        #self.q_values_lower_max.append(Vlow)
        #self.q_values_lower_max.append(Vhigh)

        #print("q_upper: {}".format(q_upper_curr))
        #print("q_lower: {}".format(q_lower_curr))
        secure_random = random.SystemRandom()
        action_pool, q_values_lower, q_values_upper = self.minimize_action_pool(
            state)
        if self.local_step % 500 == 0:
            #num_actions_minimized = self.num_actions - np.sum(action_pool)

            #minimized_actions = [ self.action_meanings[index] for index,value in enumerate (action_pool) if value == 0 ]
            logger.info('Total minimized actions {0} / LOCAL STEP {1}'.format(
                self.minimized_actions_counter, self.local_step))
        #print("action pool is: {}".format(action_pool))
        # print("The action pool {}".format(action_pool))
        random_index = secure_random.randrange(0, len(action_pool))
        indexes_valid_actions = []
        for i, item in enumerate(action_pool):
            if item == 1:
                indexes_valid_actions.append(i)

        # There are no valid actions left after elimination:
        # fall back to epsilon-greedy over all actions
        if not indexes_valid_actions:
            #print("q_values_lower: {} / q_values_upper: {}".format(q_values_lower, q_values_upper))
            #print("no valid ae actions!!! - use epsilon greedy")
            self.ae_valid_actions = False
            self.epsilon_greedy_counter += 1
            super_return = super(AElearner, self).choose_next_action(state)
            return super_return[0], super_return[
                1], q_values_lower, q_values_upper
        self.ae_counter += 1
        random_index = secure_random.choice(indexes_valid_actions)

        new_action[random_index] = 1
        self.reduce_thread_epsilon()
        #print("succefuly eliminated actions")
        #print("new action is: {}".format(new_action))
        #print("q_values (upper): {}".format(q_values))
        return new_action, q_values, q_values_lower, q_values_upper

    def generate_final_epsilon(self):
        if self.num_actor_learners == 1:
            return self.args.final_epsilon
        else:
            return super(AElearner, self).generate_final_epsilon()

    def _get_summary_vars(self):
        q_vars = super(AElearner, self)._get_summary_vars()

        bonus_q05 = tf.Variable(0., name='novelty_bonus_q05')
        s1 = tf.summary.scalar('Novelty_Bonus_q05_{}'.format(self.actor_id),
                               bonus_q05)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q95 = tf.Variable(0., name='novelty_bonus_q95')
        s3 = tf.summary.scalar('Novelty_Bonus_q95_{}'.format(self.actor_id),
                               bonus_q95)

        augmented_reward = tf.Variable(0., name='augmented_episode_reward')
        s4 = tf.summary.scalar(
            'Augmented_Episode_Reward_{}'.format(self.actor_id),
            augmented_reward)

        return q_vars + [bonus_q05, bonus_q50, bonus_q95, augmented_reward]

    #TODO: refactor to make this cleaner
    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses,
                      total_augmented_reward, q_values_lower, q_values_upper):
        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value() * self.max_local_steps
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()
            print("Used AE for {} times".format(self.ae_counter))
            print("Used Epsilon greedy for {} times".format(
                self.epsilon_greedy_counter))
            self.total_ae_counter += self.ae_counter
            self.total_epsilon_greedy_counter += self.epsilon_greedy_counter
            self.ae_counter = 0
            self.epsilon_greedy_counter = 0
            print("Total count of use of AE is {} :".format(
                self.total_ae_counter))
            print("Total count of use of Epsilone Greedy {}".format(
                self.total_epsilon_greedy_counter))
            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))
            logger.info("q_values_lower: {} / q_values_upper: {}".format(
                q_values_lower, q_values_upper))
            #print(" T type {}".format(type(T)))
            self.vis.plot_current_errors(T, total_episode_reward)
            #self.vis.plot_total_ae_counter(T,self.minimized_actions_counter, self.action_meanings)
            self.vis.plot_q_values(T, q_values_lower, q_values_upper,
                                   self.action_meanings)
            self.wr.writerow([T])
            self.wr.writerow([total_episode_reward])
            #print(" total episode reward type {}".format(type(total_episode_reward)))

            #print ('[%s]' % ', '.join(map(str, t.vis.plot_data['X'])))

            self.log_summary(
                total_episode_reward,
                episode_ave_max_q,
                self.epsilon,
                np.percentile(bonuses, 5),
                np.percentile(bonuses, 50),
                np.percentile(bonuses, 95),
                total_augmented_reward,
            )

            state = self.emulator.get_initial_state()
            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return (state, total_episode_reward, steps_at_last_reward, ep_t,
                episode_ave_max_q, episode_over)

    def _double_dqn_op(self):
        q_local_action_lower = tf.cast(
            tf.argmax(self.local_network_lower.output_layer, axis=1), tf.int32)
        q_target_max_lower = utils.ops.slice_2d(
            self.target_network_lower.output_layer,
            tf.range(0, self.batch_size),
            q_local_action_lower,
        )

        q_local_action_upper = tf.cast(
            tf.argmax(self.local_network_upper.output_layer, axis=1), tf.int32)
        q_target_max_upper = utils.ops.slice_2d(
            self.target_network_upper.output_layer,
            tf.range(0, self.batch_size),
            q_local_action_upper,
        )

        self.one_step_reward = tf.placeholder(tf.float32,
                                              self.batch_size,
                                              name='one_step_reward')
        self.is_terminal = tf.placeholder(tf.bool,
                                          self.batch_size,
                                          name='is_terminal')

        self.y_target_lower = self.one_step_reward + self.cts_eta*self.gamma*q_target_max_lower \
            * (1 - tf.cast(self.is_terminal, tf.float32))

        self.y_target_upper = self.one_step_reward + self.cts_eta*self.gamma*q_target_max_upper \
            * (1 - tf.cast(self.is_terminal, tf.float32))

        self.double_dqn_loss_lower = self.local_network_lower._value_function_loss(
            self.local_network_lower.q_selected_action -
            tf.stop_gradient(self.y_target_lower))

        self.double_dqn_loss_upper = self.local_network_upper._value_function_loss(
            self.local_network_upper.q_selected_action -
            tf.stop_gradient(self.y_target_upper))

        self.double_dqn_grads_lower = tf.gradients(
            self.double_dqn_loss_lower, self.local_network_lower.params)
        self.double_dqn_grads_upper = tf.gradients(
            self.double_dqn_loss_upper, self.local_network_upper.params)

    # def batch_update(self):
    #     if len(self.replay_memory) < self.replay_memory.maxlen//10:
    #         return

    #     s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size)

    #     feed_dict={
    #         self.one_step_reward: r_i,
    #         self.target_network.input_ph: s_f,
    #         self.local_network.input_ph: np.vstack([s_i, s_f]),
    #         self.local_network.selected_action_ph: np.vstack([a_i, a_i]),
    #         self.is_terminal: is_terminal
    #     }
    #     grads = self.session.run(self.double_dqn_grads, feed_dict=feed_dict)
    #     self.apply_gradients_to_shared_memory_vars(grads)

    def batch_update(self):
        if len(self.replay_memory) < self.replay_memory.maxlen // 10:
            return
        #TODO check if we need two replay memories
        s_i, a_i, r_i, s_f, is_terminal, b_i = self.replay_memory.sample_batch(
            self.batch_size)
        #print("This is b_i {}".format(b_i))

        # if(self.which_net_to_update_counter %2):

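        # Upper (optimistic) network: targets are built from the bonus-augmented
        # reward r + b.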
        feed_dict = {
            self.local_network_upper.input_ph: s_f,
            self.target_network_upper.input_ph: s_f,
            self.is_terminal: is_terminal,
            self.one_step_reward: r_i + b_i,
        }
        y_target_upper = self.session.run(self.y_target_upper,
                                          feed_dict=feed_dict)
        #print(y_target_upper)
        feed_dict = {
            self.local_network_upper.input_ph: s_i,
            self.local_network_upper.target_ph: y_target_upper,
            self.local_network_upper.selected_action_ph: a_i
        }
        # TODO: the NaN exception happens here.
        #print(self.local_network_upper.get_gradients)
        grads = self.session.run(self.local_network_upper.get_gradients,
                                 feed_dict=feed_dict)

        #assert (not tf.debugging.is_nan(grads)) , " upper local network grads are nan"
        self.apply_gradients_to_shared_memory_vars(grads,
                                                   upper_or_lower="Upper")
        # else:
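        # Lower (pessimistic) network: targets are built from the bonus-penalized
        # reward r - b.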
        feed_dict = {
            self.local_network_lower.input_ph: s_f,
            self.target_network_lower.input_ph: s_f,
            self.is_terminal: is_terminal,
            self.one_step_reward: r_i - b_i,
        }
        y_target_lower = self.session.run(self.y_target_lower,
                                          feed_dict=feed_dict)

        feed_dict = {
            self.local_network_lower.input_ph: s_i,
            self.local_network_lower.target_ph: y_target_lower,
            self.local_network_lower.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network_lower.get_gradients,
                                 feed_dict=feed_dict)
        #assert (not tf.debugging.is_nan(grads)) , " lower local network grads are nan"
        self.apply_gradients_to_shared_memory_vars(grads,
                                                   upper_or_lower="Lower")

    def train(self):
        """ Main actor learner loop for n-step Q learning. """
        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()
        # print(" In train of AE")
        s_batch = list()
        a_batch = list()
        y_batch = list()
        bonuses = deque(maxlen=1000)
        episode_over = False

        t0 = time.time()
        global_steps_at_last_record = self.global_step.value()
        while (self.global_step.value() < self.max_global_steps):
            # # Sync local learning net with shared mem
            # self.sync_net_with_shared_memory(self.local_network, self.learning_vars)
            # self.save_vars()
            rewards = list()
            states = list()
            actions = list()
            max_q_values = list()
            bonuses = list()  # per-episode bonuses; kept aligned with rewards/states
            local_step_start = self.local_step
            total_episode_reward = 0
            total_augmented_reward = 0
            episode_ave_max_q = 0
            ep_t = 0
            action_count = 0

            while not episode_over:

                A = self.num_actions
                S = 100000000
                factor_delta_scale_to_1_10 = 100000000000
                factor_delta_actual = 100
                factor_divide_bonus_scale_to_0_10 = 100000000
                factor_divide_bonus = 1000
                #S = 2**S
                delta = self.ae_delta * factor_delta_actual
                #delta = self.ae_delta * factor_delta # experimental: trying to stabilize the bonus along the entire learning
                Vmax = 100000
                c = 5

                # Sync local learning net with shared mem
                #TODO: upper / lower
                self.sync_net_with_shared_memory(self.local_network_lower,
                                                 self.learning_vars_lower)
                self.sync_net_with_shared_memory(self.local_network_upper,
                                                 self.learning_vars_upper)
                self.save_vars()

                # Choose next action and execute it
                # print("intrinsic motivation print")

                a, q_values, q_values_lower, q_values_upper = self.choose_next_action(
                    s)
                action_count += 1
                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward
                max_q = np.max(q_values)
                prev_s = s
                current_frame = new_s[..., -1]
                prev_frame = prev_s[..., -1]
                #print("This is a {}".format(a))
                index_of_a = np.argmax(a)
                ## TODO change back to update 2 and understand the underlying
                ## cython code
                k = (self.density_model[index_of_a]).update(prev_frame)
                #print(type(self.density_model[index_of_a]))
                assert (not math.isnan(k)), "k is nan"
                # print("K value is {}".format(k))
                # You should trace the update call here; as I recall it leads to the C function
                # in a C file and not to the Python function

                ## TODO: change S to the correct number (number of states until now, or what is supposed to be doubled by k)
                # k = k * S

                bonus = self.beta_function(A, S, delta, k, Vmax, c)

                #  The bonus isn't supposed to be the raw output of beta_function;
                #  in the paper it is normalized by k.
                #  TODO: check what we should use as the bonus
                if k > 1:
                    bonus = bonus / k
                    bonus = bonus / factor_divide_bonus

                #print("this is k")
                #print (k)

                #bonus = 1196518.8710327097
                #print("bonus is: {}".format(bonus))

                # Rescale or clip immediate reward
                reward = self.rescale_reward(self.rescale_reward(reward))
                # TODO figure out how to rescale bonus
                #bonus = self.rescale_reward(bonus)
                total_augmented_reward += reward
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)
                bonuses.append(bonus)
                max_q_values.append(max_q)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += max_q

                global_step, _ = self.global_step.increment()
                ##We update the target network here
                if global_step % self.q_target_update_steps == 0:
                    self.update_target()
                    print("We are updating the target networks")
                    #print("Current k value")
                ## We update the density model here
                if global_step % self.density_model_update_steps == 0:
                    #returns the index of the chosen action
                    self.write_density_model(np.argmax(a))

                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network_lower,
                                                     self.target_vars_lower)
                    self.sync_net_with_shared_memory(self.target_network_upper,
                                                     self.target_vars_upper)
                    #TODO: check if needed to duplicate target_update_flags for both nets
                    self.target_update_flags.updated[self.actor_id] = 0
                #print("type of self.density_model_updated: {}".format(type(self.density_model_update_flags)))
                for action in range(len(self.density_model_update_flags)):
                    if self.density_model_update_flags[action].updated[
                            self.actor_id] == 1:
                        #returns the index of the chosen action
                        self.read_density_model(np.argmax(a))
                        self.density_model_update_flags[action].updated[
                            self.actor_id] = 0

                if self.local_step % self.q_update_interval == 0:
                    self.batch_update()
                    self.which_net_to_update_counter += 1

                if self.is_master() and (self.local_step % 500 == 0):
                    #bonus_array = np.array(bonuses)
                    steps = global_step - global_steps_at_last_record
                    global_steps_at_last_record = global_step

                    #logger.debug('Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.format(
                    #    bonus_array.mean(), bonus_array.max(), steps/float(time.time()-t0)))
                    t0 = time.time()

            else:
                # episode finished (the while-else always runs since the loop has no break):
                # compute the discounted Monte Carlo return
                mc_returns = np.zeros((len(rewards), ), dtype=np.float32)
                running_total = 0.0
                for i, r in enumerate(reversed(rewards)):
                    running_total = r + self.gamma * running_total
                    mc_returns[len(rewards) - i - 1] = running_total

                mixed_returns = self.cts_eta * np.asarray(rewards) + (
                    1 - self.cts_eta) * mc_returns

                #update replay memory
                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(states[i], actions[i],
                                              mixed_returns[i],
                                              i + 1 == episode_length,
                                              bonuses[i])

            #print("Vlow is: {}".format(Vlow))
            #print("q_upper values: {}".format(q_values_upper))

            #self.q_values_lower_max.append(Vlow)
            #self.q_values_lower_max.append(Vhigh)

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                    self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses, total_augmented_reward, q_values_lower, q_values_upper)
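
# Minimal numpy sketch (illustrative only) of the action-elimination rule used by
# AElearner.minimize_action_pool above: an action survives when its optimistic
# (upper) Q-value is at least the pessimistic state value max_a Q_lower(s, a).
def example_eliminate_actions(q_upper, q_lower):
    import numpy as np  # local import to keep the sketch self-contained
    v_lower = np.max(q_lower)
    return (np.asarray(q_upper) >= v_lower).astype(np.float32)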