class PseudoCountQLearner(ValueBasedLearner, DensityModelMixin):
    """
    Based on DQN+CTS model from the paper 'Unifying Count-Based Exploration and Intrinsic Motivation' (https://arxiv.org/abs/1606.01868)
    Presently the implementation differs from the paper in that the novelty bonuses are computed online rather than by computing the
    prediction gains after the model has been updated with all frames from the episode. Async training with different final epsilon values
    tends to produce better results than just using a single actor-learner.
    """
    def __init__(self, args):
        self.args = args
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size,
                                          self.local_network.get_input_shape(),
                                          self.num_actions)

        self._init_density_model(args)
        self._double_dqn_op()

    def generate_final_epsilon(self):
        if self.num_actor_learners == 1:
            return self.args.final_epsilon
        else:
            return super(PseudoCountQLearner, self).generate_final_epsilon()

    def _get_summary_vars(self):
        q_vars = super(PseudoCountQLearner, self)._get_summary_vars()

        bonus_q05 = tf.Variable(0., name='novelty_bonus_q05')
        s1 = tf.summary.scalar('Novelty_Bonus_q05_{}'.format(self.actor_id),
                               bonus_q05)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q95 = tf.Variable(0., name='novelty_bonus_q95')
        s3 = tf.summary.scalar('Novelty_Bonus_q95_{}'.format(self.actor_id),
                               bonus_q95)

        augmented_reward = tf.Variable(0., name='augmented_episode_reward')
        s4 = tf.summary.scalar(
            'Augmented_Episode_Reward_{}'.format(self.actor_id),
            augmented_reward)

        return q_vars + [bonus_q05, bonus_q50, bonus_q95, augmented_reward]

    #TODO: refactor to make this cleaner
    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses,
                      total_augmented_reward):
        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value() * self.max_local_steps
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()

            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))

            self.log_summary(
                total_episode_reward,
                episode_ave_max_q,
                self.epsilon,
                np.percentile(bonuses, 5),
                np.percentile(bonuses, 50),
                np.percentile(bonuses, 95),
                total_augmented_reward,
            )

            state = self.emulator.get_initial_state()
            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return (state, total_episode_reward, steps_at_last_reward, ep_t,
                episode_ave_max_q, episode_over)

    def _double_dqn_op(self):
        q_local_action = tf.cast(
            tf.argmax(self.local_network.output_layer, axis=1), tf.int32)
        q_target_max = utils.ops.slice_2d(
            self.target_network.output_layer,
            tf.range(0, self.batch_size),
            q_local_action,
        )
        self.one_step_reward = tf.placeholder(tf.float32,
                                              self.batch_size,
                                              name='one_step_reward')
        self.is_terminal = tf.placeholder(tf.bool,
                                          self.batch_size,
                                          name='is_terminal')

        self.y_target = self.one_step_reward + self.cts_eta*self.gamma*q_target_max \
            * (1 - tf.cast(self.is_terminal, tf.float32))

        self.double_dqn_loss = self.local_network._value_function_loss(
            self.local_network.q_selected_action -
            tf.stop_gradient(self.y_target))

        self.double_dqn_grads = tf.gradients(self.double_dqn_loss,
                                             self.local_network.params)

    # def batch_update(self):
    #     if len(self.replay_memory) < self.replay_memory.maxlen//10:
    #         return

    #     s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size)

    #     feed_dict={
    #         self.one_step_reward: r_i,
    #         self.target_network.input_ph: s_f,
    #         self.local_network.input_ph: np.vstack([s_i, s_f]),
    #         self.local_network.selected_action_ph: np.vstack([a_i, a_i]),
    #         self.is_terminal: is_terminal
    #     }
    #     grads = self.session.run(self.double_dqn_grads, feed_dict=feed_dict)
    #     self.apply_gradients_to_shared_memory_vars(grads)

    def batch_update(self):
        if len(self.replay_memory) < self.replay_memory.maxlen // 10:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)
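        # first pass: the target network scores s_f while the local network's
        # argmax on s_f selects which of those values to bootstrap from (Double DQN)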

        feed_dict = {
            self.local_network.input_ph: s_f,
            self.target_network.input_ph: s_f,
            self.is_terminal: is_terminal,
            self.one_step_reward: r_i,
        }
        y_target = self.session.run(self.y_target, feed_dict=feed_dict)

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)
        self.apply_gradients_to_shared_memory_vars(grads)

    def train(self):
        """ Main actor learner loop for n-step Q learning. """
        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = list()
        a_batch = list()
        y_batch = list()
        bonuses = deque(maxlen=1000)
        episode_over = False

        t0 = time.time()
        global_steps_at_last_record = self.global_step.value()
        while (self.global_step.value() < self.max_global_steps):
            # # Sync local learning net with shared mem
            # self.sync_net_with_shared_memory(self.local_network, self.learning_vars)
            # self.save_vars()
            rewards = list()
            states = list()
            actions = list()
            max_q_values = list()
            local_step_start = self.local_step
            total_episode_reward = 0
            total_augmented_reward = 0
            episode_ave_max_q = 0
            ep_t = 0

            while not episode_over:
                # Sync local learning net with shared mem
                self.sync_net_with_shared_memory(self.local_network,
                                                 self.learning_vars)
                self.save_vars()

                # Choose next action and execute it
                a, q_values = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward
                max_q = np.max(q_values)

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                # Rescale or clip immediate reward
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)
                total_augmented_reward += reward
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)
                max_q_values.append(max_q)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += max_q

                global_step, _ = self.global_step.increment()

                if global_step % self.q_target_update_steps == 0:
                    self.update_target()
                if global_step % self.density_model_update_steps == 0:
                    self.write_density_model()

                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0
                if self.density_model_update_flags.updated[self.actor_id] == 1:
                    self.read_density_model()
                    self.density_model_update_flags.updated[self.actor_id] = 0

                if self.local_step % self.q_update_interval == 0:
                    self.batch_update()

                if self.is_master() and (self.local_step % 500 == 0):
                    bonus_array = np.array(bonuses)
                    steps = global_step - global_steps_at_last_record
                    global_steps_at_last_record = global_step

                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               steps / float(time.time() - t0)))
                    t0 = time.time()

            else:
                #compute monte carlo return
                mc_returns = np.zeros((len(rewards), ), dtype=np.float32)
                running_total = 0.0
                for i, r in enumerate(reversed(rewards)):
                    running_total = r + self.gamma * running_total
                    mc_returns[len(rewards) - i - 1] = running_total

                mixed_returns = self.cts_eta * np.asarray(rewards) + (
                    1 - self.cts_eta) * mc_returns

                #update replay memory
                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(states[i], actions[i],
                                              mixed_returns[i],
                                              i + 1 == episode_length)

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses, total_augmented_reward)
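
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original class): how a CTS-style density
# model is typically turned into the exploration bonus that density_model.update()
# returns above, following 'Unifying Count-Based Exploration and Intrinsic
# Motivation'. The helper below is hypothetical; the repo's CTSDensityModel
# computes the equivalent quantity internally.
def _pseudo_count_bonus_sketch(log_prob_before, log_prob_after, beta=0.05):
    """Exploration bonus from the density model's prediction gain on one frame.

    log_prob_before: log rho(x), log-density of the frame before the model update
    log_prob_after:  log rho'(x), log-density of the same frame after the update
    """
    import math
    prediction_gain = max(log_prob_after - log_prob_before, 0.0)
    # approximate pseudo-count N(x) ~= 1 / (exp(PG) - 1)
    pseudo_count = 1.0 / max(math.expm1(prediction_gain), 1e-10)
    # bonus of the form beta / sqrt(N(x) + 0.01)
    return beta / math.sqrt(pseudo_count + 0.01)
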
class BasePGQLearner(BaseA3CLearner):
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        self.q_update_counter = 0
        self.replay_size = args.replay_size
        self.pgq_fraction = args.pgq_fraction
        self.batch_update_size = args.batch_update_size
        scope_name = 'local_learning_{}'.format(self.actor_id)
        conf_learning = {'name': scope_name,
                         'input_shape': self.input_shape,
                         'num_act': self.num_actions,
                         'args': args}

        with tf.device('/cpu:0'):
            self.local_network = PolicyValueNetwork(conf_learning)
        with tf.device('/gpu:0'), tf.variable_scope('', reuse=True):
            self.batch_network = PolicyValueNetwork(conf_learning)
            self._build_q_ops()

        self.reset_hidden_state()
        self.replay_memory = ReplayMemory(
            self.replay_size,
            self.local_network.get_input_shape(),
            self.num_actions)
            
        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list, max_to_keep=3, 
                                        keep_checkpoint_every_n_hours=2)


    def _build_q_ops(self):
        # pgq specific initialization
        self.pgq_fraction = self.pgq_fraction
        self.batch_size = self.batch_update_size
        self.q_tilde = self.batch_network.beta * (
            self.batch_network.log_output_layer_pi
            + tf.expand_dims(self.batch_network.output_layer_entropy, 1)
        ) + self.batch_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0, num_or_size_splits=2, value=self.q_tilde)
        self.V, _ = tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.output_layer_v)
        self.log_pi, _ = tf.split(axis=0, num_or_size_splits=2, value=tf.expand_dims(self.batch_network.log_output_selected_action, 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None], name='terminal_indicator')
        self.max_TQ = self.gamma*tf.reduce_max(self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(self.Qi * tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.selected_action_ph)[0], 1)

        self.q_objective = - self.pgq_fraction * tf.reduce_mean(tf.stop_gradient(self.R + self.max_TQ - self.Q_a) * (0.5 * self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.batch_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)
        self.q_gradients = self.batch_network._clip_grads(self.q_gradients)


    def batch_q_update(self):
        if len(self.replay_memory) < self.replay_memory.maxlen//10:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size)

        batch_grads = self.session.run(
            self.q_gradients,
            feed_dict={
                self.R: r_i,
                self.batch_network.selected_action_ph: np.vstack([a_i, a_i]),
                self.batch_network.input_ph: np.vstack([s_i, s_f]),
                self.terminal_indicator: is_terminal.astype(np.int),
            }
        )
        self.apply_gradients_to_shared_memory_vars(batch_grads)
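
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original code): the action-value estimate
# recovered from the policy and value heads by q_tilde in _build_q_ops above,
#   Q~(s, a) = beta * (log pi(a|s) + H(pi(.|s))) + V(s).
# A minimal single-state version; the argument names are hypothetical.
def _pgq_q_tilde_sketch(policy_logits, v, beta):
    """Return Q~(s, .) for one state from raw policy logits and a value estimate."""
    import numpy as np
    logits = np.asarray(policy_logits, dtype=np.float64)
    # numerically stable log-softmax: log pi = logits - logsumexp(logits)
    log_pi = logits - logits.max() - np.log(np.sum(np.exp(logits - logits.max())))
    entropy = -np.sum(np.exp(log_pi) * log_pi)
    return beta * (log_pi + entropy) + v
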
class DQNAgent:
    def __init__(self, config):
        self.config = config

        self.logger = logging.getLogger("DQNAgent")

        # define models (policy and target)
        self.policy_model = DQN(self.config)
        self.target_model = DQN(self.config)

        # define memory
        self.memory = ReplayMemory(self.config)

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.RMSprop(self.policy_model.parameters())

        # define environment
        self.env = gym.make('CartPole-v0').unwrapped
        self.cartpole = CartPoleEnv(self.config.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = self.config.batch_size

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()
        if self.is_cuda and not self.config.cuda:
            self.logger.info(
                "WARNING: You have a CUDA device, so you should probably enable CUDA"
            )

        self.cuda = self.is_cuda & self.config.cuda

        if self.cuda:
            self.logger.info("Program will run on *****GPU-CUDA***** ")
            print_cuda_statistics()
            self.device = torch.device("cuda")
            torch.cuda.set_device(self.config.gpu_device)
        else:
            self.logger.info("Program will run on *****CPU***** ")
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        # Summary Writer
        self.summary_writer = SummaryWriter(log_dir=self.config.summary_dir,
                                            comment='DQN')

    def load_checkpoint(self, file_name):
        filename = self.config.checkpoint_dir + file_name
        try:
            self.logger.info("Loading checkpoint '{}'".format(filename))
            checkpoint = torch.load(filename)

            self.current_episode = checkpoint['episode']
            self.current_iteration = checkpoint['iteration']
            self.policy_model.load_state_dict(checkpoint['state_dict'])
            self.optim.load_state_dict(checkpoint['optimizer'])

            self.logger.info(
                "Checkpoint loaded successfully from '{}' at episode {}, iteration {}\n"
                .format(self.config.checkpoint_dir, checkpoint['episode'],
                        checkpoint['iteration']))
        except OSError as e:
            self.logger.info(
                "No checkpoint exists from '{}'. Skipping...".format(
                    self.config.checkpoint_dir))
            self.logger.info("**First time to train**")

    def save_checkpoint(self, file_name="checkpoint.pth.tar", is_best=0):
        state = {
            'episode': self.current_episode,
            'iteration': self.current_iteration,
            'state_dict': self.policy_model.state_dict(),
            'optimizer': self.optim.state_dict(),
        }
        # Save the state
        torch.save(state, self.config.checkpoint_dir + file_name)
        # If it is the best copy it to another file 'model_best.pth.tar'
        if is_best:
            shutil.copyfile(self.config.checkpoint_dir + file_name,
                            self.config.checkpoint_dir + 'model_best.pth.tar')

    def run(self):
        """
        This function runs the operator
        :return:
        """
        try:
            self.train()

        except KeyboardInterrupt:
            self.logger.info("You have entered CTRL+C.. Wait to finalize")

    def select_action(self, state):
        """
        The action selection function: it either uses the model to choose an action or samples one uniformly at random.
        :param state: current state of the model
        :return:
        """
        if self.cuda:
            state = state.cuda()
        sample = random.random()
        eps_threshold = self.config.eps_end + (
            self.config.eps_start - self.config.eps_end) * math.exp(
                -1. * self.current_iteration / self.config.eps_decay)
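        # NOTE: the threshold decays exponentially from eps_start towards eps_end:
        # eps(t) = eps_end + (eps_start - eps_end) * exp(-t / eps_decay)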
        self.current_iteration += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1,
                                                               1)  # size (1,1)
        else:
            return torch.tensor([[random.randrange(2)]],
                                device=self.device,
                                dtype=torch.long)

    def optimize_policy_model(self):
        """
        performs a single step of optimization for the policy model
        :return:
        """
        if self.memory.length() < self.batch_size:
            return
        # sample a batch
        transitions = self.memory.sample_batch(self.batch_size)

        one_batch = Transition(*zip(*transitions))

        # create a mask of non-final states
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, one_batch.next_state)),
                                      device=self.device,
                                      dtype=torch.uint8)  # [128]
        non_final_next_states = torch.cat([
            s for s in one_batch.next_state if s is not None
        ])  # [< 128, 3, 40, 80]

        # concatenate all batch elements into one
        state_batch = torch.cat(one_batch.state)  # [128, 3, 40, 80]
        action_batch = torch.cat(one_batch.action)  # [128, 1]
        reward_batch = torch.cat(one_batch.reward)  # [128]

        state_batch = state_batch.to(self.device)
        non_final_next_states = non_final_next_states.to(self.device)

        curr_state_values = self.policy_model(state_batch)  # [128, 2]
        curr_state_action_values = curr_state_values.gather(
            1, action_batch)  # [128, 1]

        # Get V(s_{t+1}) for all next states. By definition we set V(s)=0 if s is a terminal state.
        next_state_values = torch.zeros(self.batch_size,
                                        device=self.device)  # [128]
        next_state_values[non_final_mask] = self.target_model(
            non_final_next_states).max(1)[0].detach()  # [< 128]

        # Get the expected Q values
        expected_state_action_values = (
            next_state_values * self.config.gamma) + reward_batch  # [128]
        # compute loss: temporal difference error
        loss = self.loss(curr_state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # optimizer step
        self.optim.zero_grad()
        loss.backward()
        for param in self.policy_model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim.step()

        return loss

    def train(self):
        """
        Training loop based on the number of episodes
        :return:
        """
        for episode in tqdm(
                range(self.current_episode, self.config.num_episodes)):
            self.current_episode = episode
            # reset environment
            self.env.reset()
            self.train_one_epoch()
            # The target network has its weights kept frozen most of the time
            if self.current_episode % self.config.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

        self.env.render()
        self.env.close()

    def train_one_epoch(self):
        """
        One episode of training: at each step it selects an action, observes the next screen, and optimizes the policy model once
        :return:
        """
        episode_duration = 0
        prev_frame = self.cartpole.get_screen(self.env)
        curr_frame = self.cartpole.get_screen(self.env)
        # get state
        curr_state = curr_frame - prev_frame

        while True:
            episode_duration += 1
            # select action
            action = self.select_action(curr_state)
            # perform action and get reward
            _, reward, done, _ = self.env.step(action.item())

            reward = torch.Tensor([reward]).to(self.device)

            prev_frame = curr_frame
            curr_frame = self.cartpole.get_screen(self.env)
            # assign next state
            if done:
                next_state = None
            else:
                next_state = curr_frame - prev_frame

            # add this transition into memory
            self.memory.push_transition(curr_state, action, next_state, reward)

            curr_state = next_state

            # Policy model optimization step
            curr_loss = self.optimize_policy_model()
            if curr_loss is not None:
                if self.cuda:
                    curr_loss = curr_loss.cpu()
                self.summary_writer.add_scalar("Temporal Difference Loss",
                                               curr_loss.detach().numpy(),
                                               self.current_iteration)
            # check if done
            if done:
                break

        self.summary_writer.add_scalar("Training Episode Duration",
                                       episode_duration, self.current_episode)

    def validate(self):
        pass

    def finalize(self):
        """
        Finalize all operations of the two main classes of the process: the operator and the data loader
        :return:
        """
        self.logger.info(
            "Please wait while finalizing the operation.. Thank you")
        self.save_checkpoint()
        self.summary_writer.export_scalars_to_json("{}all_scalars.json".format(
            self.config.summary_dir))
        self.summary_writer.close()
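
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of DQNAgent): the TD target assembled in
# optimize_policy_model above, with V(s') = 0 for terminal next states.
# Argument names are hypothetical; q_target_next_nonfinal holds the target
# network's Q values for the non-terminal next states only.
def _dqn_td_target_sketch(rewards, q_target_next_nonfinal, non_final_mask, gamma=0.99):
    """y = r + gamma * max_a Q_target(s', a), masking out terminal next states."""
    import numpy as np
    rewards = np.asarray(rewards, dtype=np.float32)
    next_values = np.zeros_like(rewards)
    mask = np.asarray(non_final_mask, dtype=bool)
    next_values[mask] = np.asarray(q_target_next_nonfinal, dtype=np.float32).max(axis=1)
    return rewards + gamma * next_values
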
class PseudoCountQLearner(ValueBasedLearner):
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        # more Cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def generate_final_epsilon(self):
        return 0.1

    def _get_summary_vars(self):
        q_vars = super(PseudoCountQLearner, self)._get_summary_vars()

        bonus_q25 = tf.Variable(0., name='novelty_bonus_q25')
        s1 = tf.summary.scalar('Novelty_Bonus_q25_{}'.format(self.actor_id),
                               bonus_q25)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q75 = tf.Variable(0., name='novelty_bonus_q75')
        s3 = tf.summary.scalar('Novelty_Bonus_q75_{}'.format(self.actor_id),
                               bonus_q75)

        return q_vars + [bonus_q25, bonus_q50, bonus_q75]

    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses):
        # prevent the agent from getting stuck
        reset_game = False
        if (self.local_step - steps_at_last_reward > 5000
                or (self.emulator.get_lives() == 0
                    and self.emulator.game not in ONE_LIFE_GAMES)):

            steps_at_last_reward = self.local_step
            episode_over = True
            reset_game = True

        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value()
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()

            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))

            if self.is_master() and self.is_train:
                stats = [
                    total_episode_reward,
                    episode_ave_max_q,
                    self.epsilon,
                    np.percentile(bonuses, 25),
                    np.percentile(bonuses, 50),
                    np.percentile(bonuses, 75),
                ]
                feed_dict = {
                    self.summary_ph[i]: stats[i]
                    for i in range(len(stats))
                }
                res = self.session.run(self.update_ops + [self.summary_op],
                                       feed_dict=feed_dict)
                self.summary_writer.add_summary(res[-1],
                                                self.global_step.value())

            if reset_game or self.emulator.game in ONE_LIFE_GAMES:
                state = self.emulator.get_initial_state()

            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over

    def batch_update(self):
        if len(self.replay_memory) < self.batch_size:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        q_target_values = self.session.run(
            self.target_network.output_layer,
            feed_dict={self.target_network.input_ph: s_f})
        y_target = r_i + self.cts_eta * self.gamma * q_target_values.max(
            axis=1) * (1 - is_terminal.astype(np.int))

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)

        self.apply_gradients_to_shared_memory_vars(grads)

    def _run(self):
        """ Main actor learner loop for n-step Q learning. """
        if not self.is_train:
            return self.test()

        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = []
        a_batch = []
        y_batch = []
        bonuses = deque(maxlen=100)

        exec_update_target = False
        total_episode_reward = 0
        episode_ave_max_q = 0
        episode_over = False
        qmax_down = 0
        qmax_up = 0
        prev_qmax = -10 * 6
        low_qmax = 0
        ep_t = 0

        t0 = time.time()
        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            rewards = []
            states = []
            actions = []
            local_step_start = self.local_step

            while not episode_over:
                # Choose next action and execute it
                a, readout_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               100. / (time.time() - t0)))
                    t0 = time.time()

                # Rescale or clip immediate reward
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += np.max(readout_t)

                global_step, update_target = self.global_step.increment(
                    self.q_target_update_steps)

                if update_target:
                    update_target = False
                    exec_update_target = True

                if self.local_step % 4 == 0:
                    self.batch_update()

                self.local_network.global_step = global_step

            else:
                mc_returns = list()
                running_total = 0.0
                for r in reversed(rewards):
                    running_total = r + self.gamma * running_total
                    mc_returns.insert(0, running_total)

                mixed_returns = self.cts_eta * np.array(rewards) + (
                    1 - self.cts_eta) * np.array(mc_returns)

                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(
                        (states[i], actions[i], mixed_returns[i],
                         states[i + 1], i + 1 == episode_length))

            if exec_update_target:
                self.update_target()
                exec_update_target = False
                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses)
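
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original code): the mixed Monte Carlo
# return stored in the replay memory above, R_mix = eta * r + (1 - eta) * R_MC,
# where R_MC is the discounted return computed backwards over the episode.
def _mixed_returns_sketch(rewards, gamma=0.99, eta=0.9):
    """Blend the (clipped) one-step rewards with the discounted Monte Carlo return."""
    mc_returns = []
    running_total = 0.0
    for r in reversed(rewards):
        running_total = r + gamma * running_total
        mc_returns.insert(0, running_total)
    return [eta * r + (1.0 - eta) * g for r, g in zip(rewards, mc_returns)]
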
class BasePGQLearner(BaseA3CLearner):
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        # args.entropy_regularisation_strength = 0.0
        conf_learning = {
            'name': 'local_learning_{}'.format(self.actor_id),
            'input_shape': self.input_shape,
            'num_act': self.num_actions,
            'args': args
        }

        self.local_network = PolicyValueNetwork(conf_learning)
        self.reset_hidden_state()

        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list,
                                        max_to_keep=3,
                                        keep_checkpoint_every_n_hours=2)

        # pgq specific initialization
        self.batch_size = 32
        self.pgq_fraction = args.pgq_fraction
        self.replay_memory = ReplayMemory(args.replay_size)
        self.q_tilde = self.local_network.beta * (
            self.local_network.log_output_layer_pi +
            tf.expand_dims(self.local_network.output_layer_entropy,
                           1)) + self.local_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0,
                                           num_or_size_splits=2,
                                           value=self.q_tilde)
        self.V, _ = tf.split(axis=0,
                             num_or_size_splits=2,
                             value=self.local_network.output_layer_v)
        self.log_pi, _ = tf.split(
            axis=0,
            num_or_size_splits=2,
            value=tf.expand_dims(self.local_network.log_output_selected_action,
                                 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None],
                                                 name='terminal_indicator')
        self.max_TQ = self.gamma * tf.reduce_max(
            self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(
            self.Qi * tf.split(axis=0,
                               num_or_size_splits=2,
                               value=self.local_network.selected_action_ph)[0],
            1)

        self.q_objective = -self.pgq_fraction * tf.reduce_mean(
            tf.stop_gradient(self.R + self.max_TQ - self.Q_a) *
            (self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.local_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)

        if self.local_network.clip_norm_type == 'global':
            self.q_gradients = tf.clip_by_global_norm(
                self.q_gradients, self.local_network.clip_norm)[0]
        elif self.local_network.clip_norm_type == 'local':
            self.q_gradients = [
                tf.clip_by_norm(g, self.local_network.clip_norm)
                for g in self.q_gradients
            ]

        if (self.optimizer_mode == "local"):
            if (self.optimizer_type == "rmsprop"):
                self.batch_opt_st = np.ones(size, dtype=ctypes.c_float)
            else:
                self.batch_opt_st = np.zeros(size, dtype=ctypes.c_float)
        elif (self.optimizer_mode == "shared"):
            self.batch_opt_st = args.batch_opt_state

    def apply_batch_q_update(self):
        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        batch_grads, max_TQ, Q_a = self.session.run(
            [self.q_gradients, self.max_TQ, self.Q_a],
            feed_dict={
                self.R: r_i,
                self.local_network.selected_action_ph: np.vstack([a_i, a_i]),
                self.local_network.input_ph: np.vstack([s_i, s_f]),
                self.terminal_indicator: is_terminal.astype(np.int),
            })
        # print 'max_TQ={}, Q_a={}'.format(max_TQ[:5], Q_a[:5])

        self._apply_gradients_to_shared_memory_vars(batch_grads,
                                                    opt_st=self.batch_opt_st)

    def softmax(self, x, temperature):
        # copy to avoid mutating the caller's array; subtract the max for stability
        x = np.asarray(x, dtype=np.float64) / temperature
        exp_x = np.exp(x - np.max(x))

        return exp_x / exp_x.sum()
class PseudoCountQLearner(ValueBasedLearner):
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        # more Cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def generate_final_epsilon(self):
        return 0.1

    def batch_update(self):
        if len(self.replay_memory) < self.batch_size:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        q_target_values = self.session.run(
            self.target_network.output_layer,
            feed_dict={self.target_network.input_ph: s_f})
        y_target = r_i + self.cts_eta * self.gamma * q_target_values.max(
            axis=1) * (1 - is_terminal.astype(np.int))

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)

        self.apply_gradients_to_shared_memory_vars(grads)

    def _run(self):
        """ Main actor learner loop for n-step Q learning. """
        if not self.is_train:
            return self.test()

        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = []
        a_batch = []
        y_batch = []
        bonuses = deque(maxlen=100)

        exec_update_target = False
        total_episode_reward = 0
        episode_ave_max_q = 0
        episode_over = False
        qmax_down = 0
        qmax_up = 0
        prev_qmax = -10 * 6
        low_qmax = 0
        ep_t = 0

        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            rewards = []
            states = []
            actions = []
            local_step_start = self.local_step

            while not episode_over:
                # Choose next action and execute it
                a, readout_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug('Mean Bonus={:.4f} / Max Bonus={:.4f}'.format(
                        bonus_array.mean(), bonus_array.max()))

                # Rescale or clip immediate reward
                # reward = self.rescale_reward(self.rescale_reward(reward) + bonus)
                reward = self.rescale_reward(reward)
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += np.max(readout_t)

                global_step, update_target = self.global_step.increment(
                    self.q_target_update_steps)

                if update_target:
                    update_target = False
                    exec_update_target = True

                if self.local_step % 4 == 0:
                    self.batch_update()

                self.local_network.global_step = global_step

            else:
                mc_returns = list()
                running_total = 0.0
                for r in reversed(rewards):
                    running_total = r + self.gamma * running_total
                    mc_returns.insert(0, running_total)

                mixed_returns = self.cts_eta * np.array(rewards) + (
                    1 - self.cts_eta) * np.array(mc_returns)

                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(
                        (states[i], actions[i], mixed_returns[i],
                         states[i + 1], i + 1 == episode_length))

            if exec_update_target:
                self.update_target()
                exec_update_target = False
                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over)
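
# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original code): the Double DQN target built
# by the _double_dqn_op methods in this file, including the cts_eta scaling:
#   y = r + eta * gamma * Q_target(s', argmax_a Q_local(s', a)) * (1 - terminal)
# All argument names are hypothetical.
def _double_dqn_target_sketch(rewards, q_local_next, q_target_next, is_terminal,
                              gamma=0.99, eta=0.9):
    """Bootstrap from the target network at the local network's greedy action."""
    import numpy as np
    rewards = np.asarray(rewards, dtype=np.float32)
    q_local_next = np.asarray(q_local_next, dtype=np.float32)
    q_target_next = np.asarray(q_target_next, dtype=np.float32)
    greedy_actions = np.argmax(q_local_next, axis=1)
    bootstrap = q_target_next[np.arange(len(rewards)), greedy_actions]
    not_terminal = 1.0 - np.asarray(is_terminal, dtype=np.float32)
    return rewards + eta * gamma * bootstrap * not_terminal
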
class AElearner(ValueBasedLearner, DensityModelMixinAE):
    def __init__(self, args):
        self.args = args

        super(AElearner, self).__init__(args)
        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.ae_delta = args.ae_delta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(
            args.replay_size,
            self.local_network_upper.get_input_shape(),
            # self.local_network.get_input_shape(),
            self.num_actions)
        # initialize the density model (this also chooses how many steps pass
        # between shared density-model updates: 20 * the Q target update interval)
        self._init_density_model(args)
        # build the double-DQN target and loss ops
        self._double_dqn_op()
        self.which_net_to_update_counter = 0
        self.ae_counter = 0
        self.epsilon_greedy_counter = 0
        self.total_ae_counter = 0
        self.total_epsilon_greedy_counter = 0
        self.q_values_upper_max = []
        self.q_values_lower_max = []
        self.ae_valid_actions = True
        self.action_meanings = self.emulator.env.unwrapped.get_action_meanings(
        )
        self.minimized_actions_counter = {
            value: 0
            for value in self.action_meanings
        }
        print(self.minimized_actions_counter)
        # print("In AE class")

    def beta_function(self, A, S, delta, k, Vmax, c):
        #print (utils.fast_cts.__name__)
        #print("This is temp {}".format(temp))
        #print("This is k")
        #print(k)
        if k < 1:
            k = 1
        # print("c is : {}".format(c))
        # print("k is : {}".format(k))
        # print("S is : {}".format(S))
        # print("A is : {}".format(A))
        # print("delta is : {}".format(delta))
        # # print("c*(k-1)*(k-1)*S*A is : {}".format(c*(k-1)*(k-1)*S*A))
        # print("c*(k1)*(k1)*S*A/delta is : {}".format(c*(k)*(k)*S*A/delta))
        # print("math.log(c*(k-1)*(k-1)*S*A/delta is : {}".format(math.log(c*(k)*(k)*S*A/delta)))
        # #k = math.maximum(k,1)
        # z = 5
        # assert(math.isnan(5))
        assert (not math.isnan(math.log(
            c * k * k * S * A / delta))), "log of left is nan"
        left = math.sqrt(k * math.log(c * k * k * S * A / delta))
        assert (not math.isnan(left)), " left side of beta is Nan"

        if k == 1:
            right = 0
        else:
            right = math.sqrt(
                (k - 1) *
                math.log(c * (k - 1) *
                         (k - 1) * S * A / delta))  #the error is here
        assert (not math.isnan(right)), " right side of beta is Nan"

        beta = k * Vmax * (left - (1 - 1 / k) * right)
        assert (not math.isnan(beta)), "beta is NaN"
        return beta

    ### NOTE: call this with the upper Q network
    ### Returns the minimized action pool after action elimination (AE), as in the
    ### paper: actions are kept only if their upper Q value is at least V lower
    def minimize_action_pool(self, state):
        new_actions = np.zeros([self.num_actions])
        #TODO get q upperbound values
        #TODO: target or local ???
        # q_values_upper = self.session.run(
        #         self.target_network_upper.output_layer,
        #         feed_dict={self.target_network_upper.input_ph: [state]})[0]
        # q_values_lower = self.session.run(
        #         self.target_network_lower.output_layer,
        #         feed_dict={self.target_network_lower.input_ph: [state]})[0]
        q_values_upper = self.session.run(
            self.local_network_upper.output_layer,
            feed_dict={self.local_network_upper.input_ph: [state]})[0]
        q_values_lower = self.session.run(
            self.local_network_lower.output_layer,
            feed_dict={self.local_network_lower.input_ph: [state]})[0]
        #TODO V lower upperbound
        Vlow = max(q_values_lower)
        Vhigh = max(q_values_upper)
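        # Vlow is the best pessimistic value; actions whose optimistic (upper)
        # estimate falls below it are eliminated from the pool below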
        #print("q_values_lower: {} / q_values_upper: {}".format(q_values_lower, q_values_upper))

        # print("The value of Vlow is {}".format(Vlow))
        for index, action in enumerate(new_actions):
            new_actions[index] = q_values_upper[index] >= Vlow
            if q_values_upper[index] < Vlow:
                self.minimized_actions_counter[
                    self.action_meanings[index]] += 1
            # print("The value of q_values_upper on index: {} is :{}".format(index,q_values_upper[index]))
        #print("new actions are:  {}".format(new_actions))
        #print("new actions array: {}".format(new_actions))
        return new_actions, q_values_lower, q_values_upper

    def choose_next_action(self, state):
        #print("we use our AE new algorithm choose next action")
        new_action = np.zeros([self.num_actions])
        q_values = self.session.run(
            self.local_network_upper.output_layer,
            feed_dict={self.local_network_upper.input_ph: [state]})[0]
        # q_values_upper = self.session.run(
        #         self.target_network_upper.output_layer,
        #         feed_dict={self.target_network_upper.input_ph: [state]})[0]
        # q_values_lower = self.session.run(
        #         self.target_network_lower.output_layer,
        #         feed_dict={self.target_network_lower.input_ph: [state]})[0]
        # Vlow = max(q_values_lower)
        # Vhigh = max(q_values_upper)
        # print("Vlow is: {}".format(Vlow))
        # print("q_upper values: {}".format(q_values_upper))

        #self.q_values_lower_max.append(Vlow)
        #self.q_values_lower_max.append(Vhigh)

        #print("q_upper: {}".format(q_upper_curr))
        #print("q_lower: {}".format(q_lower_curr))
        secure_random = random.SystemRandom()
        action_pool, q_values_lower, q_values_upper = self.minimize_action_pool(
            state)
        if self.local_step % 500 == 0:
            #num_actions_minimized = self.num_actions - np.sum(action_pool)

            #minimized_actions = [ self.action_meanings[index] for index,value in enumerate (action_pool) if value == 0 ]
            logger.info('Total minimized actions: {0} / LOCAL STEP {1}'.format(
                self.minimized_actions_counter, self.local_step))
        #print("action pool is: {}".format(action_pool))
        # print("The action pool {}".format(action_pool))
        random_index = secure_random.randrange(0, len(action_pool))
        indexes_valid_actions = []
        for i, item in enumerate(action_pool):
            if item == 1:
                indexes_valid_actions.append(i)

        # If no actions survive elimination, fall back to epsilon-greedy
        # over all actions
        if not indexes_valid_actions:
            #print("q_values_lower: {} / q_values_upper: {}".format(q_values_lower, q_values_upper))
            #print("no valid ae actions!!! - use epsilon greedy")
            self.ae_valid_actions = False
            self.epsilon_greedy_counter += 1
            super_return = super(AElearner, self).choose_next_action(state)
            return super_return[0], super_return[
                1], q_values_lower, q_values_upper
        self.ae_counter += 1
        random_index = secure_random.choice(indexes_valid_actions)

        new_action[random_index] = 1
        self.reduce_thread_epsilon()
        #print("succefuly eliminated actions")
        #print("new action is: {}".format(new_action))
        #print("q_values (upper): {}".format(q_values))
        return new_action, q_values, q_values_lower, q_values_upper

    def generate_final_epsilon(self):
        if self.num_actor_learners == 1:
            return self.args.final_epsilon
        else:
            return super(AElearner, self).generate_final_epsilon()

    def _get_summary_vars(self):
        q_vars = super(AElearner, self)._get_summary_vars()

        bonus_q05 = tf.Variable(0., name='novelty_bonus_q05')
        s1 = tf.summary.scalar('Novelty_Bonus_q05_{}'.format(self.actor_id),
                               bonus_q05)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q95 = tf.Variable(0., name='novelty_bonus_q95')
        s3 = tf.summary.scalar('Novelty_Bonus_q95_{}'.format(self.actor_id),
                               bonus_q95)

        augmented_reward = tf.Variable(0., name='augmented_episode_reward')
        s4 = tf.summary.scalar(
            'Augmented_Episode_Reward_{}'.format(self.actor_id),
            augmented_reward)

        return q_vars + [bonus_q05, bonus_q50, bonus_q95, augmented_reward]

    #TODO: refactor to make this cleaner
    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses,
                      total_augmented_reward, q_values_lower, q_values_upper):
        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value() * self.max_local_steps
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()
            print("Used AE for {} times".format(self.ae_counter))
            print("Used Epsilon greedy for {} times".format(
                self.epsilon_greedy_counter))
            self.total_ae_counter += self.ae_counter
            self.total_epsilon_greedy_counter += self.epsilon_greedy_counter
            self.ae_counter = 0
            self.epsilon_greedy_counter = 0
            print("Total count of use of AE is {} :".format(
                self.total_ae_counter))
            print("Total count of use of Epsilone Greedy {}".format(
                self.total_epsilon_greedy_counter))
            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} +- {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))
            logger.info("q_values_lower: {} / q_values_upper: {}".format(
                q_values_lower, q_values_upper))
            #print(" T type {}".format(type(T)))
            self.vis.plot_current_errors(T, total_episode_reward)
            #self.vis.plot_total_ae_counter(T,self.minimized_actions_counter, self.action_meanings)
            self.vis.plot_q_values(T, q_values_lower, q_values_upper,
                                   self.action_meanings)
            self.wr.writerow([T])
            self.wr.writerow([total_episode_reward])
            #print(" total episode reward type {}".format(type(total_episode_reward)))

            #print ('[%s]' % ', '.join(map(str, t.vis.plot_data['X'])))

            self.log_summary(
                total_episode_reward,
                episode_ave_max_q,
                self.epsilon,
                np.percentile(bonuses, 5),
                np.percentile(bonuses, 50),
                np.percentile(bonuses, 95),
                total_augmented_reward,
            )

            state = self.emulator.get_initial_state()
            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return (state, total_episode_reward, steps_at_last_reward, ep_t,
                episode_ave_max_q, episode_over)

    def _double_dqn_op(self):
        q_local_action_lower = tf.cast(
            tf.argmax(self.local_network_lower.output_layer, axis=1), tf.int32)
        q_target_max_lower = utils.ops.slice_2d(
            self.target_network_lower.output_layer,
            tf.range(0, self.batch_size),
            q_local_action_lower,
        )

        q_local_action_upper = tf.cast(
            tf.argmax(self.local_network_upper.output_layer, axis=1), tf.int32)
        q_target_max_upper = utils.ops.slice_2d(
            self.target_network_upper.output_layer,
            tf.range(0, self.batch_size),
            q_local_action_upper,
        )

        self.one_step_reward = tf.placeholder(tf.float32,
                                              self.batch_size,
                                              name='one_step_reward')
        self.is_terminal = tf.placeholder(tf.bool,
                                          self.batch_size,
                                          name='is_terminal')

        self.y_target_lower = self.one_step_reward + self.cts_eta*self.gamma*q_target_max_lower \
            * (1 - tf.cast(self.is_terminal, tf.float32))

        self.y_target_upper = self.one_step_reward + self.cts_eta*self.gamma*q_target_max_upper \
            * (1 - tf.cast(self.is_terminal, tf.float32))

        self.double_dqn_loss_lower = self.local_network_lower._value_function_loss(
            self.local_network_lower.q_selected_action -
            tf.stop_gradient(self.y_target_lower))

        self.double_dqn_loss_upper = self.local_network_upper._value_function_loss(
            self.local_network_upper.q_selected_action -
            tf.stop_gradient(self.y_target_upper))
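        # stop_gradient keeps the y targets fixed during differentiation, so
        # the loss gradients flow only through the online networks' Q values.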

        self.double_dqn_grads_lower = tf.gradients(
            self.double_dqn_loss_lower, self.local_network_lower.params)
        self.double_dqn_grads_upper = tf.gradients(
            self.double_dqn_loss_upper, self.local_network_upper.params)

    def batch_update(self):
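        """
        Sample a batch from the replay memory and update both value networks.

        The stored mixed return r_i is augmented with the novelty bonus b_i:
        the upper network is trained towards a target built from r_i + b_i and
        the lower network towards one built from r_i - b_i, using the
        double-DQN ops defined in _double_dqn_op.
        """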
        if len(self.replay_memory) < self.replay_memory.maxlen // 10:
            return
        #TODO: check whether separate replay memories are needed for the two networks
        s_i, a_i, r_i, s_f, is_terminal, b_i = self.replay_memory.sample_batch(
            self.batch_size)

        # Upper network update: target built from the bonus-augmented return r_i + b_i

        feed_dict = {
            self.local_network_upper.input_ph: s_f,
            self.target_network_upper.input_ph: s_f,
            self.is_terminal: is_terminal,
            self.one_step_reward: r_i + b_i,
        }
        y_target_upper = self.session.run(self.y_target_upper,
                                          feed_dict=feed_dict)
        feed_dict = {
            self.local_network_upper.input_ph: s_i,
            self.local_network_upper.target_ph: y_target_upper,
            self.local_network_upper.selected_action_ph: a_i
        }
        #TODO: NaN values have occasionally shown up in these gradients
        grads = self.session.run(self.local_network_upper.get_gradients,
                                 feed_dict=feed_dict)

        #assert (not tf.debugging.is_nan(grads)) , " upper local network grads are nan"
        self.apply_gradients_to_shared_memory_vars(grads,
                                                   upper_or_lower="Upper")
        # Lower network update: target built from the bonus-penalized return r_i - b_i
        feed_dict = {
            self.local_network_lower.input_ph: s_f,
            self.target_network_lower.input_ph: s_f,
            self.is_terminal: is_terminal,
            self.one_step_reward: r_i - b_i,
        }
        y_target_lower = self.session.run(self.y_target_lower,
                                          feed_dict=feed_dict)

        feed_dict = {
            self.local_network_lower.input_ph: s_i,
            self.local_network_lower.target_ph: y_target_lower,
            self.local_network_lower.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network_lower.get_gradients,
                                 feed_dict=feed_dict)
        #assert (not tf.debugging.is_nan(grads)) , " lower local network grads are nan"
        self.apply_gradients_to_shared_memory_vars(grads,
                                                   upper_or_lower="Lower")

    def train(self):
        """ Main actor learner loop for n-step Q learning. """
        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()
        # print(" In train of AE")
        s_batch = list()
        a_batch = list()
        y_batch = list()
        bonuses = deque(maxlen=1000)
        episode_over = False

        t0 = time.time()
        global_steps_at_last_record = self.global_step.value()
        while (self.global_step.value() < self.max_global_steps):
            rewards = list()
            states = list()
            actions = list()
            max_q_values = list()
            bonuses = list()
            local_step_start = self.local_step
            total_episode_reward = 0
            total_augmented_reward = 0
            episode_ave_max_q = 0
            ep_t = 0
            action_count = 0

            while not episode_over:

                # Heuristic parameters for the exploration-bonus function
                # (beta_function below): A is the action-space size, S is a
                # rough stand-in for the state-space size, and delta, Vmax and
                # c control the scale of the returned bonus.
                A = self.num_actions
                S = 100000000
                factor_delta_scale_to_1_10 = 100000000000
                factor_delta_actual = 100
                factor_divide_bonus_scale_to_0_10 = 100000000
                factor_divide_bonus = 1000
                delta = self.ae_delta * factor_delta_actual
                #delta = self.ae_delta * factor_delta  # experimental: trying to stabilize the bonus over the whole run
                Vmax = 100000
                c = 5

                # Sync local learning net with shared mem
                #TODO: upper / lower
                self.sync_net_with_shared_memory(self.local_network_lower,
                                                 self.learning_vars_lower)
                self.sync_net_with_shared_memory(self.local_network_upper,
                                                 self.learning_vars_upper)
                self.save_vars()

                # Choose next action and execute it
                # print("intrinsic motivation print")

                a, q_values, q_values_lower, q_values_upper = self.choose_next_action(
                    s)
                action_count += 1
                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward
                max_q = np.max(q_values)
                prev_s = s
                current_frame = new_s[..., -1]
                prev_frame = prev_s[..., -1]
                #print("This is a {}".format(a))
                index_of_a = np.argmax(a)
                ## TODO change back to update 2 and understand the underlying
                ## cython code
                k = (self.density_model[index_of_a]).update(prev_frame)
                #print(type(self.density_model[index_of_a]))
                assert (not math.isnan(k)), "k is nan"
                # print("K value is {}".format(k))
                #You should trace the update call here, as I recall it leads to the c funtion in a c file
                # And not to the python function

                ## TODO: change S to the correct number (numebr of states until now or what is supposed to be double by k)
                # k = k * S

                bonus = self.beta_function(A, S, delta, k, Vmax, c)

                # The bonus is not meant to be the raw output of beta_function;
                # in the paper it is normalized by k.
                # TODO: decide what the final bonus should be
                if k > 1:
                    bonus = bonus / k
                    bonus = bonus / factor_divide_bonus
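                # For reference, the pseudo-count bonus in Bellemare et al. (2016,
                # arXiv:1606.01868) is beta * (N_hat + 0.01) ** -0.5, where N_hat is
                # the pseudo-count derived from the density model's prediction gain;
                # here beta_function takes that role and k stands in for the count.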

                #print("this is k")
                #print (k)

                #bonus = 1196518.8710327097
                #print("bonus is: {}".format(bonus))

                # Rescale or clip immediate reward
                reward = self.rescale_reward(reward)
                # TODO figure out how to rescale bonus
                #bonus = self.rescale_reward(bonus)
                total_augmented_reward += reward
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)
                bonuses.append(bonus)
                max_q_values.append(max_q)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += max_q

                global_step, _ = self.global_step.increment()
                # Periodically update the shared target networks
                if global_step % self.q_target_update_steps == 0:
                    self.update_target()
                    logger.debug("Updating the target networks")
                # Periodically write out the density model for the chosen action
                # so that other actor-learners can re-read it
                if global_step % self.density_model_update_steps == 0:
                    self.write_density_model(np.argmax(a))  # index of the chosen action

                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network_lower,
                                                     self.target_vars_lower)
                    self.sync_net_with_shared_memory(self.target_network_upper,
                                                     self.target_vars_upper)
                    #TODO: check if needed to duplicate target_update_flags for both nets
                    self.target_update_flags.updated[self.actor_id] = 0
                #print("type of self.density_model_updated: {}".format(type(self.density_model_update_flags)))
                for action in range(len(self.density_model_update_flags)):
                    if self.density_model_update_flags[action].updated[
                            self.actor_id] == 1:
                        # Re-read the per-action density model that was flagged
                        # as updated
                        self.read_density_model(action)
                        self.density_model_update_flags[action].updated[
                            self.actor_id] = 0

                if self.local_step % self.q_update_interval == 0:
                    self.batch_update()
                    self.which_net_to_update_counter += 1

                if self.is_master() and (self.local_step % 500 == 0):
                    bonus_array = np.array(bonuses)
                    steps = global_step - global_steps_at_last_record
                    global_steps_at_last_record = global_step
                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={:.1f}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               steps / float(time.time() - t0)))
                    t0 = time.time()

            else:
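                # This while/else branch runs once the inner loop exits
                # normally, i.e. when the episode has terminated.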
                # Compute the discounted Monte-Carlo return for each step of the episode
                mc_returns = np.zeros((len(rewards), ), dtype=np.float32)
                running_total = 0.0
                for i, r in enumerate(reversed(rewards)):
                    running_total = r + self.gamma * running_total
                    mc_returns[len(rewards) - i - 1] = running_total

                mixed_returns = self.cts_eta * np.asarray(rewards) + (
                    1 - self.cts_eta) * mc_returns
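                # The mixing above, combined with the cts_eta * gamma * max-Q
                # bootstrap built in _double_dqn_op, yields the mixed
                # Monte-Carlo target eta * (r + gamma * max Q) + (1 - eta) * G.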

                #update replay memory
                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(states[i], actions[i],
                                              mixed_returns[i],
                                              i + 1 == episode_length,
                                              bonuses[i])

            #print("Vlow is: {}".format(Vlow))
            #print("q_upper values: {}".format(q_values_upper))

            #self.q_values_lower_max.append(Vlow)
            #self.q_values_lower_max.append(Vhigh)

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step,
                                   ep_t, episode_ave_max_q, episode_over,
                                   bonuses, total_augmented_reward,
                                   q_values_lower, q_values_upper)