def __init__(self, args):
    """Initialize the base A3C learner and attach a CTS density model.

    The density model produces per-frame novelty bonuses used as an
    exploration signal during training.
    """
    super(PseudoCountA3CLearner, self).__init__(args)

    # NOTE: more cython tuning of the density model could be useful here
    density_kwargs = {
        'height': args.cts_rescale_dim,
        'width': args.cts_rescale_dim,
        'num_bins': args.cts_bins,
        'beta': 0.05,
    }
    self.density_model = CTSDensityModel(**density_kwargs)
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        #more cython tuning could useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)
# Exemplo n.º 3 (example marker from the source listing; stray "0" artifact removed)
 def _init_density_model(self, args):
     """Build the pixel-density model selected by ``args.density_model``.

     ``'cts'`` selects :class:`CTSDensityModel`; anything else falls back
     to :class:`PerPixelDensityModel`.
     """
     model_cls = (CTSDensityModel if args.density_model == 'cts'
                  else PerPixelDensityModel)
     self.density_model = model_cls(height=args.cts_rescale_dim,
                                    width=args.cts_rescale_dim,
                                    num_bins=args.cts_bins,
                                    beta=args.cts_beta)
    def _init_density_model(self, args):
        """Configure density-model refresh bookkeeping and build the model.

        The model is refreshed every ``20 * q_target_update_steps`` steps;
        per-actor update flags come straight from ``args``.
        """
        self.density_model_update_steps = 20 * args.q_target_update_steps
        self.density_model_update_flags = args.density_model_update_flags

        if args.density_model == 'cts':
            model_cls = CTSDensityModel
        else:
            model_cls = PerPixelDensityModel
        self.density_model = model_cls(height=args.cts_rescale_dim,
                                       width=args.cts_rescale_dim,
                                       num_bins=args.cts_bins,
                                       beta=args.cts_beta)
    def _init_density_model(self, args):
        """Build one density model (and one update-flag slot) per action.

        This variant keeps a separate pixel-density model for each discrete
        action so novelty can be tracked per action.
        """
        self.density_model_update_steps = 20 * args.q_target_update_steps
        self.alg_type = args.alg_type
        # One slot per action; every slot references the same shared flags
        # object, exactly as the original append loop did.
        self.density_model_update_flags = (
            [args.density_model_update_flags] * args.num_actions)

        model_args = {
            'height': args.cts_rescale_dim,
            'width': args.cts_rescale_dim,
            'num_bins': args.cts_bins,
            'beta': args.cts_beta,
        }
        # Choose the model class once instead of re-testing the condition
        # on every loop iteration.
        if args.density_model == 'cts':
            model_cls = CTSDensityModel
        else:
            model_cls = PerPixelDensityModel
        # A comprehension (not list multiplication) so each action gets an
        # independent model instance.
        self.density_model = [
            model_cls(**model_args) for _ in range(args.num_actions)
        ]
class PseudoCountA3CLearner(A3CLearner):
    def __init__(self, args):
        super(PseudoCountA3CLearner, self).__init__(args)

        #more cython tuning could useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def _run(self):
        if not self.is_train:
            return self.test()
        """ Main actor learner loop for advantage actor critic learning. """
        logger.debug("Actor {} resuming at Step {}".format(
            self.actor_id, self.global_step.value()))

        s = self.emulator.get_initial_state()
        total_episode_reward = 0.0
        mean_entropy = 0.0
        episode_start_step = 0

        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            local_step_start = self.local_step

            reset_game = False
            episode_over = False

            bonuses = deque(maxlen=100)
            rewards = list()
            states = list()
            actions = list()
            values = list()
            s_batch = list()
            a_batch = list()
            y_batch = list()
            adv_batch = list()

            while not (episode_over or (self.local_step - local_step_start
                                        == self.max_local_steps)):

                # Choose next action and execute it
                a, readout_v_t, readout_pi_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug(
                        'π_a={:.4f} / V={:.4f} / Mean Bonus={:.4f} / Max Bonus={:.4f}'
                        .format(readout_pi_t[a.argmax()], readout_v_t,
                                bonus_array.mean(), bonus_array.max()))

                # Rescale or clip immediate reward
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)

                rewards.append(reward)
                states.append(s)
                actions.append(a)
                values.append(readout_v_t)

                s = new_s
                self.local_step += 1
                self.global_step.increment()

            # Calculate the value offered by critic in the new state.
            if episode_over:
                R = 0
            else:
                R = self.session.run(
                    self.local_network.output_layer_v,
                    feed_dict={self.local_network.input_ph: [new_s]})[0][0]

            sel_actions = []
            for i in reversed(xrange(len(states))):
                R = rewards[i] + self.gamma * R

                y_batch.append(R)
                a_batch.append(actions[i])
                s_batch.append(states[i])
                adv_batch.append(R - values[i])

                sel_actions.append(np.argmax(actions[i]))

            # Compute gradients on the local policy/V network and apply them to shared memory
            feed_dict = {
                self.local_network.input_ph: s_batch,
                self.local_network.critic_target_ph: y_batch,
                self.local_network.selected_action_ph: a_batch,
                self.local_network.adv_actor_ph: adv_batch,
            }
            grads, entropy = self.session.run(
                [self.local_network.get_gradients, self.local_network.entropy],
                feed_dict=feed_dict)

            self.apply_gradients_to_shared_memory_vars(grads)

            delta_old = local_step_start - episode_start_step
            delta_new = self.local_step - local_step_start
            mean_entropy = (mean_entropy * delta_old +
                            entropy * delta_new) / (delta_old + delta_new)

            s, mean_entropy, episode_start_step, total_episode_reward, _ = self.prepare_state(
                s, mean_entropy, episode_start_step, total_episode_reward,
                self.local_step, sel_actions, episode_over)
class PseudoCountQLearner(ValueBasedLearner):
    """n-step Q learner with CTS pseudo-count exploration bonuses.

    Rewards are mixed with Monte-Carlo returns (weighted by ``cts_eta``),
    stored in replay memory, and used for off-policy minibatch updates.
    """

    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        # Mixing weight between one-step rewards and Monte-Carlo returns.
        self.cts_eta = .9
        # Minibatch size for replay-based updates.
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        #more cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def generate_final_epsilon(self):
        """Final epsilon for the exploration schedule (fixed at 0.1)."""
        return 0.1

    def _get_summary_vars(self):
        """Extend the base summary variables with novelty-bonus percentiles.

        Returns the base vars plus tf.Variables holding the 25th, 50th and
        75th percentile of recent novelty bonuses.
        """
        q_vars = super(PseudoCountQLearner, self)._get_summary_vars()

        bonus_q25 = tf.Variable(0., name='novelty_bonus_q25')
        s1 = tf.summary.scalar('Novelty_Bonus_q25_{}'.format(self.actor_id),
                               bonus_q25)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q75 = tf.Variable(0., name='novelty_bonus_q75')
        s3 = tf.summary.scalar('Novelty_Bonus_q75_{}'.format(self.actor_id),
                               bonus_q75)

        # NOTE(review): s1-s3 are unused locals; presumably the scalar ops
        # are picked up from TF's default summary collection -- confirm.
        return q_vars + [bonus_q25, bonus_q50, bonus_q75]

    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses):
        """End-of-episode bookkeeping: logging, summaries, and game reset.

        Forces the episode over (and a game reset) if no reward was seen for
        5000 local steps or all lives are gone in a multi-life game.
        Returns the (possibly reset) state and zeroed episode counters.
        """
        # prevent the agent from getting stuck
        reset_game = False
        if (self.local_step - steps_at_last_reward > 5000
                or (self.emulator.get_lives() == 0
                    and self.emulator.game not in ONE_LIFE_GAMES)):

            steps_at_last_reward = self.local_step
            episode_over = True
            reset_game = True

        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value()
            t = self.local_step
            # NOTE(review): e_prog is computed but never used below.
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            # Keep a window of the 100 most recent episode scores.
            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()

            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))

            # Only the master actor writes TensorBoard summaries.
            if self.is_master() and self.is_train:
                stats = [
                    total_episode_reward,
                    episode_ave_max_q,
                    self.epsilon,
                    np.percentile(bonuses, 25),
                    np.percentile(bonuses, 50),
                    np.percentile(bonuses, 75),
                ]
                feed_dict = {
                    self.summary_ph[i]: stats[i]
                    for i in range(len(stats))
                }
                res = self.session.run(self.update_ops + [self.summary_op],
                                       feed_dict=feed_dict)
                self.summary_writer.add_summary(res[-1],
                                                self.global_step.value())

            if reset_game or self.emulator.game in ONE_LIFE_GAMES:
                state = self.emulator.get_initial_state()

            # Reset per-episode counters.
            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over

    def batch_update(self):
        """One replay minibatch update of the local Q network.

        Bootstraps targets from the target network with an eta-scaled
        discount, then applies the resulting gradients to shared memory.
        No-op until the replay memory holds at least one full batch.
        """
        if len(self.replay_memory) < self.batch_size:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        q_target_values = self.session.run(
            self.target_network.output_layer,
            feed_dict={self.target_network.input_ph: s_f})
        # NOTE(review): np.int is a deprecated alias (removed in NumPy 1.24);
        # also confirm the eta-scaled discount matches the mixed-return
        # bootstrapping used in _run.
        y_target = r_i + self.cts_eta * self.gamma * q_target_values.max(
            axis=1) * (1 - is_terminal.astype(np.int))

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)

        self.apply_gradients_to_shared_memory_vars(grads)

    def _run(self):
        """ Main actor learner loop for n-step Q learning. """
        if not self.is_train:
            return self.test()

        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        # NOTE(review): s_batch/a_batch/y_batch are never used in this loop.
        s_batch = []
        a_batch = []
        y_batch = []
        # Rolling window of recent novelty bonuses (logging/summaries).
        bonuses = deque(maxlen=100)

        exec_update_target = False
        total_episode_reward = 0
        episode_ave_max_q = 0
        episode_over = False
        qmax_down = 0
        qmax_up = 0
        # NOTE(review): -10 * 6 evaluates to -60; was -10 ** 6 intended?
        prev_qmax = -10 * 6
        low_qmax = 0
        ep_t = 0

        t0 = time.time()
        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            rewards = []
            states = []
            actions = []
            local_step_start = self.local_step

            while not episode_over:
                # Choose next action and execute it
                a, readout_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                # Novelty bonus from the density model over the newest frame.
                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               100. / (time.time() - t0)))
                    t0 = time.time()

                # Rescale or clip immediate reward
                # NOTE(review): reward is rescaled, the bonus added, then the
                # sum rescaled again -- confirm the double rescale is intended.
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += np.max(readout_t)

                global_step, update_target = self.global_step.increment(
                    self.q_target_update_steps)

                # Defer the actual target-network sync until after the episode.
                if update_target:
                    update_target = False
                    exec_update_target = True

                # Replay minibatch update every 4 local steps.
                if self.local_step % 4 == 0:
                    self.batch_update()

                self.local_network.global_step = global_step

            # The while/else always runs here (the loop has no break):
            # compute Monte-Carlo returns and fill the replay memory.
            else:
                mc_returns = list()
                running_total = 0.0
                for r in reversed(rewards):
                    running_total = r + self.gamma * running_total
                    mc_returns.insert(0, running_total)

                # Blend immediate rewards with MC returns via cts_eta.
                mixed_returns = self.cts_eta * np.array(rewards) + (
                    1 - self.cts_eta) * np.array(mc_returns)

                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(
                        (states[i], actions[i], mixed_returns[i],
                         states[i + 1], i + 1 == episode_length))

            if exec_update_target:
                self.update_target()
                exec_update_target = False
                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses)
# Exemplo n.º 8 (example marker from the source listing; stray "0" artifact removed)
class PseudoCountQLearner(ValueBasedLearner):
    """n-step Q learner variant with a CTS density model.

    Unlike the bonus-adding variant, this version logs novelty bonuses but
    does NOT add them to the training reward (that line is commented out).
    """

    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        # Mixing weight between one-step rewards and Monte-Carlo returns.
        self.cts_eta = .9
        # Minibatch size for replay-based updates.
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        #more cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def generate_final_epsilon(self):
        """Final epsilon for the exploration schedule (fixed at 0.1)."""
        return 0.1

    def batch_update(self):
        """One replay minibatch update of the local Q network.

        Bootstraps targets from the target network with an eta-scaled
        discount, then applies the gradients to shared memory.  No-op until
        the replay memory holds at least one full batch.
        """
        if len(self.replay_memory) < self.batch_size:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        q_target_values = self.session.run(
            self.target_network.output_layer,
            feed_dict={self.target_network.input_ph: s_f})
        # NOTE(review): np.int is a deprecated alias (removed in NumPy 1.24).
        y_target = r_i + self.cts_eta * self.gamma * q_target_values.max(
            axis=1) * (1 - is_terminal.astype(np.int))

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)

        self.apply_gradients_to_shared_memory_vars(grads)

    def _run(self):
        """ Main actor learner loop for n-step Q learning. """
        if not self.is_train:
            return self.test()

        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        # NOTE(review): s_batch/a_batch/y_batch are never used in this loop.
        s_batch = []
        a_batch = []
        y_batch = []
        # Rolling window of recent novelty bonuses (logging only).
        bonuses = deque(maxlen=100)

        exec_update_target = False
        total_episode_reward = 0
        episode_ave_max_q = 0
        episode_over = False
        qmax_down = 0
        qmax_up = 0
        # NOTE(review): -10 * 6 evaluates to -60; was -10 ** 6 intended?
        prev_qmax = -10 * 6
        low_qmax = 0
        ep_t = 0

        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            rewards = []
            states = []
            actions = []
            local_step_start = self.local_step

            while not episode_over:
                # Choose next action and execute it
                a, readout_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                # Novelty bonus from the density model over the newest frame.
                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug('Mean Bonus={:.4f} / Max Bonus={:.4f}'.format(
                        bonus_array.mean(), bonus_array.max()))

                # Rescale or clip immediate reward
                # (bonus intentionally left out of the training reward here)
                # reward = self.rescale_reward(self.rescale_reward(reward) + bonus)
                reward = self.rescale_reward(reward)
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += np.max(readout_t)

                global_step, update_target = self.global_step.increment(
                    self.q_target_update_steps)

                # Defer the actual target-network sync until after the episode.
                if update_target:
                    update_target = False
                    exec_update_target = True

                # Replay minibatch update every 4 local steps.
                if self.local_step % 4 == 0:
                    self.batch_update()

                self.local_network.global_step = global_step

            # The while/else always runs here (the loop has no break):
            # compute Monte-Carlo returns and fill the replay memory.
            else:
                mc_returns = list()
                running_total = 0.0
                for r in reversed(rewards):
                    running_total = r + self.gamma * running_total
                    mc_returns.insert(0, running_total)

                # Blend immediate rewards with MC returns via cts_eta.
                mixed_returns = self.cts_eta * np.array(rewards) + (
                    1 - self.cts_eta) * np.array(mc_returns)

                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(
                        (states[i], actions[i], mixed_returns[i],
                         states[i + 1], i + 1 == episode_length))

            if exec_update_target:
                self.update_target()
                exec_update_target = False
                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0

            # prepare_state here is the 6-argument base-class version
            # (no bonuses parameter) -- inherited from ValueBasedLearner.
            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over)