    def __init__(self, act_dim, obs_dim, n_post_action,
                 obs_set_size, track_obs_set_unc_frequency,
                 x_ph, a_ph, ac_kwargs, dropout_rate,
                 logger_kwargs,
                 tf_var_scope_main='main', tf_var_scope_target='target',
                 tf_var_scope_rnd='random_net_distill'):

        self.act_dim = act_dim
        self.obs_dim = obs_dim
        self.n_post_action = n_post_action

        self.obs_set_size = obs_set_size
        self.obs_set_is_empty = True
        self.track_obs_set_unc_frequency = track_obs_set_unc_frequency

        self.tf_var_scope_main = tf_var_scope_main
        self.tf_var_scope_target = tf_var_scope_target
        self.tf_var_scope_rnd = tf_var_scope_rnd
        self.tf_var_scope_main_unc = 'main_uncertainty'
        self.tf_var_scope_target_unc = 'target_uncertainty'
        self.tf_var_scope_rnd_unc = 'rnd_uncertainty'

        # Create actor-critic and RND networks whose weights are loaded for posterior (dropout) sampling
        with tf.variable_scope(self.tf_var_scope_main_unc):
            self.x_ph = x_ph
            self.a_ph = a_ph
            # Actor-critic
            self.pi, _, self.pi_dropout_mask_generator, self.pi_dropout_mask_phs, \
            self.q1, _, self.q1_dropout_mask_generator, self.q1_dropout_mask_phs, self.q1_pi, _, \
            self.q2, _, self.q2_dropout_mask_generator, self.q2_dropout_mask_phs = mlp_actor_critic(x_ph, a_ph, **ac_kwargs,
                                                                                                    dropout_rate=dropout_rate)
        with tf.variable_scope(self.tf_var_scope_rnd_unc):
            # Random Network Distillation
            self.rnd_targ_act, \
            self.rnd_pred_act, _, \
            self.rnd_pred_act_dropout_mask_generator, self.rnd_pred_act_dropout_mask_phs, \
            self.rnd_targ_cri, \
            self.rnd_pred_cri, _, \
            self.rnd_pred_cri_dropout_mask_generator, self.rnd_pred_cri_dropout_mask_phs = random_net_distill(x_ph, a_ph,
                                                                                                              **ac_kwargs,
                                                                                                              dropout_rate=dropout_rate)
        self.dropout_masks_set_pi = self.pi_dropout_mask_generator.generate_dropout_mask(n_post_action)
        self.dropout_masks_set_q1 = self.q1_dropout_mask_generator.generate_dropout_mask(n_post_action)
        self.dropout_masks_set_q2 = self.q2_dropout_mask_generator.generate_dropout_mask(n_post_action)
        self.dropout_masks_set_rnd_act = self.rnd_pred_act_dropout_mask_generator.generate_dropout_mask(n_post_action)
        self.dropout_masks_set_rnd_cri = self.rnd_pred_cri_dropout_mask_generator.generate_dropout_mask(n_post_action)
        self.uncertainty_logger = Logger(output_fname='dropout_uncertainty.txt',
                                         **logger_kwargs)
        self.sample_logger = Logger(output_fname='dropout_sample_observation.txt',
                                    **logger_kwargs)

        self.delayed_dropout_masks_update = False
        self.delayed_dropout_masks_update_freq = 1000
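
    # --- Hedged sketch (not from the original source) ---
    # The masks generated above only matter once they are fed back into the
    # policy graph; the method below sketches how n_post_action approximate
    # posterior action samples could be drawn for a single observation. It
    # assumes a tf.Session `sess`, an observation `obs` of shape (obs_dim,),
    # and that dropout_masks_set_pi[i] lines up element-wise with
    # pi_dropout_mask_phs.
    def sample_post_actions_sketch(self, sess, obs):
        post_actions = np.zeros((self.n_post_action, self.act_dim))
        for i in range(self.n_post_action):
            feed_dict = {self.x_ph: obs.reshape(1, -1)}
            # one generated mask per dropout placeholder for the i-th draw
            for ph, mask in zip(self.pi_dropout_mask_phs,
                                self.dropout_masks_set_pi[i]):
                feed_dict[ph] = mask
            post_actions[i] = sess.run(self.pi, feed_dict=feed_dict)[0]
        return post_actions
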
    def __init__(self,
                 act_dim,
                 obs_dim,
                 n_post_action,
                 obs_set_size,
                 track_obs_set_unc_frequency,
                 pi,
                 x_ph,
                 a_ph,
                 pi_dropout_mask_phs,
                 pi_dropout_mask_generator,
                 rnd_targ_act,
                 rnd_pred_act,
                 rnd_targ_cri,
                 rnd_pred_cri,
                 logger_kwargs,
                 tf_var_scope_main='main',
                 tf_var_scope_target='target',
                 tf_var_scope_unc='uncertainty',
                 uncertainty_type='dropout'):
        self.act_dim = act_dim
        self.obs_dim = obs_dim
        self.n_post_action = n_post_action
        # policy
        self.pi = pi
        self.x_ph = x_ph
        self.a_ph = a_ph
        # dropout
        self.pi_dropout_mask_phs = pi_dropout_mask_phs
        self.pi_dropout_mask_generator = pi_dropout_mask_generator
        # rnd
        self.rnd_targ_act = rnd_targ_act
        self.rnd_pred_act = rnd_pred_act
        self.rnd_targ_cri = rnd_targ_cri
        self.rnd_pred_cri = rnd_pred_cri

        self.obs_set_size = obs_set_size
        self.obs_set_is_empty = True
        self.track_obs_set_unc_frequency = track_obs_set_unc_frequency

        self.tf_var_scope_main = tf_var_scope_main
        self.tf_var_scope_target = tf_var_scope_target
        self.tf_var_scope_unc = tf_var_scope_unc

        self.uncertainty_logger = Logger(
            output_fname='{}_uncertainty.txt'.format(uncertainty_type),
            **logger_kwargs)
        self.sample_logger = Logger(
            output_fname='{}_sample_observation.txt'.format(uncertainty_type),
            **logger_kwargs)
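
    # --- Hedged sketch (not from the original source) ---
    # Shows how the RND target/predictor pair stored above could yield an
    # uncertainty (novelty) estimate for one observation: the mean squared
    # prediction error between target and predictor outputs. `sess` and the
    # output shapes are assumptions for illustration only.
    def rnd_uncertainty_sketch(self, sess, obs):
        feed_dict = {self.x_ph: obs.reshape(1, -1)}
        targ, pred = sess.run([self.rnd_targ_act, self.rnd_pred_act],
                              feed_dict=feed_dict)
        # larger error -> the predictor has seen fewer similar observations
        return np.mean((targ - pred) ** 2)
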
Example #3
    def __init__(self,
                 memory_length,
                 input_dim=1,
                 output_dim=1,
                 hidden_sizes=[32],
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 hidden_activation=tf.keras.activations.relu,
                 output_activation=tf.keras.activations.linear,
                 logger_kwargs=None,
                 logger_file_name='learning_progress_log.txt'):
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.memory_length = memory_length
        self.memory_track_models = deque(maxlen=self.memory_length)
        self.memory_track_outputs = deque(maxlen=self.memory_length)
        # Define model holders
        self.input_ph = tf.placeholder(dtype=tf.float32,
                                       shape=(None, self.input_dim))
        for m_i in range(self.memory_length):
            self.memory_track_models.append(
                MLP(hidden_sizes + [output_dim],
                    hidden_activation=hidden_activation,
                    output_activation=output_activation))
            self.memory_track_outputs.append(self.memory_track_models[m_i](
                self.input_ph))
        # Define logger (guard against the logger_kwargs=None default)
        logger_kwargs = logger_kwargs or {}
        self.lp_logger = Logger(output_fname=logger_file_name, **logger_kwargs)
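
    # --- Hedged sketch (not from the original source) ---
    # One way the rolling window of model holders above could be read out as a
    # learning-progress signal: compare the prediction error of the oldest and
    # newest model in memory on the same batch. `sess`, the batch (x, y), the
    # use of mean squared error, and a module-level `np` import are assumptions
    # for illustration only.
    def learning_progress_sketch(self, sess, x, y):
        out_old, out_new = sess.run(
            [self.memory_track_outputs[0],    # oldest model in the window
             self.memory_track_outputs[-1]],  # newest model in the window
            feed_dict={self.input_ph: x})
        err_old = np.mean((out_old - y) ** 2)
        err_new = np.mean((out_new - y) ** 2)
        # positive value -> error dropped across the window (learning progress)
        return err_old - err_new
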
Example #4
    def __init__(self, obs_dim, act_dim, size,
                 logger_fname='experiences_log.txt', **logger_kwargs):
        # ExperienceLogger: save experiences for supervised learning
        logger_kwargs['output_fname'] = logger_fname
        self.experience_logger = Logger(**logger_kwargs)
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size
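
    # --- Hedged sketch of a matching `store` method (not in the snippet) ---
    # Follows the ring-buffer pattern implied by ptr/size/max_size above; the
    # method name and argument order are assumptions for illustration only.
    def store_sketch(self, obs, act, rew, next_obs, done):
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        # advance the write pointer and grow the buffer size up to max_size
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
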
def play_game(env,
              torch_load_kwargs={},
              actor_critic=CNNCritic,
              episodes=10,
              render=False,
              logger_kwargs={}):

    logger = Logger(**logger_kwargs)
    logger.save_config(locals())

    ac = actor_critic(env.observation_space, env.action_space)

    # model saved on GPU, load on CPU: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-across-devices
    ac_saved = torch.load(**torch_load_kwargs)
    ac_saved = ac_saved.to(device)
    ac.q.load_state_dict(ac_saved.q.module.state_dict())
    ac.q.to(device)

    avg_ret = 0
    game = 0

    for ep in range(episodes):
        o, ep_ret, ep_len, d = env.reset(), 0, 0, False
        while not d:
            if render:
                env.render()
            o = torch.as_tensor(o, dtype=torch.float32, device=device)
            o2, r, d, info = env.step(ac.act(o))
            ep_ret += r
            ep_len += 1
            o = o2

        print(f'Returns for episode {ep}: {ep_ret}')
        avg_ret += (1. / (ep + 1)) * (ep_ret - avg_ret)

        lives = info.get('ale.lives')
        if lives is not None and lives == 0:
            raw_rew = env.get_episode_rewards()[-1]
            raw_len = env.get_episode_lengths()[-1]
            logger.log_tabular('RawRet', raw_rew)
            logger.log_tabular('RawLen', raw_len)
            logger.log_tabular('GameId', game)
            wandb.log(logger.log_current_row)
            logger.dump_tabular()
            game += 1

    print('Average raw returns:', np.mean(env.get_episode_rewards()))
    print(f'Avg returns={avg_ret} over {episodes} episodes')
    env.close()
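
# --- Hedged usage sketch (not from the original source) ---
# Example invocation of play_game, assuming an Atari-style Gym environment
# wrapped with gym.wrappers.Monitor (which provides get_episode_rewards() /
# get_episode_lengths()), a prior wandb.init(), a module-level `device`, and a
# Logger that accepts output_dir. The env id, checkpoint path, and output
# directories below are placeholders, not values from the source.
if __name__ == '__main__':
    import gym
    env = gym.wrappers.Monitor(gym.make('PongNoFrameskip-v4'),
                               directory='./eval_monitor', force=True)
    play_game(env,
              torch_load_kwargs=dict(f='checkpoint.pt', map_location='cpu'),
              episodes=5,
              render=False,
              logger_kwargs=dict(output_dir='./eval_logs'))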