def __init__(self, act_dim, obs_dim, n_post_action,
             obs_set_size, track_obs_set_unc_frequency,
             x_ph, a_ph, ac_kwargs, dropout_rate, logger_kwargs,
             tf_var_scope_main='main', tf_var_scope_target='target',
             tf_var_scope_rnd='random_net_distill'):
    self.act_dim = act_dim
    self.obs_dim = obs_dim
    self.n_post_action = n_post_action
    self.obs_set_size = obs_set_size
    self.obs_set_is_empty = True
    self.track_obs_set_unc_frequency = track_obs_set_unc_frequency

    self.tf_var_scope_main = tf_var_scope_main
    self.tf_var_scope_target = tf_var_scope_target
    self.tf_var_scope_rnd = tf_var_scope_rnd
    self.tf_var_scope_main_unc = 'main_uncertainty'
    self.tf_var_scope_target_unc = 'target_uncertainty'
    self.tf_var_scope_rnd_unc = 'rnd_uncertainty'

    # Create a separate actor-critic and RND copy whose weights are loaded
    # from the main networks for posterior sampling.
    with tf.variable_scope(self.tf_var_scope_main_unc):
        self.x_ph = x_ph
        self.a_ph = a_ph
        # Actor-critic
        self.pi, _, self.pi_dropout_mask_generator, self.pi_dropout_mask_phs, \
            self.q1, _, self.q1_dropout_mask_generator, self.q1_dropout_mask_phs, self.q1_pi, _, \
            self.q2, _, self.q2_dropout_mask_generator, self.q2_dropout_mask_phs = \
            mlp_actor_critic(x_ph, a_ph, **ac_kwargs, dropout_rate=dropout_rate)
    with tf.variable_scope(self.tf_var_scope_rnd_unc):
        # Random Network Distillation
        self.rnd_targ_act, \
            self.rnd_pred_act, _, \
            self.rnd_pred_act_dropout_mask_generator, self.rnd_pred_act_dropout_mask_phs, \
            self.rnd_targ_cri, \
            self.rnd_pred_cri, _, \
            self.rnd_pred_cri_dropout_mask_generator, self.rnd_pred_cri_dropout_mask_phs = \
            random_net_distill(x_ph, a_ph, **ac_kwargs, dropout_rate=dropout_rate)

    # Pre-generate one set of dropout masks per posterior sample.
    self.dropout_masks_set_pi = self.pi_dropout_mask_generator.generate_dropout_mask(n_post_action)
    self.dropout_masks_set_q1 = self.q1_dropout_mask_generator.generate_dropout_mask(n_post_action)
    self.dropout_masks_set_q2 = self.q2_dropout_mask_generator.generate_dropout_mask(n_post_action)
    self.dropout_masks_set_rnd_act = self.rnd_pred_act_dropout_mask_generator.generate_dropout_mask(n_post_action)
    self.dropout_masks_set_rnd_cri = self.rnd_pred_cri_dropout_mask_generator.generate_dropout_mask(n_post_action)

    self.uncertainty_logger = Logger(output_fname='dropout_uncertainty.txt', **logger_kwargs)
    self.sample_logger = Logger(output_fname='dropout_sample_observation.txt', **logger_kwargs)

    self.delayed_dropout_masks_update = False
    self.delayed_dropout_masks_update_freq = 1000
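# --- Hedged sketch, not part of the original class ---
# One way the mask sets built above could be used: pair each of the
# n_post_action dropout masks with a forward pass of pi to draw posterior
# action samples (MC-dropout). The `sess` argument, and the assumption that
# generate_dropout_mask returns one list of mask values per sample (one
# entry per placeholder in pi_dropout_mask_phs), are mine, not the original
# author's; numpy is assumed imported as np, as elsewhere in this file.
def sample_post_actions(self, sess, obs):
    actions = np.zeros((self.n_post_action, self.act_dim))
    for i in range(self.n_post_action):
        feed_dict = {self.x_ph: obs.reshape(1, -1)}
        for mask_ph, mask in zip(self.pi_dropout_mask_phs, self.dropout_masks_set_pi[i]):
            feed_dict[mask_ph] = mask
        actions[i] = sess.run(self.pi, feed_dict=feed_dict)[0]
    return actions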
def __init__(self, act_dim, obs_dim, n_post_action,
             obs_set_size, track_obs_set_unc_frequency,
             pi, x_ph, a_ph, pi_dropout_mask_phs, pi_dropout_mask_generator,
             rnd_targ_act, rnd_pred_act, rnd_targ_cri, rnd_pred_cri,
             logger_kwargs,
             tf_var_scope_main='main', tf_var_scope_target='target',
             tf_var_scope_unc='uncertainty', uncertainty_type='dropout'):
    self.act_dim = act_dim
    self.obs_dim = obs_dim
    self.n_post_action = n_post_action
    # Policy
    self.pi = pi
    self.x_ph = x_ph
    self.a_ph = a_ph
    # Dropout
    self.pi_dropout_mask_phs = pi_dropout_mask_phs
    self.pi_dropout_mask_generator = pi_dropout_mask_generator
    # RND
    self.rnd_targ_act = rnd_targ_act
    self.rnd_pred_act = rnd_pred_act
    self.rnd_targ_cri = rnd_targ_cri
    self.rnd_pred_cri = rnd_pred_cri

    self.obs_set_size = obs_set_size
    self.obs_set_is_empty = True
    self.track_obs_set_unc_frequency = track_obs_set_unc_frequency

    self.tf_var_scope_main = tf_var_scope_main
    self.tf_var_scope_target = tf_var_scope_target
    self.tf_var_scope_unc = tf_var_scope_unc

    self.uncertainty_logger = Logger(
        output_fname='{}_uncertainty.txt'.format(uncertainty_type), **logger_kwargs)
    self.sample_logger = Logger(
        output_fname='{}_sample_observation.txt'.format(uncertainty_type), **logger_kwargs)
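# --- Hedged sketch, not part of the original class ---
# RND measures novelty as the disagreement between a fixed random target
# network and a trained predictor. A minimal per-observation uncertainty
# readout under that interpretation; the `sess` argument and the
# mean-squared-error reduction are my assumptions.
def rnd_action_uncertainty(self, sess, obs):
    targ, pred = sess.run([self.rnd_targ_act, self.rnd_pred_act],
                          feed_dict={self.x_ph: obs.reshape(1, -1)})
    return np.mean(np.square(targ - pred))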
def __init__(self, memory_length, input_dim=1, output_dim=1, hidden_sizes=[32],
             kernel_initializer='glorot_uniform', bias_initializer='zeros',
             hidden_activation=tf.keras.activations.relu,
             output_activation=tf.keras.activations.linear,
             logger_kwargs=None, logger_file_name='learning_progress_log.txt'):
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.memory_length = memory_length
    self.memory_track_models = deque(maxlen=self.memory_length)
    self.memory_track_outputs = deque(maxlen=self.memory_length)

    # Define model holders: one MLP (and its output op) per remembered
    # snapshot, all fed from a shared input placeholder.
    self.input_ph = tf.placeholder(dtype=tf.float32, shape=(None, self.input_dim))
    for m_i in range(self.memory_length):
        self.memory_track_models.append(
            MLP(hidden_sizes + [output_dim],
                hidden_activation=hidden_activation,
                output_activation=output_activation))
        self.memory_track_outputs.append(self.memory_track_models[m_i](self.input_ph))

    # Define logger (guard against the None default before unpacking)
    logger_kwargs = logger_kwargs or {}
    self.lp_logger = Logger(output_fname=logger_file_name, **logger_kwargs)
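# --- Hedged usage sketch, not in the original source ---
# Assuming this __init__ belongs to a class named LearningProgress (the name
# is hypothetical) and that Logger follows Spinning Up's
# (output_dir, output_fname, exp_name) signature:
lp = LearningProgress(memory_length=10, input_dim=4, output_dim=2,
                      hidden_sizes=[32, 32],
                      logger_kwargs=dict(output_dir='lp_logs'))
# lp.memory_track_outputs now holds one output tensor per remembered model
# snapshot, all evaluable from the shared lp.input_ph placeholder.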
def __init__(self, obs_dim, act_dim, size, logger_fname='experiences_log.txt', **logger_kwargs):
    # ExperienceLogger: save experiences for supervised learning
    logger_kwargs['output_fname'] = logger_fname
    self.experience_logger = Logger(**logger_kwargs)
    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
    self.rews_buf = np.zeros(size, dtype=np.float32)
    self.done_buf = np.zeros(size, dtype=np.float32)
    self.ptr, self.size, self.max_size = 0, 0, size
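# --- Hedged sketch, not part of the original class ---
# The buffer layout above implies the standard Spinning Up-style circular
# store shown here. Whether and how the original also calls
# self.experience_logger inside store() is unknown, so that part is omitted.
def store(self, obs, act, rew, next_obs, done):
    self.obs1_buf[self.ptr] = obs
    self.obs2_buf[self.ptr] = next_obs
    self.acts_buf[self.ptr] = act
    self.rews_buf[self.ptr] = rew
    self.done_buf[self.ptr] = done
    self.ptr = (self.ptr + 1) % self.max_size
    self.size = min(self.size + 1, self.max_size)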
def play_game(env, torch_load_kwargs={}, actor_critic=CNNCritic, episodes=10,
              render=False, logger_kwargs={}):
    logger = Logger(**logger_kwargs)
    logger.save_config(locals())

    ac = actor_critic(env.observation_space, env.action_space)
    # Model was saved on GPU; load it across devices:
    # https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-across-devices
    ac_saved = torch.load(**torch_load_kwargs)
    ac_saved = ac_saved.to(device)
    ac.q.load_state_dict(ac_saved.q.module.state_dict())
    ac.q.to(device)

    avg_ret = 0
    avg_raw_ret = 0
    game = 0
    for ep in range(episodes):
        o, ep_ret, ep_len, d, raw_ret = env.reset(), 0, 0, False, 0
        while not d:
            if render:
                env.render()
            o = torch.as_tensor(o, dtype=torch.float32, device=device)
            o2, r, d, info = env.step(ac.act(o))
            ep_ret += r
            ep_len += 1
            o = o2
        print(f'Returns for episode {ep}: {ep_ret}')
        # Incremental (running) mean of episode returns
        avg_ret += (1. / (ep + 1)) * (ep_ret - avg_ret)

        lives = info.get('ale.lives')
        if lives is not None and lives == 0:
            # A full Atari game (all lives used) just ended; log its raw score.
            raw_rew = env.get_episode_rewards()[-1]
            raw_len = env.get_episode_lengths()[-1]
            logger.log_tabular('RawRet', raw_rew)
            logger.log_tabular('RawLen', raw_len)
            logger.log_tabular('GameId', game)
            wandb.log(logger.log_current_row)
            logger.dump_tabular()
            game += 1

    print('Average raw returns:', np.mean(env.get_episode_rewards()))
    print(f'Avg returns={avg_ret} over {episodes} episodes')
    env.close()
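# --- Hedged usage sketch, not in the original source ---
# play_game relies on env.get_episode_rewards()/get_episode_lengths() and the
# 'ale.lives' info key, so the env is presumably an Atari env wrapped in
# gym.wrappers.Monitor. The env id, checkpoint path, and output dirs below
# are placeholders, not values from the original code.
import gym

env = gym.wrappers.Monitor(gym.make('PongNoFrameskip-v4'), 'eval_monitor')
play_game(env,
          torch_load_kwargs=dict(f='checkpoints/model.pt', map_location='cpu'),
          episodes=5,
          logger_kwargs=dict(output_dir='eval_logs'))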