import logging

import numpy as np
import tensorflow as tf

# NOTE: the stable-baselines import paths below follow the 2.x layout but are
# assumptions for this excerpt; BaseTFWrapper and small_convnet are
# project-local helpers whose modules are not shown here.
from stable_baselines.common import tf_util, tf_layers
from stable_baselines.common.buffers import ReplayBuffer
from stable_baselines.common.input import observation_input
from stable_baselines.common.running_mean_std import RunningMeanStd
from stable_baselines.common.vec_env import VecEnvWrapper


def test_extend_uniform():
    nvals = 16
    states = [np.random.rand(2, 2) for _ in range(nvals)]
    actions = [np.random.rand(2) for _ in range(nvals)]
    rewards = [np.random.rand() for _ in range(nvals)]
    newstate = [np.random.rand(2, 2) for _ in range(nvals)]
    done = [np.random.randint(0, 2) for _ in range(nvals)]
    size = 32

    baseline = ReplayBuffer(size)
    ext = ReplayBuffer(size)

    # Fill one buffer transition by transition, the other with a single
    # batched extend() call; both should end up with identical contents.
    for data in zip(states, actions, rewards, newstate, done):
        baseline.add(*data)

    states, actions, rewards, newstates, done = map(
        np.array, [states, actions, rewards, newstate, done])
    ext.extend(states, actions, rewards, newstates, done)

    assert len(baseline) == len(ext)

    # Check that both buffers hold the same values.
    for i in range(nvals):
        for j in range(5):
            condition = (baseline.storage[i][j] == ext.storage[i][j])
            if isinstance(condition, np.ndarray):
                # obs, action and obs_tp1 are arrays
                assert np.all(condition)
            else:
                # reward and done are scalars
                assert condition
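
# Conceptual sketch (not the wrapper's actual TF graph): the RND intrinsic
# reward computed by CuriosityWrapper below is the per-observation mean squared
# error between a fixed, randomly initialized target embedding and a trained
# predictor embedding, scaled by the running std of the discounted intrinsic
# return. The function name and arguments here are illustrative only.
def _rnd_intrinsic_reward_sketch(target_emb: np.ndarray,
                                 predictor_emb: np.ndarray,
                                 int_return_var: float,
                                 epsilon: float = 1e-8) -> np.ndarray:
    # Per-sample prediction error, averaged over the embedding dimension.
    error = np.mean(np.square(target_emb - predictor_emb), axis=1)
    # Normalize by the running std of the discounted intrinsic return.
    return error / np.sqrt(int_return_var + epsilon)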
class CuriosityWrapper(BaseTFWrapper):
    """
    Random Network Distillation (RND) curiosity reward.
    https://arxiv.org/abs/1810.12894

    :param env: (VecEnv) Vectorized environment to wrap.
    :param network: (str) Network type. Can be a "cnn" or a "mlp".
    :param intrinsic_reward_weight: (float) Weight for the intrinsic reward.
    :param buffer_size: (int) Size of the replay buffer used for predictor training.
    :param train_freq: (int) Frequency of predictor training in steps.
    :param gradient_steps: (int) Number of optimization epochs.
    :param batch_size: (int) Number of samples to draw from the replay buffer per optimization epoch.
    :param learning_starts: (int) Number of steps to wait before training the predictor for the first time.
    :param filter_end_of_episode: (bool) Whether or not to filter end-of-episode signals (dones).
    :param filter_reward: (bool) Whether or not to filter the extrinsic reward from the environment.
    :param norm_obs: (bool) Whether or not to normalize and clip observations for the target/predictor
        networks. Note that the observations returned by the wrapper are unaffected.
    :param norm_ext_reward: (bool) Whether or not to normalize the extrinsic reward.
    :param gamma: (float) Reward discount factor for intrinsic reward normalization.
    :param learning_rate: (float) Learning rate for the Adam optimizer of the predictor network.
    :param training: (bool) Whether or not to update the normalization statistics and train the predictor.
    """

    def __init__(self, env, network: str = "cnn", intrinsic_reward_weight: float = 1.0,
                 buffer_size: int = 65536, train_freq: int = 16384, gradient_steps: int = 4,
                 batch_size: int = 4096, learning_starts: int = 100,
                 filter_end_of_episode: bool = True, filter_reward: bool = False,
                 norm_obs: bool = True, norm_ext_reward: bool = True, gamma: float = 0.99,
                 learning_rate: float = 0.0001, training: bool = True, _init_setup_model=True):
        super().__init__(env, _init_setup_model)

        self.network_type = network
        self.buffer = ReplayBuffer(buffer_size)
        self.train_freq = train_freq
        self.gradient_steps = gradient_steps
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.intrinsic_reward_weight = intrinsic_reward_weight
        self.filter_end_of_episode = filter_end_of_episode
        self.filter_extrinsic_reward = filter_reward
        self.clip_obs = 5
        self.norm_obs = norm_obs
        self.norm_ext_reward = norm_ext_reward
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.training = training

        self.epsilon = 1e-8
        self.int_rwd_rms = RunningMeanStd(shape=(), epsilon=self.epsilon)
        self.ext_rwd_rms = RunningMeanStd(shape=(), epsilon=self.epsilon)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.int_ret = np.zeros(self.num_envs)  # discounted return for intrinsic reward
        self.ext_ret = np.zeros(self.num_envs)  # discounted return for extrinsic reward

        self.updates = 0
        self.steps = 0
        self.last_action = None
        self.last_obs = None
        self.last_update = 0

        self.graph = None
        self.sess = None
        self.observation_ph = None
        self.processed_obs = None
        self.predictor_network = None
        self.target_network = None
        self.params = None
        self.int_reward = None
        self.aux_loss = None
        self.optimizer = None
        self.training_op = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        self.graph = tf.Graph()

        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)

            self.observation_ph, self.processed_obs = observation_input(
                self.venv.observation_space, scale=(self.network_type == "cnn"))

            # Fixed, randomly initialized target network.
            with tf.variable_scope("target_model"):
                if self.network_type == 'cnn':
                    self.target_network = small_convnet(self.processed_obs, tf.nn.leaky_relu)
                elif self.network_type == 'mlp':
                    self.target_network = tf_layers.mlp(
                        self.processed_obs, [1024, 512])
                    self.target_network = tf_layers.linear(self.target_network, "out", 512)
                else:
                    raise ValueError("Unknown network type {}!".format(self.network_type))

            # Trained predictor network that tries to match the target embedding.
            with tf.variable_scope("predictor_model"):
                if self.network_type == 'cnn':
                    self.predictor_network = tf.nn.relu(
                        small_convnet(self.processed_obs, tf.nn.leaky_relu))
                elif self.network_type == 'mlp':
                    self.predictor_network = tf_layers.mlp(self.processed_obs, [1024, 512])
                    self.predictor_network = tf.nn.relu(
                        tf_layers.linear(self.predictor_network, "pred_fc1", 512))

                self.predictor_network = tf_layers.linear(self.predictor_network, "out", 512)

            with tf.name_scope("loss"):
                # The per-sample prediction error doubles as the intrinsic reward;
                # its scalar mean is the predictor training loss.
                self.int_reward = tf.reduce_mean(
                    tf.square(tf.stop_gradient(self.target_network) - self.predictor_network),
                    axis=1)
                self.aux_loss = tf.reduce_mean(
                    tf.square(tf.stop_gradient(self.target_network) - self.predictor_network))

            with tf.name_scope("train"):
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.training_op = self.optimizer.minimize(self.aux_loss)

            self.params = tf.trainable_variables()
            tf.global_variables_initializer().run(session=self.sess)

    def reset(self):
        obs = self.venv.reset()
        self.last_obs = obs
        return obs

    def step_async(self, actions):
        super().step_async(actions)
        self.last_action = actions
        self.steps += self.num_envs

    def step_wait(self):
        obs, rews, dones, infos = self.venv.step_wait()

        self.buffer.extend(self.last_obs, self.last_action, rews, obs, dones)

        if self.filter_extrinsic_reward:
            rews = np.zeros(rews.shape)
        if self.filter_end_of_episode:
            dones = np.zeros(dones.shape)

        if self.training:
            self.obs_rms.update(obs)

        obs_n = self.normalize_obs(obs)
        int_rew = self.sess.run(self.int_reward, {self.observation_ph: obs_n})

        if self.training:
            self._update_ext_reward_rms(rews)
            self._update_int_reward_rms(int_rew)

        intrinsic_reward = np.array(int_rew) / np.sqrt(self.int_rwd_rms.var + self.epsilon)
        if self.norm_ext_reward:
            extrinsic_reward = np.array(rews) / np.sqrt(self.ext_rwd_rms.var + self.epsilon)
        else:
            extrinsic_reward = rews
        reward = np.squeeze(extrinsic_reward + self.intrinsic_reward_weight * intrinsic_reward)

        if (self.training and self.steps > self.learning_starts
                and self.steps - self.last_update > self.train_freq):
            self.updates += 1
            self.last_update = self.steps
            self.learn()

        return obs, reward, dones, infos

    def close(self):
        VecEnvWrapper.close(self)

    def learn(self):
        total_loss = 0
        for _ in range(self.gradient_steps):
            obs_batch, act_batch, rews_batch, next_obs_batch, done_mask = self.buffer.sample(
                self.batch_size)
            obs_batch = self.normalize_obs(obs_batch)
            _, loss = self.sess.run([self.training_op, self.aux_loss],
                                    {self.observation_ph: obs_batch})
            total_loss += loss
        logging.info("Trained predictor. Avg loss: {}".format(total_loss / self.gradient_steps))

    def _update_int_reward_rms(self, reward: np.ndarray) -> None:
        """Update the intrinsic reward normalization statistics."""
        self.int_ret = self.gamma * self.int_ret + reward
        self.int_rwd_rms.update(self.int_ret)

    def _update_ext_reward_rms(self, reward: np.ndarray) -> None:
        """Update the extrinsic reward normalization statistics."""
        self.ext_ret = self.gamma * self.ext_ret + reward
        self.ext_rwd_rms.update(self.ext_ret)

    def normalize_obs(self, obs: np.ndarray) -> np.ndarray:
        """
        Normalize observations using the observation statistics.
        Calling this method does not update the statistics.
""" if self.norm_obs: obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs) return obs def get_parameter_list(self): return self.params def save(self, save_path): #os.makedirs(os.path.dirname(save_path), exist_ok=True) #self.saver.save(self.sess, save_path) data = { 'network': self.network_type, 'intrinsic_reward_weight': self.intrinsic_reward_weight, 'buffer_size': self.buffer.buffer_size, 'train_freq': self.train_freq, 'gradient_steps': self.gradient_steps, 'batch_size': self.batch_size, 'learning_starts': self.learning_starts, 'filter_end_of_episode': self.filter_end_of_episode, 'filter_extrinsic_reward': self.filter_extrinsic_reward, 'norm_obs': self.norm_obs, 'norm_ext_reward': self.norm_ext_reward, 'gamma': self.gamma, 'learning_rate': self.learning_rate, 'int_rwd_rms': self.int_rwd_rms, 'ext_rwd_rms': self.ext_rwd_rms, 'obs_rms': self.obs_rms } params_to_save = self.get_parameters() self._save_to_file_zip(save_path, data=data, params=params_to_save)