class ReplayActor(object): """A replay buffer shard. Ray actors are single-threaded, so for scalability multiple replay actors may be created to increase parallelism.""" def __init__(self, num_shards, learning_starts, buffer_size, train_batch_size, prioritized_replay_alpha, prioritized_replay_beta, prioritized_replay_eps): self.replay_starts = learning_starts // num_shards self.buffer_size = buffer_size // num_shards self.train_batch_size = train_batch_size self.prioritized_replay_beta = prioritized_replay_beta self.prioritized_replay_eps = prioritized_replay_eps self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=prioritized_replay_alpha) # Metrics self.add_batch_timer = TimerStat() self.replay_timer = TimerStat() self.update_priorities_timer = TimerStat() def get_host(self): return os.uname()[1] def add_batch(self, batch): PolicyOptimizer._check_not_multiagent(batch) with self.add_batch_timer: for row in batch.rows(): self.replay_buffer.add(row["obs"], row["actions"], row["rewards"], row["new_obs"], row["dones"], row["weights"]) def replay(self): with self.replay_timer: if len(self.replay_buffer) < self.replay_starts: return None (obses_t, actions, rewards, obses_tp1, dones, weights, batch_indexes) = self.replay_buffer.sample( self.train_batch_size, beta=self.prioritized_replay_beta) batch = SampleBatch({ "obs": obses_t, "actions": actions, "rewards": rewards, "new_obs": obses_tp1, "dones": dones, "weights": weights, "batch_indexes": batch_indexes }) return batch def update_priorities(self, batch_indexes, td_errors): with self.update_priorities_timer: new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps) self.replay_buffer.update_priorities(batch_indexes, new_priorities) def stats(self): stat = { "add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3), "replay_time_ms": round(1000 * self.replay_timer.mean, 3), "update_priorities_time_ms": round(1000 * self.update_priorities_timer.mean, 3), } stat.update(self.replay_buffer.stats()) return stat
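
# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original module): it
# shows how ReplayActor shards might be created as Ray actors and driven
# through one sample/update cycle. The hyperparameter values and the
# `compute_td_errors` hook are hypothetical placeholders.
# ---------------------------------------------------------------------------
def _example_replay_shard_usage(num_shards=4):
    import ray

    ray.init(ignore_reinit_error=True)

    # Wrap the class as a Ray actor and start one shard per desired degree
    # of parallelism. buffer_size and learning_starts are totals; each shard
    # divides them by num_shards internally.
    RemoteReplayActor = ray.remote(ReplayActor)
    replay_actors = [
        RemoteReplayActor.remote(
            num_shards=num_shards,
            learning_starts=50000,          # assumed value
            buffer_size=2000000,            # assumed value
            train_batch_size=512,           # assumed value
            prioritized_replay_alpha=0.6,   # assumed value
            prioritized_replay_beta=0.4,    # assumed value
            prioritized_replay_eps=1e-6)    # assumed value
        for _ in range(num_shards)
    ]

    # One sample/update cycle against the first shard. replay() returns None
    # until the shard has buffered at least learning_starts // num_shards
    # transitions.
    batch = ray.get(replay_actors[0].replay.remote())
    if batch is not None:
        td_errors = compute_td_errors(batch)  # hypothetical learner hook
        replay_actors[0].update_priorities.remote(
            batch["batch_indexes"], td_errors)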