def __init__(self, seed, state_dim, action_dim, lr=3e-4, gamma=0.99, tau=5e-3,
             batchsize=256, hidden_size=256, update_interval=1,
             buffer_size=int(1e6), target_entropy=None):
    self.gamma = gamma
    self.tau = tau
    self.target_entropy = target_entropy if target_entropy is not None else -action_dim
    self.batchsize = batchsize
    self.update_interval = update_interval

    torch.manual_seed(seed)

    # aka critic
    self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
    self.target_q_funcs = copy.deepcopy(self.q_funcs)
    self.target_q_funcs.eval()
    for p in self.target_q_funcs.parameters():
        p.requires_grad = False

    # aka actor
    self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)

    # aka temperature
    self.log_alpha = torch.zeros(1, requires_grad=True, device=device)

    self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)
    self.temp_optimizer = torch.optim.Adam([self.log_alpha], lr=lr)

    # use the buffer_size argument rather than a hard-coded capacity
    self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim,
                                  capacity=int(buffer_size))
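# --- illustrative sketch, not part of the original file ---
# A minimal example of how `tau` and `log_alpha` set in __init__ are typically
# used in SAC-style agents: Polyak averaging of the frozen target critic and
# recovering the entropy temperature from its learnable log value. The method
# and property names below are hypothetical; `torch` is assumed to be imported
# as in the surrounding module.
def _soft_update_targets(self) -> None:
    """Polyak-average the online critic parameters into the frozen target critic."""
    with torch.no_grad():
        for p, p_targ in zip(self.q_funcs.parameters(), self.target_q_funcs.parameters()):
            p_targ.data.mul_(1.0 - self.tau)
            p_targ.data.add_(self.tau * p.data)

@property
def alpha(self) -> torch.Tensor:
    """Entropy temperature, kept positive by parameterizing its logarithm."""
    return self.log_alpha.exp()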
def __init__(self, seed, state_dim, action_dim, action_lim=1, lr=3e-4, gamma=0.99,
             tau=5e-3, batch_size=256, hidden_size=256, update_interval=2,
             buffer_size=1e6):
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.update_interval = update_interval
    self.action_lim = action_lim

    torch.manual_seed(seed)

    # aka critic
    self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
    self.target_q_funcs = copy.deepcopy(self.q_funcs)
    self.target_q_funcs.eval()
    for p in self.target_q_funcs.parameters():
        p.requires_grad = False

    # aka actor
    self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)

    self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

    self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim,
                                  capacity=int(buffer_size))

    self._seed = seed
    self._update_counter = 0
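# --- illustrative sketch, not part of the original file ---
# Shows one common way `update_interval` and `_update_counter` from __init__ are
# combined in TD3-style agents: the critic trains every step, while the actor and
# target networks only update on every `update_interval`-th call. The helper name
# is hypothetical and only indicates where such a gate would sit.
def _delayed_update_due(self) -> bool:
    """Return True on steps where the delayed actor/target update should run."""
    self._update_counter += 1
    return self._update_counter % self.update_interval == 0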
def reallocate_replay_pool(self, new_size: int) -> None:
    """Replace the replay buffer with a freshly allocated one of a different capacity.

    Args:
        new_size (int): new maximum buffer size.
    """
    assert new_size != self.replay_pool.capacity, \
        "The new pool must have a different capacity from the current one"
    new_replay_pool = ReplayPool(capacity=new_size)
    new_replay_pool.initialise(self.replay_pool)
    self.replay_pool = new_replay_pool
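# --- usage note (assumption), not part of the original file ---
# `initialise` above is assumed to copy the existing transitions from the old pool
# into the new one. A typical call from outside the class, with a hypothetical size:
#
#     agent.reallocate_replay_pool(int(2e6))   # grow the buffer mid-run
#
# Passing the current capacity trips the assertion, since nothing would change.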
def train(self):
    pool = ReplayPool(max_pool_size=self.replay_pool_size,
                      observation_dim=self._observation_dim,
                      action_dim=self._action_dim)

    terminal = False
    observation = self.env.reset()
    path_length = 0
    path_return = 0
    itr = 0

    for epoch in range(self.n_epoch):
        print('Starting epoch #%d' % epoch)
        for epoch_itr in range(self.epoch_length):
            if terminal:
                print(path_return, path_length)
                observation = self.env.reset()
                path_length = 0
                path_return = 0
            # if self.render:
            #     self.env.render()
            action = self.policy.get_action(observation)
            next_observation, reward, terminal, _ = self.env.step(action)
            path_length += 1
            path_return += reward

            # cap the episode length at the epoch length
            if not terminal and path_length >= self.epoch_length:
                terminal = True

            pool.add_sample(observation, action, reward, terminal)
            observation = next_observation

            # only train once the pool holds enough transitions
            if pool.size >= self.min_pool_size:
                batch = pool.random_batch(self.batch_size)
                self._do_training(itr, batch)

            itr += 1
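# --- illustrative sketch, not part of the original file ---
# A small evaluation rollout using the same `policy.get_action` / 4-tuple
# `env.step` interface (old Gym API) as the training loop above. The function
# name and signature are hypothetical; the project may provide its own
# evaluation routine.
def evaluate_policy(env, policy, n_episodes: int = 10, max_steps: int = 1000) -> float:
    """Average undiscounted return of `policy` over `n_episodes` rollouts."""
    total = 0.0
    for _ in range(n_episodes):
        observation = env.reset()
        for _ in range(max_steps):
            action = policy.get_action(observation)
            observation, reward, terminal, _ = env.step(action)
            total += reward
            if terminal:
                break
    return total / n_episodes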
def __init__(self, seed: int, state_dim: int, action_dim: int, action_lim: int = 1,
             lr: float = 3e-4, gamma: float = 0.99, tau: float = 5e-3,
             batchsize: int = 256, hidden_size: int = 256, update_interval: int = 2,
             buffer_size: int = int(1e6), target_noise: float = 0.2,
             target_noise_clip: float = 0.5, explore_noise: float = 0.1,
             n_quantiles: int = 100, kappa: float = 1.0, beta: float = 0.0,
             bandit_lr: float = 0.1) -> None:
    """Initialize DOPE agent.

    Args:
        seed (int): random seed
        state_dim (int): state dimension
        action_dim (int): action dimension
        action_lim (int, optional): max action value. Defaults to 1.
        lr (float, optional): learning rate. Defaults to 3e-4.
        gamma (float, optional): discount factor. Defaults to 0.99.
        tau (float, optional): mixing rate for target nets. Defaults to 5e-3.
        batchsize (int, optional): batch size. Defaults to 256.
        hidden_size (int, optional): hidden layer size for policy. Defaults to 256.
        update_interval (int, optional): delay for actor and target updates. Defaults to 2.
        buffer_size (int, optional): size of replay buffer. Defaults to int(1e6).
        target_noise (float, optional): smoothing noise for target action. Defaults to 0.2.
        target_noise_clip (float, optional): limit for target noise. Defaults to 0.5.
        explore_noise (float, optional): noise for exploration. Defaults to 0.1.
        n_quantiles (int, optional): number of quantiles. Defaults to 100.
        kappa (float, optional): constant for Huber loss. Defaults to 1.0.
        beta (float, optional): optimism/pessimism coefficient. Defaults to 0.0.
        bandit_lr (float, optional): bandit learning rate. Defaults to 0.1.
    """
    self.gamma = gamma
    self.tau = tau
    self.batchsize = batchsize
    self.update_interval = update_interval
    self.action_lim = action_lim
    self.target_noise = target_noise
    self.target_noise_clip = target_noise_clip
    self.explore_noise = explore_noise

    torch.manual_seed(seed)

    # init critic(s)
    self.q_funcs = QuantileDoubleQFunc(state_dim, action_dim, n_quantiles=n_quantiles,
                                       hidden_size=hidden_size).to(device)
    self.target_q_funcs = copy.deepcopy(self.q_funcs)
    self.target_q_funcs.eval()
    for p in self.target_q_funcs.parameters():
        p.requires_grad = False

    # init actor
    self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)
    self.target_policy = copy.deepcopy(self.policy)
    for p in self.target_policy.parameters():
        p.requires_grad = False

    # set distributional parameters: quantile midpoints tau_hat_i
    taus = torch.arange(0, n_quantiles + 1, device=device, dtype=torch.float32) / n_quantiles
    self.tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, n_quantiles)
    self.n_quantiles = n_quantiles
    self.kappa = kappa

    # bandit top-down controller
    self.TDC = ExpWeights(arms=[-1, 0], lr=bandit_lr, init=0.0, use_std=True)

    # init optimizers
    self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
    self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

    self.replay_pool = ReplayPool(capacity=int(buffer_size))

    self._update_counter = 0
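# --- illustrative sketch, not part of the original file ---
# Shows how `self.tau_hats` and `self.kappa` set above typically enter a
# quantile-regression critic loss (QR-DQN / TQC style). Assumed shapes:
# current_quantiles and target_quantiles are both (batch, n_quantiles).
# The method name is hypothetical and the agent's actual loss (e.g. how the two
# critics are combined) may differ.
def _quantile_huber_loss(self, current_quantiles: torch.Tensor,
                         target_quantiles: torch.Tensor) -> torch.Tensor:
    """Mean quantile Huber loss between predicted and target quantiles."""
    # pairwise TD errors delta_ij: (batch, n_target_quantiles, n_current_quantiles)
    td_errors = target_quantiles.unsqueeze(-1) - current_quantiles.unsqueeze(1)
    huber = torch.where(td_errors.abs() <= self.kappa,
                        0.5 * td_errors.pow(2),
                        self.kappa * (td_errors.abs() - 0.5 * self.kappa))
    # asymmetric quantile weight |tau_hat_i - 1{delta_ij < 0}| on the predicted-quantile axis
    weight = (self.tau_hats.unsqueeze(1) - (td_errors.detach() < 0).float()).abs()
    # sum over predicted quantiles, average over target quantiles and the batch
    return (weight * huber / self.kappa).sum(dim=-1).mean()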
def reallocate_replay_pool(self, new_size: int):
    """Replace the replay buffer with a freshly allocated one of a different capacity."""
    assert new_size != self.replay_pool.capacity, \
        "The new pool must have a different capacity from the current one"
    new_replay_pool = ReplayPool(capacity=new_size)
    new_replay_pool.initialise(self.replay_pool)
    self.replay_pool = new_replay_pool
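# --- illustrative sketch, not part of the original file ---
# Shows how `target_noise`, `target_noise_clip`, and `action_lim` from __init__
# are usually combined for TD3-style target policy smoothing when forming the
# critic target. That `self.target_policy(nextstate)` returns an action tensor is
# an assumption about the Policy interface; the method name is hypothetical.
def _smoothed_target_action(self, nextstate: torch.Tensor) -> torch.Tensor:
    """Target action with clipped Gaussian smoothing noise, clamped to the action range."""
    with torch.no_grad():
        target_action = self.target_policy(nextstate)
        noise = (torch.randn_like(target_action) * self.target_noise).clamp(
            -self.target_noise_clip, self.target_noise_clip)
        return (target_action + noise).clamp(-self.action_lim, self.action_lim)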