def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGBulletNetworkSURND(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = MetaCriticRNDMotivation(self.network.metacritic_model, config.motivation_lr, config.motivation_variant, config.motivation_eta,
                                              self.memory, config.motivation_batch_size, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGAerisNetworkFIM(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = ForwardInverseModelMotivation(self.network.forward_model, config.forward_model_lr,
                                                    self.network.inverse_model, config.forward_model_lr,
                                                    0.5, config.forward_model_eta, config.forward_model_variant,
                                                    self.memory, config.forward_model_batch_size, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

class DDPGBulletAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetwork(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        indices = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(indices)

class DDPGBulletRNDModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkRND(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = RNDMotivation(self.network.rnd_model, config.motivation_lr, config.motivation_eta,
                                        self.memory, config.motivation_batch_size, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.motivation_batch_size))

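# For orientation, a minimal driver-loop sketch showing how the per-step
# train(state0, action0, state1, reward, mask) call above would be fed.
# Assumptions: `env` is a gym-style environment and `agent.get_action` is
# provided by the DDPGAgent base class, which is not shown in this excerpt;
# only the `train` signature itself comes from the code above.
def run_rnd_agent(env, config, total_steps):
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = DDPGBulletRNDModelAgent(state_dim, action_dim, config)

    state0 = env.reset()
    for _ in range(total_steps):
        action0 = agent.get_action(state0)           # assumed base-class method
        state1, reward, done, _ = env.step(action0)
        mask = 0.0 if done else 1.0                  # zero the bootstrap term on terminal transitions
        agent.train(state0, action0, state1, reward, mask)
        state0 = env.reset() if done else state1
    return agent
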
class DDPGAerisM2SModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkFM(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = M2SMotivation(self.network, config.forward_model_lr, config.forward_model_eta,
                                        self.memory, config.forward_model_batch_size, config.steps * 1e6)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.forward_model_batch_size))

import copy

import torch


class DQN:
    def __init__(self, critic, memory_size, sample_size, critic_lr, gamma, weight_decay=0):
        self._critic = critic
        self._critic_target = copy.deepcopy(critic)
        self._memory = ExperienceReplayBuffer(memory_size)
        self._sample_size = sample_size
        self._gamma = gamma
        self._update_step = 0
        self._hard_update(self._critic_target, self._critic)
        self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=critic_lr, weight_decay=weight_decay)

    def get_action(self, state):
        # Greedy action: index of the highest Q-value for a single (1-D) state.
        return self.activate(state).argmax(0).item()

    def train(self, state0, action0, state1, reward, done):
        self._memory.add(state0, action0, state1, reward, done)

        if len(self._memory) > self._sample_size:
            sample = self._memory.sample(self._sample_size)

            states = torch.stack(sample.state)
            next_states = torch.stack(sample.next_state)
            actions = torch.Tensor(sample.action).long().unsqueeze(1)
            rewards = torch.Tensor(sample.reward)
            masks = torch.Tensor(sample.mask)

            # Q(s0, a0) from the online critic and the bootstrapped target
            # r + gamma * max_a Q_target(s1, a), with the mask zeroing the
            # bootstrap term on terminal transitions.
            Qs0a = self._critic(states).gather(1, actions).squeeze()
            Qs1max = self._critic_target(next_states).max(1)[0]
            target = rewards + masks * self._gamma * Qs1max

            loss = torch.nn.functional.mse_loss(Qs0a, target.detach())
            self._critic_optimizer.zero_grad()
            loss.backward()
            self._critic_optimizer.step()

            # Hard-copy the online critic into the target critic every 100 updates.
            self._update_step += 1
            if self._update_step == 100:
                self._update_step = 0
                self._hard_update(self._critic_target, self._critic)

    def activate(self, state):
        return self._critic(state)

    def _hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

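# Minimal usage sketch for the DQN class above. The environment is assumed to
# be gym-style with a discrete action space; the two-layer critic, the buffer
# sizes and the epsilon value are illustrative placeholders, not part of the
# original code.
import random

import torch.nn as nn


def run_dqn(env, total_steps, epsilon=0.1):
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    critic = nn.Sequential(nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, action_dim))
    dqn = DQN(critic, memory_size=10000, sample_size=64, critic_lr=1e-3, gamma=0.99)

    state = torch.as_tensor(env.reset(), dtype=torch.float32)
    for _ in range(total_steps):
        # Epsilon-greedy exploration around the greedy get_action above.
        if random.random() < epsilon:
            action = random.randrange(action_dim)
        else:
            action = dqn.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = torch.as_tensor(next_state, dtype=torch.float32)
        dqn.train(state, action, next_state, reward, 0.0 if done else 1.0)
        state = torch.as_tensor(env.reset(), dtype=torch.float32) if done else next_state
    return dqn
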
def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGAerisNetworkQRND(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = QRNDMotivation(self.network.qrnd_model, config.forward_model_lr, config.forward_model_eta, self.memory)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

def __init__(self, state_dim, action_dim, config):
    super().__init__(state_dim, action_dim, config)
    self.network = DDPGBulletNetworkDOPSimple(state_dim, action_dim, config)
    self.memory = ExperienceReplayBuffer(config.memory_size)
    self.motivation = DOPSimpleMotivation(self.network.dop_model, config.motivation_lr, config.motivation_eta, self.memory, config.device)
    self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)

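# Illustrative config sketch: these are exactly the attributes the agent
# constructors above read from `config`; the concrete values and the variant
# strings are placeholders, not the project's actual defaults.
from types import SimpleNamespace

example_config = SimpleNamespace(
    # shared DDPG hyperparameters
    memory_size=100000, batch_size=64,
    actor_lr=1e-4, critic_lr=2e-4, gamma=0.99, tau=5e-3,
    device='cpu',
    # RND / MetaCritic / DOP motivation hyperparameters
    motivation_lr=1e-4, motivation_eta=1.0,
    motivation_batch_size=64, motivation_variant='A',
    # forward/inverse-model (FIM, FM, QRND, M2S) hyperparameters
    forward_model_lr=1e-4, forward_model_eta=1.0,
    forward_model_batch_size=64, forward_model_variant='A',
    steps=1,  # the M2S agent scales this as config.steps * 1e6
)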