Example #1
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkSURND(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        # MetaCritic-RND intrinsic motivation, trained on samples drawn from the shared replay buffer
        self.motivation = MetaCriticRNDMotivation(self.network.metacritic_model, config.motivation_lr, config.motivation_variant, config.motivation_eta, self.memory, config.motivation_batch_size,
                                                  config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)
Example #2
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkFIM(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        # Intrinsic motivation built from the network's forward and inverse dynamics models
        self.motivation = ForwardInverseModelMotivation(self.network.forward_model, config.forward_model_lr, self.network.inverse_model, config.forward_model_lr,
                                                        0.5, config.forward_model_eta,
                                                        config.forward_model_variant, self.memory, config.forward_model_batch_size, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)
Example #3
    def __init__(self, critic, memory_size, sample_size, critic_lr, gamma, weight_decay=0):
        self._critic = critic
        self._critic_target = copy.deepcopy(critic)
        self._memory = ExperienceReplayBuffer(memory_size)
        self._sample_size = sample_size
        self._gamma = gamma
        self._update_step = 0

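        # The target critic starts as an exact copy of the online critic; Adam updates only the online critic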
        self._hard_update(self._critic_target, self._critic)
        self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=critic_lr, weight_decay=weight_decay)
Example #4
class DDPGBulletAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetwork(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)

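    # One environment step: store the transition, then run a single DDPG update on a sampled mini-batch of buffer indices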
    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        indices = self.memory.indices(self.config.batch_size)
        self.algorithm.train_sample(indices)
Example #5
class DDPGBulletRNDModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkRND(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = RNDMotivation(self.network.rnd_model, config.motivation_lr, config.motivation_eta, self.memory, config.motivation_batch_size, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

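    # Store the transition, update the DDPG actor-critic, then train the RND predictor on its own mini-batch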
    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.motivation_batch_size))
Example #6
class DDPGAerisM2SModelAgent(DDPGAgent):
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkFM(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        self.motivation = M2SMotivation(self.network, config.forward_model_lr, config.forward_model_eta, self.memory, config.forward_model_batch_size, config.steps * 1e6)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)

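    # Store the transition, update the DDPG actor-critic, then train the M2S motivation model on its own mini-batch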
    def train(self, state0, action0, state1, reward, mask):
        self.memory.add(state0, action0, state1, reward, mask)
        self.algorithm.train_sample(self.memory.indices(self.config.batch_size))
        self.motivation.train(self.memory.indices(self.config.forward_model_batch_size))
Example #7
class DQN:
    def __init__(self, critic, memory_size, sample_size, critic_lr, gamma, weight_decay=0):
        self._critic = critic
        self._critic_target = copy.deepcopy(critic)
        self._memory = ExperienceReplayBuffer(memory_size)
        self._sample_size = sample_size
        self._gamma = gamma
        self._update_step = 0

        self._hard_update(self._critic_target, self._critic)
        self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=critic_lr, weight_decay=weight_decay)

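    # Greedy action selection: return the index of the largest Q-value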
    def get_action(self, state):
        return self.activate(state).argmax(0).item()

    def train(self, state0, action0, state1, reward, mask):
        self._memory.add(state0, action0, state1, reward, mask)

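        # Learn only once the buffer holds more than one full batch of transitions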
        if len(self._memory) > self._sample_size:
            sample = self._memory.sample(self._sample_size)

            states = torch.stack(sample.state)
            next_states = torch.stack(sample.next_state)
            actions = torch.Tensor(sample.action).long().unsqueeze(1)
            rewards = torch.Tensor(sample.reward)
            masks = torch.Tensor(sample.mask)

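            # TD target: reward + mask * gamma * max_a Q_target(next_state, a); a zero mask cuts the bootstrap on terminal transitions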
            Qs0a = self._critic(states).gather(1, actions).squeeze()
            Qs1max = self._critic_target(next_states).max(1)[0]
            target = rewards + masks * self._gamma * Qs1max

            loss = torch.nn.functional.mse_loss(Qs0a, target.detach())
            self._critic_optimizer.zero_grad()
            loss.backward()
            self._critic_optimizer.step()

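            # Hard-sync the target critic with the online critic every 100 training updates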
            self._update_step += 1
            if self._update_step == 100:
                self._update_step = 0
                self._hard_update(self._critic_target, self._critic)

    def activate(self, state):
        return self._critic(state)

    def _hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
Example #8
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetwork(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        # Plain DDPG baseline: no intrinsic motivation module is passed to the algorithm
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)
Example #9
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGAerisNetworkQRND(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        # QRND intrinsic motivation trained on samples from the shared replay buffer
        self.motivation = QRNDMotivation(self.network.qrnd_model, config.forward_model_lr, config.forward_model_eta, self.memory)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size, self.motivation)
Example #10
    def __init__(self, state_dim, action_dim, config):
        super().__init__(state_dim, action_dim, config)
        self.network = DDPGBulletNetworkDOPSimple(state_dim, action_dim, config)
        self.memory = ExperienceReplayBuffer(config.memory_size)
        # DOP (simple variant) intrinsic motivation; note it is not passed to the DDPG constructor here
        self.motivation = DOPSimpleMotivation(self.network.dop_model, config.motivation_lr, config.motivation_eta, self.memory, config.device)
        self.algorithm = DDPG(self.network, config.actor_lr, config.critic_lr, config.gamma, config.tau, self.memory, config.batch_size)