Example #1
    def __init__(self, seed, state_dim, action_dim, lr=3e-4, gamma=0.99, tau=5e-3, batchsize=256, hidden_size=256, update_interval=1, buffer_size=int(1e6), target_entropy=None):
        self.gamma = gamma
        self.tau = tau
        self.target_entropy = target_entropy if target_entropy is not None else -action_dim
        self.batchsize = batchsize
        self.update_interval = update_interval

        torch.manual_seed(seed)

        # aka critic
        self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # aka actor
        self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)

        # aka temperature
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)

        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)
        self.temp_optimizer = torch.optim.Adam([self.log_alpha], lr=lr)

        self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim, capacity=int(buffer_size))
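
The constructor above stores `tau` but the soft target update itself is not shown. Below is a minimal sketch of how `target_q_funcs` is typically refreshed with Polyak averaging; the method name `update_target` is an assumption, not necessarily this class's actual API.

    def update_target(self):
        # Polyak/soft update: target <- tau * online + (1 - tau) * target.
        with torch.no_grad():
            for target_p, p in zip(self.target_q_funcs.parameters(),
                                    self.q_funcs.parameters()):
                target_p.data.mul_(1.0 - self.tau)
                target_p.data.add_(self.tau * p.data)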
Example #2
    def __init__(self, seed, state_dim, action_dim,
                 action_lim=1, lr=3e-4, gamma=0.99,
                 tau=5e-3, batch_size=256, hidden_size=256,
                 update_interval=2, buffer_size=1e6):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.action_lim = action_lim

        torch.manual_seed(seed)

        # aka critic
        self.q_funcs = DoubleQFunc(state_dim, action_dim, hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # aka actor
        self.policy = Policy(state_dim, action_dim, hidden_size=hidden_size).to(device)

        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

        self.replay_pool = ReplayPool(action_dim=action_dim, state_dim=state_dim, capacity=int(buffer_size))

        self._seed = seed

        self._update_counter = 0
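
Neither example shows the ReplayPool itself. For orientation, here is an illustrative ring-buffer implementation that takes the same constructor arguments (state_dim, action_dim, capacity); the `push`/`sample` method names are assumptions, and the project's real ReplayPool may expose a different API.

import numpy as np

class MinimalReplayPool:
    # Illustrative sketch only; not the project's ReplayPool.
    def __init__(self, state_dim, action_dim, capacity):
        self.capacity = int(capacity)
        self._states = np.zeros((self.capacity, state_dim), dtype=np.float32)
        self._actions = np.zeros((self.capacity, action_dim), dtype=np.float32)
        self._rewards = np.zeros((self.capacity, 1), dtype=np.float32)
        self._next_states = np.zeros((self.capacity, state_dim), dtype=np.float32)
        self._dones = np.zeros((self.capacity, 1), dtype=np.float32)
        self._ptr = 0
        self._size = 0

    def push(self, state, action, reward, next_state, done):
        # Overwrite the oldest slot once the buffer is full.
        i = self._ptr
        self._states[i] = state
        self._actions[i] = action
        self._rewards[i] = reward
        self._next_states[i] = next_state
        self._dones[i] = float(done)
        self._ptr = (self._ptr + 1) % self.capacity
        self._size = min(self._size + 1, self.capacity)

    def sample(self, batch_size):
        # Uniform random minibatch over the filled portion of the buffer.
        idx = np.random.randint(0, self._size, size=batch_size)
        return (self._states[idx], self._actions[idx], self._rewards[idx],
                self._next_states[idx], self._dones[idx])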
Example #3
File: dope.py  Project: jxzhangjhu/DOPE
    def reallocate_replay_pool(self, new_size: int) -> None:
        """Reset buffer

        Args:
            new_size (int): new maximum buffer size. 
        """
        assert new_size != self.replay_pool.capacity, "the new pool size must differ from the current capacity"
        new_replay_pool = ReplayPool(capacity=new_size)
        new_replay_pool.initialise(self.replay_pool)
        self.replay_pool = new_replay_pool
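
A short usage sketch for reallocate_replay_pool follows; the constructor call and dimensions are hypothetical, and initialise() is assumed to carry the old pool's samples over into the new one.

agent = DOPE(seed=0, state_dim=17, action_dim=6)   # hypothetical class name and dimensions
agent.reallocate_replay_pool(new_size=int(2e5))    # replaces the default 1e6-capacity pool
assert agent.replay_pool.capacity == int(2e5)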
Example #4
    def train(self):
        pool = ReplayPool(max_pool_size=self.replay_pool_size,
                          observation_dim=self._observation_dim,
                          action_dim=self._action_dim)

        terminal = False
        observation = self.env.reset()
        path_length = 0
        path_return = 0
        itr = 0

        for epoch in range(self.n_epoch):
            print('Starting epoch #%d' % epoch)
            for epoch_itr in range(self.epoch_length):

                if terminal:
                    print(path_return, path_length)
                    observation = self.env.reset()
                    path_length = 0
                    path_return = 0
                # if self.render:
                #     self.env.render()

                action = self.policy.get_action(observation)
                next_observation, reward, terminal, _ = self.env.step(action)
                path_length += 1
                path_return += reward

                if not terminal and path_length >= self.epoch_length:
                    terminal = True

                pool.add_sample(observation, action, reward, terminal)
                observation = next_observation

                if pool.size >= self.min_pool_size:
                    batch = pool.random_batch(self.batch_size)
                    self._do_training(itr, batch)

                itr += 1
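
The gradient step itself is delegated to _do_training, which is not part of this excerpt. The terminal flag stored with each sample is what masks the bootstrap term in a one-step TD target; a generic helper illustrating that use of gamma and terminal is sketched below (it is not the project's code).

import torch

def td_target(reward, done, next_value, gamma=0.99):
    # One-step bootstrapped target: y = r + gamma * (1 - done) * V(s').
    # `done` is 1.0 for terminal transitions, 0.0 otherwise, so terminal
    # samples contribute only their immediate reward.
    # e.g. with batch tensors of shape (batch_size, 1):
    #   y = td_target(batch_reward, batch_done, target_q(next_obs, next_act))
    return reward + gamma * (1.0 - done) * next_value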
Example #5
File: dope.py  Project: jxzhangjhu/DOPE
    def __init__(self,
                 seed: int,
                 state_dim: int,
                 action_dim: int,
                 action_lim: int = 1,
                 lr: float = 3e-4,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 batchsize: int = 256,
                 hidden_size: int = 256,
                 update_interval: int = 2,
                 buffer_size: int = int(1e6),
                 target_noise: float = 0.2,
                 target_noise_clip: float = 0.5,
                 explore_noise: float = 0.1,
                 n_quantiles: int = 100,
                 kappa: float = 1.0,
                 beta: float = 0.0,
                 bandit_lr: float = 0.1) -> None:
        """
        Initialize DOPE agent. 

        Args:
            seed (int): random seed
            state_dim (int): state dimension
            action_dim (int): action dimension
            action_lim (int, optional): max action value. Defaults to 1.
            lr (float, optional): learning rate. Defaults to 3e-4.
            gamma (float, optional): discount factor. Defaults to 0.99.
            tau (float, optional): mixing rate for target nets. Defaults to 5e-3.
            batchsize (int, optional): batch size. Defaults to 256.
            hidden_size (int, optional): hidden layer size for policy. Defaults to 256.
            update_interval (int, optional): delay for actor, target updates. Defaults to 2.
            buffer_size (int, optional): size of replay buffer. Defaults to int(1e6).
            target_noise (float, optional): smoothing noise for target action. Defaults to 0.2.
            target_noise_clip (float, optional): limit for target. Defaults to 0.5.
            explore_noise (float, optional): noise for exploration. Defaults to 0.1.
            n_quantiles (int, optional): number of quantiles. Defaults to 100.
            kappa (float, optional): constant for Huber loss. Defaults to 1.0.
            beta (float, optional): Defaults to 0.0.
            bandit_lr (float, optional): bandit learning rate. Defaults to 0.1.
        """
        self.gamma = gamma
        self.tau = tau
        self.batchsize = batchsize
        self.update_interval = update_interval
        self.action_lim = action_lim

        self.target_noise = target_noise
        self.target_noise_clip = target_noise_clip
        self.explore_noise = explore_noise

        torch.manual_seed(seed)

        # init critic(s)
        self.q_funcs = QuantileDoubleQFunc(state_dim,
                                           action_dim,
                                           n_quantiles=n_quantiles,
                                           hidden_size=hidden_size).to(device)
        self.target_q_funcs = copy.deepcopy(self.q_funcs)
        self.target_q_funcs.eval()
        for p in self.target_q_funcs.parameters():
            p.requires_grad = False

        # init actor
        self.policy = Policy(state_dim, action_dim,
                             hidden_size=hidden_size).to(device)
        self.target_policy = copy.deepcopy(self.policy)
        for p in self.target_policy.parameters():
            p.requires_grad = False

        # set distributional parameters
        taus = torch.arange(
            0, n_quantiles + 1, device=device,
            dtype=torch.float32) / n_quantiles
        self.tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, n_quantiles)
        self.n_quantiles = n_quantiles
        self.kappa = kappa

        # bandit top-down controller
        self.TDC = ExpWeights(arms=[-1, 0],
                              lr=bandit_lr,
                              init=0.0,
                              use_std=True)

        # init optimizers
        self.q_optimizer = torch.optim.Adam(self.q_funcs.parameters(), lr=lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=lr)

        self.replay_pool = ReplayPool(capacity=int(buffer_size))

        self._update_counter = 0
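
The constructor precomputes the quantile midpoints `tau_hats` and the Huber threshold `kappa`, but the loss that consumes them is not shown here. Below is a generic QR-style quantile Huber loss consistent with those shapes; it is a sketch of the standard formulation, not necessarily the exact loss used in dope.py.

import torch

def quantile_huber_loss(current_quantiles, target_quantiles, tau_hats, kappa=1.0):
    # current_quantiles, target_quantiles: (batch, n_quantiles)
    # tau_hats: (1, n_quantiles) quantile midpoints, as built in __init__ above.
    # Pairwise TD errors, shape (batch, n_target, n_current).
    td_errors = target_quantiles.unsqueeze(-1) - current_quantiles.unsqueeze(1)
    abs_td = td_errors.abs()
    huber = torch.where(abs_td <= kappa,
                        0.5 * td_errors.pow(2),
                        kappa * (abs_td - 0.5 * kappa))
    # Asymmetric quantile weight |tau_hat_j - 1{td < 0}| on the predicted-quantile axis.
    weight = (tau_hats.unsqueeze(1) - (td_errors.detach() < 0).float()).abs()
    # Average over target quantiles, sum over predicted quantiles, mean over batch.
    return (weight * huber / kappa).mean(dim=1).sum(dim=1).mean()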