Example #1
    def __init__(self, env, test_env, log_dir, num_steps=5 * (10 ** 7),
                 batch_size=32, N=32, num_cosines=64, ent_coef=0,
                 kappa=1.0, quantile_lr=5e-5, fraction_lr=2.5e-9,
                 memory_size=10 ** 6, gamma=0.99, multi_step=1,
                 update_interval=4, target_update_interval=10000,
                 start_steps=50000, epsilon_train=0.01, epsilon_eval=0.001,
                 epsilon_decay_steps=250000, double_q_learning=False,
                 dueling_net=False, noisy_net=False, use_per=False,
                 log_interval=100, eval_interval=250000, num_eval_steps=125000,
                 max_episode_steps=27000, grad_cliping=None, cuda=True,
                 seed=0):
        super(FQFAgent, self).__init__(
            env, test_env, log_dir, num_steps, batch_size, memory_size,
            gamma, multi_step, update_interval, target_update_interval,
            start_steps, epsilon_train, epsilon_eval, epsilon_decay_steps,
            double_q_learning, dueling_net, noisy_net, use_per, log_interval,
            eval_interval, num_eval_steps, max_episode_steps, grad_cliping,
            cuda, seed)

        # Online network.
        self.online_net = FQF(
            num_channels=env.observation_space.shape[0],
            num_actions=self.num_actions, N=N,
            num_cosines=num_cosines, dueling_net=dueling_net,
            noisy_net=noisy_net).to(self.device)
        # Target network.
        self.target_net = FQF(
            num_channels=env.observation_space.shape[0],
            num_actions=self.num_actions, N=N,
            num_cosines=num_cosines, dueling_net=dueling_net,
            noisy_net=noisy_net, target=True).to(self.device)

        # Copy parameters of the learning network to the target network.
        self.update_target()
        # Disable calculations of gradients of the target network.
        disable_gradients(self.target_net)

        self.fraction_optim = RMSprop(
            self.online_net.fraction_net.parameters(),
            lr=fraction_lr, alpha=0.95, eps=0.00001)

        self.quantile_optim = Adam(
            list(self.online_net.dqn_net.parameters())
            + list(self.online_net.cosine_net.parameters())
            + list(self.online_net.quantile_net.parameters()),
            lr=quantile_lr, eps=1e-2 / batch_size)

        # NOTE: The author said that training of the fraction proposal
        # network is unstable, and that in rare cases (e.g. 1 out of 20
        # seeds) the value distribution degenerates into a deterministic
        # one. You can therefore use the entropy of the value distribution
        # as a regularizer to stabilize (but possibly slow down) training.
        self.ent_coef = ent_coef
        self.N = N
        self.num_cosines = num_cosines
        self.kappa = kappa
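
The `ent_coef` hyperparameter above scales the entropy regularizer mentioned in the NOTE. Below is a minimal sketch of how such an entropy term could be computed, assuming the fraction proposal network outputs cumulative fractions `taus` of shape `(batch, N + 1)` with endpoints 0 and 1; the helper name and the `1e-8` stabilizer are illustrative and not taken from the example above.

import torch

def fraction_entropy(taus):
    # Consecutive differences of the cumulative fractions form a
    # probability vector over the N quantile bins.
    probs = taus[:, 1:] - taus[:, :-1]              # shape (batch, N)
    return -(probs * torch.log(probs + 1e-8)).sum(dim=1)

# With `ent_coef` from the constructor, a term like
# `-ent_coef * fraction_entropy(taus).mean()` could be added to the
# fraction loss to discourage degenerate (deterministic) distributions.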
Example #2
    def __init__(self, env, test_env, log_dir, num_steps=5 * (10 ** 7),
                 batch_size=32, N=200, kappa=1.0, lr=5e-5, memory_size=10 ** 6,
                 gamma=0.99, multi_step=1, update_interval=4,
                 target_update_interval=10000, start_steps=50000,
                 epsilon_train=0.01, epsilon_eval=0.001,
                 epsilon_decay_steps=250000, double_q_learning=False,
                 dueling_net=False, noisy_net=False, use_per=False,
                 log_interval=100, eval_interval=250000, num_eval_steps=125000,
                 max_episode_steps=27000, grad_cliping=None, cuda=True,
                 seed=0):
        super(QRDQNAgent, self).__init__(
            env, test_env, log_dir, num_steps, batch_size, memory_size,
            gamma, multi_step, update_interval, target_update_interval,
            start_steps, epsilon_train, epsilon_eval, epsilon_decay_steps,
            double_q_learning, dueling_net, noisy_net, use_per, log_interval,
            eval_interval, num_eval_steps, max_episode_steps, grad_cliping,
            cuda, seed)

        # Online network.
        self.online_net = QRDQN(
            num_channels=env.observation_space.shape[0],
            num_actions=self.num_actions, N=N, dueling_net=dueling_net,
            noisy_net=noisy_net).to(self.device)

        # Target network.
        self.target_net = QRDQN(
            num_channels=env.observation_space.shape[0],
            num_actions=self.num_actions, N=N, dueling_net=dueling_net,
            noisy_net=noisy_net).to(self.device)

        # Copy parameters of the learning network to the target network.
        self.update_target()
        # Disable calculations of gradients of the target network.
        disable_gradients(self.target_net)

        self.optim = Adam(
            self.online_net.parameters(),
            lr=lr, eps=1e-2 / batch_size)

        # Fixed fractions.
        taus = torch.arange(
            0, N + 1, device=self.device, dtype=torch.float32) / N
        self.tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, N)

        self.N = N
        self.kappa = kappa
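
The `tau_hats` buffer above holds the fixed quantile midpoints tau_hat_i = (2i + 1) / (2N) for i = 0, ..., N-1 used by QR-DQN. A small standalone sketch with N=4 (instead of the default N=200) makes the values concrete:

import torch

N = 4  # small N just for illustration; the constructor above defaults to N=200
taus = torch.arange(0, N + 1, dtype=torch.float32) / N
# taus     -> tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])
tau_hats = ((taus[1:] + taus[:-1]) / 2.0).view(1, N)
# tau_hats -> tensor([[0.1250, 0.3750, 0.6250, 0.8750]])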
Example #3
    def __init__(self,
                 env,
                 test_env,
                 log_dir,
                 num_steps=5 * (10**7),
                 batch_size=32,
                 N=64,
                 N_dash=64,
                 K=32,
                 num_cosines=64,
                 kappa=1.0,
                 lr=5e-5,
                 memory_size=10**6,
                 gamma=0.99,
                 multi_step=1,
                 update_interval=4,
                 target_update_interval=10000,
                 start_steps=50000,
                 epsilon_train=0.01,
                 epsilon_eval=0.001,
                 epsilon_decay_steps=250000,
                 double_q_learning=False,
                 dueling_net=False,
                 noisy_net=False,
                 use_per=False,
                 log_interval=100,
                 eval_interval=250000,
                 num_eval_steps=125000,
                 max_episode_steps=27000,
                 grad_cliping=None,
                 cuda=True,
                 seed=0):
        super(IQNAgent,
              self).__init__(env, test_env, log_dir, num_steps, batch_size,
                             memory_size, gamma, multi_step, update_interval,
                             target_update_interval, start_steps,
                             epsilon_train, epsilon_eval, epsilon_decay_steps,
                             double_q_learning, dueling_net, noisy_net,
                             use_per, log_interval, eval_interval,
                             num_eval_steps, max_episode_steps, grad_cliping,
                             cuda, seed)

        # Online network.
        self.online_net = IQN(num_channels=env.observation_space.shape[0],
                              num_actions=self.num_actions,
                              K=K,
                              num_cosines=num_cosines,
                              dueling_net=dueling_net,
                              noisy_net=noisy_net).to(self.device)
        # Target network.
        self.target_net = IQN(num_channels=env.observation_space.shape[0],
                              num_actions=self.num_actions,
                              K=K,
                              num_cosines=num_cosines,
                              dueling_net=dueling_net,
                              noisy_net=noisy_net).to(self.device)

        # Copy parameters of the learning network to the target network.
        self.update_target()
        # Disable calculations of gradients of the target network.
        disable_gradients(self.target_net)

        self.optim = Adam(self.online_net.parameters(),
                          lr=lr,
                          eps=1e-2 / batch_size)

        self.N = N
        self.N_dash = N_dash
        self.K = K
        self.num_cosines = num_cosines
        self.kappa = kappa
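
Unlike QR-DQN, IQN samples its quantile fractions uniformly at random rather than fixing them, which is why the constructor stores three sample counts. A minimal sketch of their roles, following the IQN paper (the tensor names here are illustrative, not taken from the example above):

import torch

batch_size = 32
taus = torch.rand(batch_size, 64)        # N fractions for the online quantile estimates
tau_dashes = torch.rand(batch_size, 64)  # N_dash fractions for the target estimates
tau_tildes = torch.rand(batch_size, 32)  # K fractions averaged into Q-values for action selection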