Example #1
File: trainer.py  Project: cbschaff/rsa
    def __init__(self,
                 logdir,
                 model,
                 opt,
                 datafile,
                 batch_size,
                 num_workers,
                 gpu=True):
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.data = DemonstrationData(datafile)
        self.sampler = StatefulSampler(self.data, shuffle=True)
        self.dtrain = DataLoader(self.data,
                                 sampler=self.sampler,
                                 batch_size=batch_size,
                                 num_workers=num_workers)
        self._diter = None
        self.t = 0
        self.epochs = 0
        self.batch_size = batch_size

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
        self.model = model
        self.model.to(self.device)
        self.opt = opt(self.model.parameters())
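
Note that these listings show only `__init__` methods; they assume module-level imports of `os`, `numpy as np`, `torch`, `torch.nn as nn`, the torch/torchvision data utilities, and the project's own helpers (`Checkpointer`, `StatefulSampler`, the actors and data managers). Also note that `opt` is passed in as an optimizer factory rather than an optimizer instance, and is only bound to parameters after the model has been moved to the chosen device. A minimal standalone sketch of that pattern in plain PyTorch (the model and learning rate here are illustrative, not from the project):

import functools

import torch
import torch.nn as nn

# Pick the device the same way the trainer does: CUDA only if requested and available.
gpu = True
device = torch.device('cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

# `opt` is a constructor (or functools.partial) that still needs the parameters.
opt = functools.partial(torch.optim.Adam, lr=1e-3)

model = nn.Linear(4, 2)                 # stand-in for the real model
model.to(device)
optimizer = opt(model.parameters())     # bind parameters only after .to(device)
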
Example #2
File: trainer.py  Project: takuma-ynd/dl
    def __init__(self, logdir, model, opt, batch_size, num_workers, gpu=True):
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])
        self.data_train = datasets.MNIST('./data_train',
                                         download=True,
                                         transform=self.transform)
        self.data_test = datasets.MNIST('./data_test',
                                        download=True,
                                        train=False,
                                        transform=self.transform)
        self.sampler = StatefulSampler(self.data_train, shuffle=True)
        self.dtrain = DataLoader(self.data_train,
                                 sampler=self.sampler,
                                 batch_size=batch_size,
                                 num_workers=num_workers)
        self.dtest = DataLoader(self.data_test,
                                batch_size=batch_size,
                                num_workers=num_workers)
        self._diter = None
        self.t = 0
        self.epochs = 0
        self.batch_size = batch_size

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
        self.model = model
        self.model.to(self.device)
        self.opt = opt(self.model.parameters())
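
For reference, the normalization constants above, (0.1307,) and (0.3081,), are the usual MNIST per-channel mean and standard deviation. A self-contained sketch of the same torchvision data pipeline that just pulls one batch (root path and batch size are illustrative):

import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
data_train = datasets.MNIST('./data_train', train=True, download=True,
                            transform=transform)
dtrain = DataLoader(data_train, batch_size=32, shuffle=True, num_workers=0)

images, labels = next(iter(dtrain))
print(images.shape, labels.shape)  # torch.Size([32, 1, 28, 28]) torch.Size([32])
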
Example #3
File: alpha_zero.py  Project: takuma-ynd/dl
    def __init__(self,
                 logdir,
                 game,
                 policy,
                 optimizer=torch.optim.Adam,
                 n_simulations=100,
                 buffer_size=200,
                 batch_size=64,
                 batches_per_game=1,
                 gpu=True):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.game = game
        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.n_sims = n_simulations
        self.batch_size = batch_size
        self.batches_per_game = batches_per_game

        self.pi = policy.to(self.device)
        self.opt = optimizer(self.pi.parameters(), lr=1e-2, weight_decay=1e-4)

        self.buffer = GameReplay(buffer_size)
        self.data_manager = SelfPlayManager(self.pi, self.game, self.buffer,
                                            self.device)

        self.mse = nn.MSELoss()

        self.t = 0
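
The project's `GameReplay(buffer_size)` is not shown in this listing; conceptually it is a bounded buffer of self-play positions that training batches are sampled from. A minimal illustrative stand-in (an assumption about the interface, not the project's implementation) built on a deque:

import random
from collections import deque

class TinyGameReplay:
    """Illustrative bounded replay of (state, policy_target, value_target) tuples."""

    def __init__(self, capacity):
        self.storage = deque(maxlen=capacity)   # oldest entries are evicted automatically

    def add(self, state, policy_target, value_target):
        self.storage.append((state, policy_target, value_target))

    def sample(self, batch_size):
        return random.sample(self.storage, min(batch_size, len(self.storage)))
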
Example #4
    def __init__(self, env, logdir, device):
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        if not torch.cuda.is_available():
            device = 'cpu'
        self.device = device
        self.net = BCNet()
        self.net.to(device)
        self.net.load_state_dict(self.ckptr.load()['model'])
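
`Checkpointer` is a project helper; all this listing shows is that `.load()` returns a dict with a 'model' entry holding a state dict. A plain-PyTorch round trip that restores weights the same way (the checkpoint path and network are illustrative):

import torch
import torch.nn as nn

net = nn.Linear(4, 2)                                # stand-in for BCNet()
torch.save({'model': net.state_dict()}, 'ckpt.pt')   # write a checkpoint dict

restored = nn.Linear(4, 2)
state = torch.load('ckpt.pt', map_location='cpu')    # CPU fallback, as in the snippet
restored.load_state_dict(state['model'])             # same 'model' key convention
restored.eval()                                      # inference mode for a loaded actor
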
Example #5
    def __init__(self, env, logdir, device):
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        if not torch.cuda.is_available():
            device = 'cpu'
        self.pi = drone_ppo_policy_fn(env)
        self.pi.to(device)
        self.pi.load_state_dict(self.ckptr.load()['pi'])
        self.pi.eval()
        self.device = device
Example #6
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 batch_size=32,
                 rollout_length=None,
                 gamma=0.99,
                 lambda_=0.95,
                 norm_advantages=False,
                 epochs_per_rollout=10,
                 max_grad_norm=None,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 clip_param=0.2,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 gpu=True):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.epochs_per_rollout = epochs_per_rollout
        self.max_grad_norm = max_grad_norm
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.clip_param = clip_param
        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.env = VecEpisodeLogger(VecRewardNormWrapper(env_fn(nenv=nenv),
                                                         gamma))

        self.pi = policy_fn(self.env).to(self.device)
        self.opt = optimizer(self.pi.parameters())
        self.data_manager = RolloutDataManager(
            self.env,
            PPOActor(self.pi),
            self.device,
            batch_size=batch_size,
            rollout_length=rollout_length,
            gamma=gamma,
            lambda_=lambda_,
            norm_advantages=norm_advantages)

        self.mse = nn.MSELoss(reduction='none')

        self.t = 0
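
Only the constructor is shown, but `clip_param`, `ent_coef`, and `vf_coef` are the knobs of the standard clipped PPO objective. A minimal sketch of that loss on dummy tensors (this is the textbook PPO-clip formula, not necessarily the project's exact training code):

import torch

def ppo_loss(logp, logp_old, advantages, values, returns, entropy,
             clip_param=0.2, ent_coef=0.01, vf_coef=0.5):
    """Clipped surrogate + value loss - entropy bonus (standard PPO)."""
    ratio = torch.exp(logp - logp_old)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    pi_loss = -torch.min(unclipped, clipped).mean()
    vf_loss = (values - returns).pow(2).mean()
    return pi_loss + vf_coef * vf_loss - ent_coef * entropy.mean()

# Dummy batch, just to show the shapes involved.
n = 8
loss = ppo_loss(torch.randn(n), torch.randn(n), torch.randn(n),
                torch.randn(n), torch.randn(n), torch.rand(n))
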
Example #7
    def __init__(self, env, logdir, device, switch_prob=0.001):
        dirs = [x for x in os.listdir(logdir)
                if os.path.isdir(os.path.join(logdir, x, 'ckpts'))]

        self.ckptrs = [Checkpointer(os.path.join(logdir, x, 'ckpts'))
                       for x in dirs]
        if not torch.cuda.is_available():
            device = 'cpu'
        self.device = device
        self.nets = [BCNet() for _ in dirs]
        for net, ckptr in zip(self.nets, self.ckptrs):
            net.to(device)
            net.load_state_dict(ckptr.load()['model'])
        self.current_actor = np.random.choice(self.nets)
        self.switch_prob = switch_prob
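
The fields `current_actor` and `switch_prob` suggest that, at act time, this ensemble occasionally re-samples which loaded network drives the action. A hedged sketch of that switching logic (the method name and interface are assumptions, not taken from the project):

import numpy as np

class RandomSwitcher:
    """Illustrative: keep one active item, occasionally switch to another."""

    def __init__(self, items, switch_prob=0.001):
        self.items = items
        self.switch_prob = switch_prob
        self.current = np.random.choice(len(items))

    def step(self):
        # With probability switch_prob, hand control to a freshly sampled item.
        if np.random.rand() < self.switch_prob:
            self.current = np.random.choice(len(self.items))
        return self.items[self.current]
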
Example #8
    def __init__(
            self,
            logdir,
            env_fn,
            policy_fn,
            nenv=1,
            optimizer=torch.optim.Adam,
            lambda_lr=1e-4,
            lambda_init=100.,
            lr_decay_rate=1. / 3.16227766017,
            lr_decay_freq=20000000,
            l2_reg=True,
            reward_threshold=-0.05,
            rollout_length=128,
            batch_size=32,
            gamma=0.99,
            lambda_=0.95,
            norm_advantages=False,
            epochs_per_rollout=10,
            max_grad_norm=None,
            ent_coef=0.01,
            vf_coef=0.5,
            clip_param=0.2,
            base_actor_cls=None,
            policy_training_start=10000,
            lambda_training_start=100000,
            eval_num_episodes=1,
            record_num_episodes=1,
            wrapper_fn=None,  # additional wrappers for the env
            gpu=True):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.epochs_per_rollout = epochs_per_rollout
        self.max_grad_norm = max_grad_norm
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.clip_param = clip_param
        self.base_actor_cls = base_actor_cls
        self.policy_training_start = policy_training_start
        self.lambda_training_start = lambda_training_start
        self.lambda_lr = lambda_lr
        self.lr_decay_rate = lr_decay_rate
        self.lr_decay_freq = lr_decay_freq
        self.l2_reg = l2_reg
        self.reward_threshold = reward_threshold
        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        self.env = ResidualWrapper(self.env, self.base_actor_cls(self.env))
        if wrapper_fn:
            self.env = wrapper_fn(self.env)

        self.pi = policy_fn(self.env).to(self.device)
        self.opt = optimizer(self.pi.parameters())
        self.pi_lr = self.opt.param_groups[0]['lr']
        if lambda_init < 10:
            lambda_init = np.log(np.exp(lambda_init) - 1)
        self.log_lambda_ = nn.Parameter(
            torch.Tensor([lambda_init]).to(self.device))
        self.opt_l = optimizer([self.log_lambda_], lr=lambda_lr)
        self._actor = ResidualPPOActor(self.pi, policy_training_start)
        self.data_manager = RolloutDataManager(self.env,
                                               self._actor,
                                               self.device,
                                               rollout_length=rollout_length,
                                               batch_size=batch_size,
                                               gamma=gamma,
                                               lambda_=lambda_,
                                               norm_advantages=norm_advantages)

        self.mse = nn.MSELoss(reduction='none')
        self.huber = nn.SmoothL1Loss()

        self.t = 0
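
The transform `np.log(np.exp(lambda_init) - 1)` is the inverse of softplus (and is skipped for large values, where softplus is already nearly the identity and `exp` would overflow). That implies the effective multiplier is recovered as `softplus(log_lambda_)`, keeping it positive during training. A quick standalone check of that round trip:

import numpy as np
import torch
import torch.nn.functional as F

lambda_init = 5.0                            # below the `< 10` threshold above
raw = np.log(np.exp(lambda_init) - 1.0)      # inverse softplus
param = torch.tensor([raw])
print(F.softplus(param))                     # ~5.0: softplus undoes the init transform
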
Example #9
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-3,
                 qf_lr=1e-3,
                 gamma=0.99,
                 target_update_period=1,
                 policy_update_period=1,
                 target_smoothing_coef=0.005,
                 alpha=0.2,
                 automatic_entropy_tuning=True,
                 target_entropy=None,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf1 = optimizer(self.qf1.parameters(), lr=qf_lr)
        self.opt_qf2 = optimizer(self.qf2.parameters(), lr=qf_lr)

        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self.buffer = BatchedReplayBuffer(
            *[ReplayBuffer(buffer_size, frame_stack)
              for _ in range(self.nenv)])
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    SACActor(self.pi),
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.alpha = alpha
        self.automatic_entropy_tuning = automatic_entropy_tuning
        if self.automatic_entropy_tuning:
            if target_entropy:
                self.target_entropy = target_entropy
            else:
                target_entropies = nest.map_structure(
                    lambda space: -np.prod(space.shape).item(),
                    misc.unpack_space(self.env.action_space))
                self.target_entropy = sum(nest.flatten(target_entropies))

            self.log_alpha = torch.tensor(np.log([self.alpha]),
                                          requires_grad=True,
                                          device=self.device,
                                          dtype=torch.float32)
            self.opt_alpha = optimizer([self.log_alpha], lr=policy_lr)
        else:
            self.target_entropy = None
            self.log_alpha = None
            self.opt_alpha = None

        self.mse_loss = torch.nn.MSELoss()

        self.t = 0
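
`automatic_entropy_tuning` and `log_alpha` follow the standard SAC temperature update: the target entropy defaults to minus the action dimensionality, and alpha is adapted so the policy's entropy tracks that target. A sketch of that update on dummy log-probabilities (the standard SAC formulation; optimizer and batch size are illustrative):

import torch

action_dim = 6
target_entropy = -float(action_dim)          # heuristic used above: -prod(action shape)

log_alpha = torch.zeros(1, requires_grad=True)
opt_alpha = torch.optim.Adam([log_alpha], lr=1e-3)

logp = -torch.rand(256)                      # log pi(a|s) for a sampled batch
alpha_loss = -(log_alpha * (logp + target_entropy).detach()).mean()

opt_alpha.zero_grad()
alpha_loss.backward()
opt_alpha.step()
alpha = log_alpha.exp().item()               # temperature used in the actor/critic losses
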
Example #10
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 value_fn,
                 nenv=1,
                 opt_pi=torch.optim.Adam,
                 opt_vf=torch.optim.Adam,
                 batch_size=32,
                 rollout_length=None,
                 gamma=0.99,
                 lambda_=0.95,
                 ent_coef=0.01,
                 norm_advantages=False,
                 epochs_pi=10,
                 epochs_vf=10,
                 max_grad_norm=None,
                 kl_target=0.01,
                 alpha=1.5,
                 policy_training_start=10000,
                 eval_num_episodes=10,
                 record_num_episodes=0,
                 gpu=True):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.ent_coef = ent_coef
        self.epochs_pi = epochs_pi
        self.epochs_vf = epochs_vf
        self.max_grad_norm = max_grad_norm
        self.kl_target = kl_target
        self.initial_kl_weight = 0.2
        self.kl_weight = self.initial_kl_weight
        self.alpha = alpha
        self.policy_training_start = policy_training_start
        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.env = VecEpisodeLogger(VecRewardNormWrapper(env_fn(nenv=nenv),
                                                         gamma))

        self.pi = policy_fn(self.env).to(self.device)
        self.vf = value_fn(self.env).to(self.device)
        self.opt_pi = opt_pi(self.pi.parameters())
        self.opt_vf = opt_vf(self.vf.parameters())
        self._actor = ResidualPPOActor(self.pi, self.vf, policy_training_start)
        self.data_manager = RolloutDataManager(
            self.env,
            self._actor,
            self.device,
            batch_size=batch_size,
            rollout_length=rollout_length,
            gamma=gamma,
            lambda_=lambda_,
            norm_advantages=norm_advantages)

        self.mse = nn.MSELoss()

        self.t = 0
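
`kl_target`, `kl_weight`, and `alpha=1.5` match the adaptive-KL-penalty family of PPO updates: after each update the KL coefficient grows when the measured KL overshoots the target and shrinks when it undershoots. One common version of that rule (an assumption about how these fields are used, not code from the project):

def update_kl_weight(kl_weight, measured_kl, kl_target=0.01, alpha=1.5):
    """Adaptive KL penalty: grow the weight if KL is too large, shrink it if too small."""
    if measured_kl > 1.5 * kl_target:
        kl_weight *= alpha
    elif measured_kl < kl_target / 1.5:
        kl_weight /= alpha
    return kl_weight

print(update_kl_weight(0.2, measured_kl=0.05))  # KL too high -> weight grows to 0.3
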
Example #11
    def __init__(self,
                 logdir,
                 env_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.RMSprop,
                 buffer_size=100000,
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 gamma=0.99,
                 huber_loss=True,
                 exploration_timesteps=1000000,
                 final_eps=0.1,
                 eval_eps=0.05,
                 target_update_period=10000,
                 batch_size=32,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=10):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.frame_stack = frame_stack
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.eval_eps = eval_eps
        self.target_update_period = target_update_period - (
            target_update_period % self.update_period)
        self.log_period = log_period
        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        stacked_env = VecFrameStack(env_fn(nenv=nenv), self.frame_stack)

        self.qf = qf_fn(stacked_env).to(self.device)
        self.qf_targ = qf_fn(stacked_env).to(self.device)
        self.opt = optimizer(self.qf.parameters())
        if huber_loss:
            self.criterion = torch.nn.SmoothL1Loss(reduction='none')
        else:
            self.criterion = torch.nn.MSELoss(reduction='none')
        self.eps_schedule = LinearSchedule(exploration_timesteps, final_eps,
                                           1.0)
        self._actor = EpsilonGreedyActor(self.qf, self.eps_schedule,
                                         self.env.action_space)

        self.buffer = ReplayBuffer(self.buffer_size, self.frame_stack)
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    self._actor, self.device,
                                                    self.learning_starts,
                                                    self.update_period)
        self.t = 0
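
`LinearSchedule(exploration_timesteps, final_eps, 1.0)` describes epsilon annealing from 1.0 down to `final_eps`, which `EpsilonGreedyActor` then consumes. A standalone sketch of linear annealing plus epsilon-greedy action selection (the interfaces here are illustrative, not the project's classes):

import numpy as np

def linear_epsilon(t, exploration_timesteps=1_000_000, final_eps=0.1, initial_eps=1.0):
    """Linearly anneal epsilon from initial_eps to final_eps over exploration_timesteps."""
    frac = min(t / exploration_timesteps, 1.0)
    return initial_eps + frac * (final_eps - initial_eps)

def epsilon_greedy(q_values, t, n_actions):
    if np.random.rand() < linear_epsilon(t):
        return np.random.randint(n_actions)   # explore
    return int(np.argmax(q_values))           # exploit

print(linear_epsilon(0), linear_epsilon(500_000), linear_epsilon(2_000_000))
# 1.0 0.55 0.1
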
Example #12
File: ddpg.py  Project: amackeith/dl
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-4,
                 qf_lr=1e-3,
                 qf_weight_decay=0.01,
                 gamma=0.99,
                 noise_theta=0.15,
                 noise_sigma=0.2,
                 noise_sigma_final=0.01,
                 noise_decay_period=10000,
                 target_update_period=1,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                target_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
        self.t = 0

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf.to(self.device)

        self.optimizer = optimizer
        self.policy_lr = policy_lr
        self.qf_lr = qf_lr
        self.qf_weight_decay = qf_weight_decay
        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf = optimizer(self.qf.parameters(),
                                lr=qf_lr,
                                weight_decay=qf_weight_decay)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf.load_state_dict(self.qf.state_dict())

        self.noise_schedule = LinearSchedule(noise_decay_period,
                                             noise_sigma_final, noise_sigma)
        self._actor = DDPGActor(self.pi, self.env.action_space, noise_theta,
                                self.noise_schedule.value(self.t))
        self.buffer = ReplayBuffer(buffer_size, frame_stack)
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    self._actor, self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")
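
`noise_theta`/`noise_sigma` parameterize the Ornstein-Uhlenbeck exploration noise that classic DDPG adds to actions, and `target_smoothing_coef` is the Polyak rate for the target networks. Two small sketches of those pieces (textbook formulations; the project's `DDPGActor` may differ in detail):

import numpy as np
import torch

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, theta=0.15, sigma=0.2, mu=0.0):
        self.theta, self.sigma, self.mu = theta, sigma, mu
        self.state = np.full(size, mu)

    def sample(self):
        self.state += (self.theta * (self.mu - self.state)
                       + self.sigma * np.random.randn(*self.state.shape))
        return self.state

def soft_update(target, source, tau=0.005):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)

online, target = torch.nn.Linear(4, 2), torch.nn.Linear(4, 2)
target.load_state_dict(online.state_dict())   # hard copy once, as in the snippet
soft_update(target, online)                   # then soft updates during training
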
Example #13
File: td3.py  Project: amackeith/dl
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=int(1e6),
                 frame_stack=1,
                 learning_starts=10000,
                 update_period=1,
                 batch_size=256,
                 lr=3e-4,
                 policy_update_period=2,
                 target_smoothing_coef=0.005,
                 reward_scale=1,
                 gamma=0.99,
                 exploration_noise=0.1,
                 policy_noise=0.2,
                 policy_noise_clip=0.5,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                policy_update_period % self.update_period)
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.exploration_noise = exploration_noise
        self.policy_noise = policy_noise
        self.policy_noise_clip = policy_noise_clip
        self.log_period = log_period

        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

        self.policy_fn = policy_fn
        self.qf_fn = qf_fn
        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.target_pi = policy_fn(eval_env)
        self.target_qf1 = qf_fn(eval_env)
        self.target_qf2 = qf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.target_pi.to(self.device)
        self.target_qf1.to(self.device)
        self.target_qf2.to(self.device)

        self.optimizer = optimizer
        self.lr = lr
        self.opt_pi = optimizer(self.pi.parameters(), lr=lr)
        self.opt_qf = optimizer(list(self.qf1.parameters()) +
                                list(self.qf2.parameters()),
                                lr=lr)

        self.target_pi.load_state_dict(self.pi.state_dict())
        self.target_qf1.load_state_dict(self.qf1.state_dict())
        self.target_qf2.load_state_dict(self.qf2.state_dict())

        self._actor = TD3Actor(self.pi, self.env.action_space,
                               exploration_noise)
        self.buffer = ReplayBuffer(buffer_size, frame_stack)
        self.data_manager = ReplayBufferDataManager(self.buffer, self.env,
                                                    self._actor, self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.qf_criterion = torch.nn.MSELoss()
        if self.env.action_space.__class__.__name__ == 'Discrete':
            raise ValueError("Action space must be continuous!")

        self.low = torch.from_numpy(self.env.action_space.low).to(self.device)
        self.high = torch.from_numpy(self.env.action_space.high).to(
            self.device)

        self.t = 0
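
`policy_noise`, `policy_noise_clip`, and the stored action bounds `self.low`/`self.high` correspond to TD3's target policy smoothing: clipped Gaussian noise is added to the target action before evaluating the twin target critics and taking their minimum. A sketch of that target computation on dummy tensors (standard TD3 math, not lifted from the project):

import torch

def td3_target(next_action, target_q1, target_q2, reward, done, low, high,
               gamma=0.99, policy_noise=0.2, policy_noise_clip=0.5):
    """Clipped-noise target smoothing plus clipped double-Q backup."""
    noise = (torch.randn_like(next_action) * policy_noise).clamp(
        -policy_noise_clip, policy_noise_clip)
    smoothed = (next_action + noise).clamp(low, high)    # keep the action in bounds
    q_next = torch.min(target_q1(smoothed), target_q2(smoothed))
    return reward + gamma * (1.0 - done) * q_next

# Dummy twin critics and batch, just to exercise the function.
q1 = lambda a: a.sum(dim=-1, keepdim=True)
q2 = lambda a: 0.5 * a.sum(dim=-1, keepdim=True)
target = td3_target(torch.zeros(4, 2), q1, q2,
                    reward=torch.zeros(4, 1), done=torch.zeros(4, 1),
                    low=-1.0, high=1.0)
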
Example #14
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 value_fn,
                 rnd_net,
                 nenv=1,
                 opt_pi=torch.optim.Adam,
                 opt_vf=torch.optim.Adam,
                 opt_rnd=torch.optim.Adam,
                 batch_size=32,
                 rollout_length=128,
                 gamma_ext=0.999,
                 gamma_int=0.99,
                 lambda_=0.95,
                 ent_coef=0.01,
                 rnd_coef=0.5,
                 rnd_subsample_rate=4,
                 norm_advantages=False,
                 epochs_pi=10,
                 epochs_vf=10,
                 max_grad_norm=None,
                 kl_target=0.01,
                 alpha=1.5,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 gpu=True):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.ent_coef = ent_coef
        self.rnd_coef = rnd_coef
        self.rnd_subsample_rate = rnd_subsample_rate
        self.rnd_update_count = 0
        self.epochs_pi = epochs_pi
        self.epochs_vf = epochs_vf
        self.max_grad_norm = max_grad_norm
        self.norm_advantages = norm_advantages
        self.kl_target = kl_target
        self.initial_kl_weight = 0.2
        self.kl_weight = self.initial_kl_weight
        self.alpha = alpha
        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        self.rnd = RND(rnd_net, opt_rnd, gamma_int,
                       self.env.observation_space.shape, self.device)
        self.env = RNDVecEnv(self.env, self.rnd)

        self.pi = policy_fn(self.env).to(self.device)
        self.vf = value_fn(self.env).to(self.device)
        self.opt_pi = opt_pi(self.pi.parameters())
        self.opt_vf = opt_vf(self.vf.parameters())

        self.gamma = torch.Tensor([gamma_ext, gamma_int]).to(self.device)
        self.data_manager = RolloutDataManager(
            self.env,
            PPOActor(self.pi, self.vf),
            self.device,
            batch_size=batch_size,
            rollout_length=rollout_length,
            gamma=self.gamma,
            lambda_=lambda_,
            norm_advantages=False)

        self.mse = nn.MSELoss()

        self.t = 0
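
The `RND` module pairs a fixed, randomly initialized target network with a trained predictor; the intrinsic reward is the predictor's error on each observation, which decays for states that are visited often. A compact sketch of that idea (network sizes and optimizer are illustrative):

import torch
import torch.nn as nn

obs_dim, feat_dim = 8, 16
target = nn.Linear(obs_dim, feat_dim)          # frozen random features
predictor = nn.Linear(obs_dim, feat_dim)       # trained to match the target
for p in target.parameters():
    p.requires_grad_(False)
opt = torch.optim.Adam(predictor.parameters(), lr=1e-4)

obs = torch.randn(32, obs_dim)
error = (predictor(obs) - target(obs)).pow(2).mean(dim=-1)   # per-observation error
intrinsic_reward = error.detach()              # novelty signal handed to the agent

opt.zero_grad()
error.mean().backward()                        # train the predictor on visited states
opt.step()
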
Example #15
File: sac.py  Project: takuma-ynd/dl
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 qf_fn,
                 vf_fn,
                 nenv=1,
                 optimizer=torch.optim.Adam,
                 buffer_size=10000,
                 frame_stack=1,
                 learning_starts=1000,
                 update_period=1,
                 batch_size=256,
                 policy_lr=1e-3,
                 qf_lr=1e-3,
                 vf_lr=1e-3,
                 policy_mean_reg_weight=1e-3,
                 gamma=0.99,
                 target_update_period=1,
                 policy_update_period=1,
                 target_smoothing_coef=0.005,
                 automatic_entropy_tuning=True,
                 reparameterization_trick=True,
                 target_entropy=None,
                 reward_scale=1,
                 gpu=True,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 log_period=1000):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.frame_stack = frame_stack
        self.learning_starts = learning_starts
        self.update_period = update_period
        self.batch_size = batch_size
        if target_update_period < self.update_period:
            self.target_update_period = self.update_period
        else:
            self.target_update_period = target_update_period - (
                                target_update_period % self.update_period)
        if policy_update_period < self.update_period:
            self.policy_update_period = self.update_period
        else:
            self.policy_update_period = policy_update_period - (
                                policy_update_period % self.update_period)
        self.rsample = reparameterization_trick
        self.reward_scale = reward_scale
        self.target_smoothing_coef = target_smoothing_coef
        self.log_period = log_period

        self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available()
                                   else 'cpu')

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        eval_env = VecFrameStack(self.env, self.frame_stack)
        self.pi = policy_fn(eval_env)
        self.qf1 = qf_fn(eval_env)
        self.qf2 = qf_fn(eval_env)
        self.vf = vf_fn(eval_env)
        self.target_vf = vf_fn(eval_env)

        self.pi.to(self.device)
        self.qf1.to(self.device)
        self.qf2.to(self.device)
        self.vf.to(self.device)
        self.target_vf.to(self.device)

        self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
        self.opt_qf1 = optimizer(self.qf1.parameters(), lr=qf_lr)
        self.opt_qf2 = optimizer(self.qf2.parameters(), lr=qf_lr)
        self.opt_vf = optimizer(self.vf.parameters(), lr=vf_lr)
        self.policy_mean_reg_weight = policy_mean_reg_weight

        self.target_vf.load_state_dict(self.vf.state_dict())

        self.buffer = ReplayBuffer(buffer_size, frame_stack)
        self.data_manager = ReplayBufferDataManager(self.buffer,
                                                    self.env,
                                                    SACActor(self.pi),
                                                    self.device,
                                                    self.learning_starts,
                                                    self.update_period)

        self.discrete = self.env.action_space.__class__.__name__ == 'Discrete'
        self.automatic_entropy_tuning = automatic_entropy_tuning
        if self.automatic_entropy_tuning:
            if target_entropy:
                self.target_entropy = target_entropy
            else:
                # heuristic value from Tuomas
                if self.discrete:
                    self.target_entropy = np.log(1.5)
                else:
                    self.target_entropy = -np.prod(
                        self.env.action_space.shape).item()
            self.log_alpha = torch.zeros(1, requires_grad=True,
                                         device=self.device)
            self.opt_alpha = optimizer([self.log_alpha], lr=policy_lr)
        else:
            self.target_entropy = None
            self.log_alpha = None
            self.opt_alpha = None

        self.qf_criterion = torch.nn.MSELoss()
        self.vf_criterion = torch.nn.MSELoss()

        self.t = 0
Example #16
    def __init__(self,
                 logdir,
                 env_fn,
                 policy_fn,
                 value_fn,
                 rnd_net,
                 ide_embedding_net,
                 ide_prediction_net,
                 ide_loss,
                 nenv=1,
                 opt_pi=torch.optim.Adam,
                 opt_vf=torch.optim.Adam,
                 opt_rnd=torch.optim.Adam,
                 opt_ide=torch.optim.Adam,
                 batch_size=32,
                 rollout_length=128,
                 gamma_ext=0.999,
                 gamma_int=0.99,
                 lambda_=0.95,
                 ent_coef=0.01,
                 ngu_coef=0.5,
                 ngu_buffer_capacity=1024,
                 ngu_subsample_freq=32,
                 ngu_updates=4,
                 ngu_batch_size=64,
                 policy_training_starts=500000,
                 buffer_size=100000,
                 norm_advantages=False,
                 epochs_pi=10,
                 epochs_vf=10,
                 max_grad_norm=None,
                 kl_target=0.01,
                 alpha=1.5,
                 eval_num_episodes=1,
                 record_num_episodes=1,
                 gpu=True):
        """Init."""
        self.logdir = logdir
        self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
        self.env_fn = env_fn
        self.nenv = nenv
        self.eval_num_episodes = eval_num_episodes
        self.record_num_episodes = record_num_episodes
        self.ent_coef = ent_coef
        self.ngu_coef = ngu_coef
        self.ngu_updates = ngu_updates
        self.ngu_batch_size = ngu_batch_size
        self.ngu_subsample_freq = ngu_subsample_freq
        self.epochs_pi = epochs_pi
        self.epochs_vf = epochs_vf
        self.max_grad_norm = max_grad_norm
        self.norm_advantages = norm_advantages
        self.kl_target = kl_target
        self.initial_kl_weight = 0.2
        self.kl_weight = self.initial_kl_weight
        self.policy_training_starts = policy_training_starts
        self.alpha = alpha
        self.device = torch.device(
            'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
        # self.ngu_coefs = (1 + np.tanh(np.arange(-4, 4, 8/nenv))) / 2. * ngu_coef
        # self.ngu_coefs = torch.from_numpy(self.ngu_coefs).to(self.device)

        self.env = VecEpisodeLogger(env_fn(nenv=nenv))
        self.rnd = RND(rnd_net, opt_rnd, gamma_int,
                       self.env.observation_space.shape, self.device)
        self.ide = InverseDynamicsEmbedding(self.env, ide_embedding_net,
                                            ide_prediction_net, ide_loss,
                                            opt_ide, self.device)
        self.ngu = NGU(self.rnd,
                       self.ide,
                       ngu_buffer_capacity,
                       self.device,
                       gamma=gamma_int)
        self.env = VecActionRewardInObWrapper(NGUVecEnv(self.env, self.ngu),
                                              reward_shape=(2, ))

        self.pi = policy_fn(self.env).to(self.device)
        self.vf = value_fn(self.env).to(self.device)
        self.opt_pi = opt_pi(self.pi.parameters())
        self.opt_vf = opt_vf(self.vf.parameters())

        self.gamma = torch.Tensor([gamma_ext, gamma_int]).to(self.device)
        self.data_manager = RolloutDataManager(self.env,
                                               PPOActor(self.pi, self.vf),
                                               self.device,
                                               batch_size=batch_size,
                                               rollout_length=rollout_length,
                                               gamma=self.gamma,
                                               lambda_=lambda_,
                                               norm_advantages=False)
        self.buffer_size = buffer_size
        self.buffer = ReplayBuffer(buffer_size, 1)

        self.mse = nn.MSELoss()

        self.t = 0
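
`gamma` here is a two-element tensor and the wrapper reports `reward_shape=(2,)`, i.e. extrinsic and intrinsic rewards are kept as separate channels, each discounted with its own gamma. A sketch of per-channel discounted returns for such a reward array (a generic calculation, not the project's RolloutDataManager):

import torch

def discounted_returns(rewards, gammas):
    """rewards: (T, channels); gammas: (channels,). Per-channel discounted returns."""
    returns = torch.zeros_like(rewards)
    running = torch.zeros_like(gammas)
    for t in reversed(range(rewards.shape[0])):
        running = rewards[t] + gammas * running
        returns[t] = running
    return returns

rewards = torch.randn(5, 2)                       # [extrinsic, intrinsic] per step
gammas = torch.tensor([0.999, 0.99])              # gamma_ext, gamma_int as above
print(discounted_returns(rewards, gammas).shape)  # torch.Size([5, 2])
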