def __init__(self, logdir, model, opt, datafile, batch_size, num_workers,
             gpu=True):
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.data = DemonstrationData(datafile)
    self.sampler = StatefulSampler(self.data, shuffle=True)
    self.dtrain = DataLoader(self.data, sampler=self.sampler,
                             batch_size=batch_size, num_workers=num_workers)
    self._diter = None
    self.t = 0
    self.epochs = 0
    self.batch_size = batch_size
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.model = model
    self.model.to(self.device)
    self.opt = opt(self.model.parameters())
def __init__(self, logdir, model, opt, batch_size, num_workers, gpu=True):
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    self.data_train = datasets.MNIST('./data_train', download=True,
                                     transform=self.transform)
    self.data_test = datasets.MNIST('./data_test', download=True,
                                    train=False, transform=self.transform)
    self.sampler = StatefulSampler(self.data_train, shuffle=True)
    self.dtrain = DataLoader(self.data_train, sampler=self.sampler,
                             batch_size=batch_size, num_workers=num_workers)
    self.dtest = DataLoader(self.data_test, batch_size=batch_size,
                            num_workers=num_workers)
    self._diter = None
    self.t = 0
    self.epochs = 0
    self.batch_size = batch_size
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.model = model
    self.model.to(self.device)
    self.opt = opt(self.model.parameters())
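# A note on the constants in transforms.Normalize((0.1307,), (0.3081,)):
# they are the mean and standard deviation of the MNIST training pixels.
# A minimal sketch of how one might recompute them (assumes torchvision is
# installed; the './data_train' root simply mirrors the constructor above):
import torch
from torchvision import datasets, transforms

def mnist_mean_std(root='./data_train'):
    data = datasets.MNIST(root, download=True, train=True,
                          transform=transforms.ToTensor())
    imgs = torch.stack([img for img, _ in data])  # (60000, 1, 28, 28)
    return imgs.mean().item(), imgs.std().item()  # ~0.1307, ~0.3081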
def __init__(self,
             logdir,
             game,
             policy,
             optimizer=torch.optim.Adam,
             n_simulations=100,
             buffer_size=200,
             batch_size=64,
             batches_per_game=1,
             gpu=True):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.game = game
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.n_sims = n_simulations
    self.batch_size = batch_size
    self.batches_per_game = batches_per_game
    self.pi = policy.to(self.device)
    self.opt = optimizer(self.pi.parameters(), lr=1e-2, weight_decay=1e-4)
    self.buffer = GameReplay(buffer_size)
    self.data_manager = SelfPlayManager(self.pi, self.game, self.buffer,
                                        self.device)
    self.mse = nn.MSELoss()
    self.t = 0
def __init__(self, env, logdir, device):
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    if not torch.cuda.is_available():
        device = 'cpu'
    self.device = device
    self.net = BCNet()
    self.net.to(device)
    self.net.load_state_dict(self.ckptr.load()['model'])
def __init__(self, env, logdir, device):
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    if not torch.cuda.is_available():
        device = 'cpu'
    self.pi = drone_ppo_policy_fn(env)
    self.pi.to(device)
    self.pi.load_state_dict(self.ckptr.load()['pi'])
    self.pi.eval()
    self.device = device
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             nenv=1,
             optimizer=torch.optim.Adam,
             batch_size=32,
             rollout_length=None,
             gamma=0.99,
             lambda_=0.95,
             norm_advantages=False,
             epochs_per_rollout=10,
             max_grad_norm=None,
             ent_coef=0.01,
             vf_coef=0.5,
             clip_param=0.2,
             eval_num_episodes=1,
             record_num_episodes=1,
             gpu=True):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.epochs_per_rollout = epochs_per_rollout
    self.max_grad_norm = max_grad_norm
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.clip_param = clip_param
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.env = VecEpisodeLogger(
        VecRewardNormWrapper(env_fn(nenv=nenv), gamma))
    self.pi = policy_fn(self.env).to(self.device)
    self.opt = optimizer(self.pi.parameters())
    self.data_manager = RolloutDataManager(self.env,
                                           PPOActor(self.pi),
                                           self.device,
                                           batch_size=batch_size,
                                           rollout_length=rollout_length,
                                           gamma=gamma,
                                           lambda_=lambda_,
                                           norm_advantages=norm_advantages)
    self.mse = nn.MSELoss(reduction='none')
    self.t = 0
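# `clip_param`, `vf_coef`, and `ent_coef` above parameterize the standard PPO
# clipped-surrogate objective; the update itself lives outside this
# constructor. A minimal sketch of how those coefficients usually combine
# (illustrative only, not this trainer's actual loss code):
import torch

def ppo_loss(logp, logp_old, advantages, value_loss, entropy,
             clip_param=0.2, vf_coef=0.5, ent_coef=0.01):
    ratio = torch.exp(logp - logp_old)
    clipped = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)
    pi_loss = -torch.min(ratio * advantages, clipped * advantages).mean()
    return pi_loss + vf_coef * value_loss - ent_coef * entropy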
def __init__(self, env, logdir, device, switch_prob=0.001):
    dirs = [x for x in os.listdir(logdir)
            if os.path.isdir(os.path.join(logdir, x, 'ckpts'))]
    self.ckptrs = [Checkpointer(os.path.join(logdir, x, 'ckpts'))
                   for x in dirs]
    if not torch.cuda.is_available():
        device = 'cpu'
    self.device = device
    self.nets = [BCNet() for _ in dirs]
    for net, ckptr in zip(self.nets, self.ckptrs):
        net.to(device)
        net.load_state_dict(ckptr.load()['model'])
    self.current_actor = np.random.choice(self.nets)
    self.switch_prob = switch_prob
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             nenv=1,
             optimizer=torch.optim.Adam,
             lambda_lr=1e-4,
             lambda_init=100.,
             lr_decay_rate=1. / 3.16227766017,
             lr_decay_freq=20000000,
             l2_reg=True,
             reward_threshold=-0.05,
             rollout_length=128,
             batch_size=32,
             gamma=0.99,
             lambda_=0.95,
             norm_advantages=False,
             epochs_per_rollout=10,
             max_grad_norm=None,
             ent_coef=0.01,
             vf_coef=0.5,
             clip_param=0.2,
             base_actor_cls=None,
             policy_training_start=10000,
             lambda_training_start=100000,
             eval_num_episodes=1,
             record_num_episodes=1,
             wrapper_fn=None,  # additional wrappers for the env
             gpu=True):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.epochs_per_rollout = epochs_per_rollout
    self.max_grad_norm = max_grad_norm
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.clip_param = clip_param
    self.base_actor_cls = base_actor_cls
    self.policy_training_start = policy_training_start
    self.lambda_training_start = lambda_training_start
    self.lambda_lr = lambda_lr
    self.lr_decay_rate = lr_decay_rate
    self.lr_decay_freq = lr_decay_freq
    self.l2_reg = l2_reg
    self.reward_threshold = reward_threshold
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

    # Residual RL: the learned policy acts on top of the provided base
    # actor, plus any user-supplied wrappers.
    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    self.env = ResidualWrapper(self.env, self.base_actor_cls(self.env))
    if wrapper_fn:
        self.env = wrapper_fn(self.env)

    self.pi = policy_fn(self.env).to(self.device)
    self.opt = optimizer(self.pi.parameters())
    self.pi_lr = self.opt.param_groups[0]['lr']

    # lambda_init is passed through log(exp(x) - 1), the inverse of
    # softplus, presumably so the effective lambda starts at lambda_init;
    # the transform is skipped for large values (softplus(x) ~= x) to
    # avoid overflowing exp().
    if lambda_init < 10:
        lambda_init = np.log(np.exp(lambda_init) - 1)
    self.log_lambda_ = nn.Parameter(
        torch.Tensor([lambda_init]).to(self.device))
    self.opt_l = optimizer([self.log_lambda_], lr=lambda_lr)

    self._actor = ResidualPPOActor(self.pi, policy_training_start)
    self.data_manager = RolloutDataManager(self.env,
                                           self._actor,
                                           self.device,
                                           rollout_length=rollout_length,
                                           batch_size=batch_size,
                                           gamma=gamma,
                                           lambda_=lambda_,
                                           norm_advantages=norm_advantages)
    self.mse = nn.MSELoss(reduction='none')
    self.huber = nn.SmoothL1Loss()
    self.t = 0
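# The `lambda_init < 10` branch above applies log(exp(x) - 1), the inverse of
# softplus, presumably so that softplus(log_lambda_) starts out equal to
# lambda_init; for large values softplus(x) ~= x, so the transform is skipped
# to avoid overflowing exp(). A quick numpy check of that identity
# (illustrative only):
import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

lambda_init = 3.0
raw = np.log(np.exp(lambda_init) - 1.0)  # inverse softplus
assert np.isclose(softplus(raw), lambda_init)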
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             qf_fn,
             nenv=1,
             optimizer=torch.optim.Adam,
             buffer_size=10000,
             frame_stack=1,
             learning_starts=1000,
             update_period=1,
             batch_size=256,
             policy_lr=1e-3,
             qf_lr=1e-3,
             gamma=0.99,
             target_update_period=1,
             policy_update_period=1,
             target_smoothing_coef=0.005,
             alpha=0.2,
             automatic_entropy_tuning=True,
             target_entropy=None,
             gpu=True,
             eval_num_episodes=1,
             record_num_episodes=1,
             log_period=1000):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.frame_stack = frame_stack
    self.learning_starts = learning_starts
    self.update_period = update_period
    self.batch_size = batch_size

    # Round the target/policy update periods down to multiples of
    # update_period (but never below it).
    if target_update_period < self.update_period:
        self.target_update_period = self.update_period
    else:
        self.target_update_period = target_update_period - (
            target_update_period % self.update_period)
    if policy_update_period < self.update_period:
        self.policy_update_period = self.update_period
    else:
        self.policy_update_period = policy_update_period - (
            policy_update_period % self.update_period)

    self.target_smoothing_coef = target_smoothing_coef
    self.log_period = log_period
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    eval_env = VecFrameStack(self.env, self.frame_stack)
    self.pi = policy_fn(eval_env)
    self.qf1 = qf_fn(eval_env)
    self.qf2 = qf_fn(eval_env)
    self.target_qf1 = qf_fn(eval_env)
    self.target_qf2 = qf_fn(eval_env)
    self.pi.to(self.device)
    self.qf1.to(self.device)
    self.qf2.to(self.device)
    self.target_qf1.to(self.device)
    self.target_qf2.to(self.device)

    self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
    self.opt_qf1 = optimizer(self.qf1.parameters(), lr=qf_lr)
    self.opt_qf2 = optimizer(self.qf2.parameters(), lr=qf_lr)

    # Target networks start as copies of the online Q-functions.
    self.target_qf1.load_state_dict(self.qf1.state_dict())
    self.target_qf2.load_state_dict(self.qf2.state_dict())

    self.buffer = BatchedReplayBuffer(
        *[ReplayBuffer(buffer_size, frame_stack) for _ in range(self.nenv)])
    self.data_manager = ReplayBufferDataManager(self.buffer,
                                                self.env,
                                                SACActor(self.pi),
                                                self.device,
                                                self.learning_starts,
                                                self.update_period)

    # Entropy temperature: alpha is learned in log space when automatic
    # tuning is enabled.
    self.alpha = alpha
    self.automatic_entropy_tuning = automatic_entropy_tuning
    if self.automatic_entropy_tuning:
        if target_entropy:
            self.target_entropy = target_entropy
        else:
            # Default: -1 nat per action dimension, summed over the
            # (possibly nested) action space.
            target_entropies = nest.map_structure(
                lambda space: -np.prod(space.shape).item(),
                misc.unpack_space(self.env.action_space))
            self.target_entropy = sum(nest.flatten(target_entropies))
        self.log_alpha = torch.tensor(np.log([self.alpha]),
                                      requires_grad=True,
                                      device=self.device,
                                      dtype=torch.float32)
        self.opt_alpha = optimizer([self.log_alpha], lr=policy_lr)
    else:
        self.target_entropy = None
        self.log_alpha = None
        self.opt_alpha = None

    self.mse_loss = torch.nn.MSELoss()
    self.t = 0
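# The automatic-entropy-tuning branch above falls back to the usual SAC
# heuristic of -1 nat per action dimension, summed over the (possibly nested)
# action space via the nest/misc helpers. For a flat Box space the
# computation reduces to the snippet below (gym is assumed to be available;
# the helpers themselves are not shown here):
import numpy as np
from gym import spaces

action_space = spaces.Box(low=-1.0, high=1.0, shape=(6,))
target_entropy = -np.prod(action_space.shape).item()  # -> -6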
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             value_fn,
             nenv=1,
             opt_pi=torch.optim.Adam,
             opt_vf=torch.optim.Adam,
             batch_size=32,
             rollout_length=None,
             gamma=0.99,
             lambda_=0.95,
             ent_coef=0.01,
             norm_advantages=False,
             epochs_pi=10,
             epochs_vf=10,
             max_grad_norm=None,
             kl_target=0.01,
             alpha=1.5,
             policy_training_start=10000,
             eval_num_episodes=10,
             record_num_episodes=0,
             gpu=True):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.ent_coef = ent_coef
    self.epochs_pi = epochs_pi
    self.epochs_vf = epochs_vf
    self.max_grad_norm = max_grad_norm
    self.kl_target = kl_target
    self.initial_kl_weight = 0.2
    self.kl_weight = self.initial_kl_weight
    self.alpha = alpha
    self.policy_training_start = policy_training_start
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.env = VecEpisodeLogger(
        VecRewardNormWrapper(env_fn(nenv=nenv), gamma))
    self.pi = policy_fn(self.env).to(self.device)
    self.vf = value_fn(self.env).to(self.device)
    self.opt_pi = opt_pi(self.pi.parameters())
    self.opt_vf = opt_vf(self.vf.parameters())
    self._actor = ResidualPPOActor(self.pi, self.vf, policy_training_start)
    self.data_manager = RolloutDataManager(self.env,
                                           self._actor,
                                           self.device,
                                           batch_size=batch_size,
                                           rollout_length=rollout_length,
                                           gamma=gamma,
                                           lambda_=lambda_,
                                           norm_advantages=norm_advantages)
    self.mse = nn.MSELoss()
    self.t = 0
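# `kl_target`, `kl_weight`, and `alpha` above suggest an adaptive KL-penalty
# coefficient in the style of the PPO paper; the exact update rule is not
# shown in this constructor. The paper's version, for reference (this is an
# assumption about how these attributes are used, not this trainer's code):
def adapt_kl_weight(kl_weight, kl, kl_target, alpha=1.5):
    if kl > alpha * kl_target:    # policy moved too far: penalize more
        return kl_weight * 2.0
    if kl < kl_target / alpha:    # policy barely moved: penalize less
        return kl_weight / 2.0
    return kl_weight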
def __init__(self,
             logdir,
             env_fn,
             qf_fn,
             nenv=1,
             optimizer=torch.optim.RMSprop,
             buffer_size=100000,
             frame_stack=1,
             learning_starts=10000,
             update_period=1,
             gamma=0.99,
             huber_loss=True,
             exploration_timesteps=1000000,
             final_eps=0.1,
             eval_eps=0.05,
             target_update_period=10000,
             batch_size=32,
             gpu=True,
             eval_num_episodes=1,
             record_num_episodes=1,
             log_period=10):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.gamma = gamma
    self.frame_stack = frame_stack
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.learning_starts = learning_starts
    self.update_period = update_period
    self.eval_eps = eval_eps
    self.target_update_period = target_update_period - (
        target_update_period % self.update_period)
    self.log_period = log_period
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    stacked_env = VecFrameStack(env_fn(nenv=nenv), self.frame_stack)
    self.qf = qf_fn(stacked_env).to(self.device)
    self.qf_targ = qf_fn(stacked_env).to(self.device)
    self.opt = optimizer(self.qf.parameters())
    if huber_loss:
        self.criterion = torch.nn.SmoothL1Loss(reduction='none')
    else:
        self.criterion = torch.nn.MSELoss(reduction='none')
    self.eps_schedule = LinearSchedule(exploration_timesteps, final_eps,
                                       1.0)
    self._actor = EpsilonGreedyActor(self.qf, self.eps_schedule,
                                     self.env.action_space)
    self.buffer = ReplayBuffer(self.buffer_size, self.frame_stack)
    self.data_manager = ReplayBufferDataManager(self.buffer,
                                                self.env,
                                                self._actor,
                                                self.device,
                                                self.learning_starts,
                                                self.update_period)
    self.t = 0
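# This and several of the other constructors here compute
# `target_update_period - (target_update_period % update_period)`, which
# rounds the target-network sync interval down to the nearest multiple of the
# training update period so that syncs always land on an optimization step.
# A tiny illustration of the arithmetic (values chosen for the example only):
update_period = 4
assert 10000 - (10000 % update_period) == 10000   # already a multiple
assert 10001 - (10001 % update_period) == 10000   # rounded down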
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             qf_fn,
             nenv=1,
             optimizer=torch.optim.Adam,
             buffer_size=10000,
             frame_stack=1,
             learning_starts=1000,
             update_period=1,
             batch_size=256,
             policy_lr=1e-4,
             qf_lr=1e-3,
             qf_weight_decay=0.01,
             gamma=0.99,
             noise_theta=0.15,
             noise_sigma=0.2,
             noise_sigma_final=0.01,
             noise_decay_period=10000,
             target_update_period=1,
             target_smoothing_coef=0.005,
             reward_scale=1,
             gpu=True,
             eval_num_episodes=1,
             record_num_episodes=1,
             log_period=1000):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.frame_stack = frame_stack
    self.learning_starts = learning_starts
    self.update_period = update_period
    self.batch_size = batch_size

    # Round the target update period down to a multiple of update_period
    # (but never below it).
    if target_update_period < self.update_period:
        self.target_update_period = self.update_period
    else:
        self.target_update_period = target_update_period - (
            target_update_period % self.update_period)

    self.reward_scale = reward_scale
    self.target_smoothing_coef = target_smoothing_coef
    self.log_period = log_period
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.t = 0

    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    self.policy_fn = policy_fn
    self.qf_fn = qf_fn
    eval_env = VecFrameStack(self.env, self.frame_stack)
    self.pi = policy_fn(eval_env)
    self.qf = qf_fn(eval_env)
    self.target_pi = policy_fn(eval_env)
    self.target_qf = qf_fn(eval_env)
    self.pi.to(self.device)
    self.qf.to(self.device)
    self.target_pi.to(self.device)
    self.target_qf.to(self.device)

    self.optimizer = optimizer
    self.policy_lr = policy_lr
    self.qf_lr = qf_lr
    self.qf_weight_decay = qf_weight_decay
    self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
    self.opt_qf = optimizer(self.qf.parameters(), lr=qf_lr,
                            weight_decay=qf_weight_decay)

    # Target networks start as copies of the online networks.
    self.target_pi.load_state_dict(self.pi.state_dict())
    self.target_qf.load_state_dict(self.qf.state_dict())

    # Exploration noise: theta plus a linearly decayed sigma (an
    # Ornstein-Uhlenbeck-style process, as the parameter names suggest).
    self.noise_schedule = LinearSchedule(noise_decay_period,
                                         noise_sigma_final, noise_sigma)
    self._actor = DDPGActor(self.pi, self.env.action_space, noise_theta,
                            self.noise_schedule.value(self.t))
    self.buffer = ReplayBuffer(buffer_size, frame_stack)
    self.data_manager = ReplayBufferDataManager(self.buffer,
                                                self.env,
                                                self._actor,
                                                self.device,
                                                self.learning_starts,
                                                self.update_period)
    self.qf_criterion = torch.nn.MSELoss()

    # DDPG requires a continuous action space.
    if self.env.action_space.__class__.__name__ == 'Discrete':
        raise ValueError("Action space must be continuous!")
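# `LinearSchedule` is used above as LinearSchedule(decay_period, final, initial)
# together with a .value(t) accessor, which matches the common baselines-style
# schedule that interpolates linearly from `initial` to `final` over
# `decay_period` steps and then holds the final value. A self-contained sketch
# of that behaviour (an assumption about the class, not its implementation):
class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, final_p, initial_p):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + frac * (self.final_p - self.initial_p)

noise_sigma = LinearScheduleSketch(10000, 0.01, 0.2)
assert abs(noise_sigma.value(0) - 0.2) < 1e-8
assert abs(noise_sigma.value(10000) - 0.01) < 1e-8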
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             qf_fn,
             nenv=1,
             optimizer=torch.optim.Adam,
             buffer_size=int(1e6),
             frame_stack=1,
             learning_starts=10000,
             update_period=1,
             batch_size=256,
             lr=3e-4,
             policy_update_period=2,
             target_smoothing_coef=0.005,
             reward_scale=1,
             gamma=0.99,
             exploration_noise=0.1,
             policy_noise=0.2,
             policy_noise_clip=0.5,
             gpu=True,
             eval_num_episodes=1,
             record_num_episodes=1,
             log_period=1000):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.frame_stack = frame_stack
    self.learning_starts = learning_starts
    self.update_period = update_period

    # Delayed policy updates: round the period down to a multiple of
    # update_period (but never below it).
    if policy_update_period < self.update_period:
        self.policy_update_period = self.update_period
    else:
        self.policy_update_period = policy_update_period - (
            policy_update_period % self.update_period)

    self.reward_scale = reward_scale
    self.target_smoothing_coef = target_smoothing_coef
    self.exploration_noise = exploration_noise
    self.policy_noise = policy_noise
    self.policy_noise_clip = policy_noise_clip
    self.log_period = log_period
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

    self.policy_fn = policy_fn
    self.qf_fn = qf_fn
    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    eval_env = VecFrameStack(self.env, self.frame_stack)
    self.pi = policy_fn(eval_env)
    self.qf1 = qf_fn(eval_env)
    self.qf2 = qf_fn(eval_env)
    self.target_pi = policy_fn(eval_env)
    self.target_qf1 = qf_fn(eval_env)
    self.target_qf2 = qf_fn(eval_env)
    self.pi.to(self.device)
    self.qf1.to(self.device)
    self.qf2.to(self.device)
    self.target_pi.to(self.device)
    self.target_qf1.to(self.device)
    self.target_qf2.to(self.device)

    # The twin critics share a single optimizer.
    self.optimizer = optimizer
    self.lr = lr
    self.opt_pi = optimizer(self.pi.parameters(), lr=lr)
    self.opt_qf = optimizer(list(self.qf1.parameters())
                            + list(self.qf2.parameters()), lr=lr)

    # Target networks start as copies of the online networks.
    self.target_pi.load_state_dict(self.pi.state_dict())
    self.target_qf1.load_state_dict(self.qf1.state_dict())
    self.target_qf2.load_state_dict(self.qf2.state_dict())

    self._actor = TD3Actor(self.pi, self.env.action_space,
                           exploration_noise)
    self.buffer = ReplayBuffer(buffer_size, frame_stack)
    self.data_manager = ReplayBufferDataManager(self.buffer,
                                                self.env,
                                                self._actor,
                                                self.device,
                                                self.learning_starts,
                                                self.update_period)
    self.qf_criterion = torch.nn.MSELoss()

    # TD3 requires a continuous action space.
    if self.env.action_space.__class__.__name__ == 'Discrete':
        raise ValueError("Action space must be continuous!")

    # Action-space bounds as tensors on the training device.
    self.low = torch.from_numpy(self.env.action_space.low).to(self.device)
    self.high = torch.from_numpy(self.env.action_space.high).to(
        self.device)
    self.t = 0
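# `policy_noise`, `policy_noise_clip`, `self.low`, and `self.high` above are
# the ingredients of TD3's target policy smoothing; the update itself is not
# part of this constructor. A minimal sketch of how such values are typically
# combined (illustrative tensors and function, not this trainer's API):
import torch

def smoothed_target_action(target_action, policy_noise=0.2,
                           policy_noise_clip=0.5, low=-1.0, high=1.0):
    noise = (torch.randn_like(target_action) * policy_noise
             ).clamp(-policy_noise_clip, policy_noise_clip)
    return (target_action + noise).clamp(low, high)

batch_of_actions = torch.zeros(32, 6)
smoothed = smoothed_target_action(batch_of_actions)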
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             value_fn,
             rnd_net,
             nenv=1,
             opt_pi=torch.optim.Adam,
             opt_vf=torch.optim.Adam,
             opt_rnd=torch.optim.Adam,
             batch_size=32,
             rollout_length=128,
             gamma_ext=0.999,
             gamma_int=0.99,
             lambda_=0.95,
             ent_coef=0.01,
             rnd_coef=0.5,
             rnd_subsample_rate=4,
             norm_advantages=False,
             epochs_pi=10,
             epochs_vf=10,
             max_grad_norm=None,
             kl_target=0.01,
             alpha=1.5,
             eval_num_episodes=1,
             record_num_episodes=1,
             gpu=True):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.ent_coef = ent_coef
    self.rnd_coef = rnd_coef
    self.rnd_subsample_rate = rnd_subsample_rate
    self.rnd_update_count = 0
    self.epochs_pi = epochs_pi
    self.epochs_vf = epochs_vf
    self.max_grad_norm = max_grad_norm
    self.norm_advantages = norm_advantages
    self.kl_target = kl_target
    self.initial_kl_weight = 0.2
    self.kl_weight = self.initial_kl_weight
    self.alpha = alpha
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')
    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    self.rnd = RND(rnd_net, opt_rnd, gamma_int,
                   self.env.observation_space.shape, self.device)
    self.env = RNDVecEnv(self.env, self.rnd)
    self.pi = policy_fn(self.env).to(self.device)
    self.vf = value_fn(self.env).to(self.device)
    self.opt_pi = opt_pi(self.pi.parameters())
    self.opt_vf = opt_vf(self.vf.parameters())
    self.gamma = torch.Tensor([gamma_ext, gamma_int]).to(self.device)
    self.data_manager = RolloutDataManager(self.env,
                                           PPOActor(self.pi, self.vf),
                                           self.device,
                                           batch_size=batch_size,
                                           rollout_length=rollout_length,
                                           gamma=self.gamma,
                                           lambda_=lambda_,
                                           norm_advantages=False)
    self.mse = nn.MSELoss()
    self.t = 0
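# `RND(rnd_net, opt_rnd, gamma_int, obs_shape, device)` above provides the
# intrinsic reward; its internals are not shown here. Under the standard
# Random Network Distillation formulation (Burda et al.), a fixed, randomly
# initialized target network is distilled by a trained predictor, and the
# prediction error is the intrinsic reward. A minimal sketch of that idea
# (hypothetical module, not this trainer's RND class):
import torch
import torch.nn as nn

class RNDSketch(nn.Module):
    def __init__(self, ob_dim, feat_dim=64):
        super().__init__()
        self.target = nn.Sequential(nn.Linear(ob_dim, 128), nn.ReLU(),
                                    nn.Linear(128, feat_dim))
        self.predictor = nn.Sequential(nn.Linear(ob_dim, 128), nn.ReLU(),
                                       nn.Linear(128, feat_dim))
        for p in self.target.parameters():  # the target is never trained
            p.requires_grad_(False)

    def intrinsic_reward(self, obs):
        # Squared prediction error per observation; also the predictor loss.
        return (self.predictor(obs) - self.target(obs)).pow(2).mean(dim=-1)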
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             qf_fn,
             vf_fn,
             nenv=1,
             optimizer=torch.optim.Adam,
             buffer_size=10000,
             frame_stack=1,
             learning_starts=1000,
             update_period=1,
             batch_size=256,
             policy_lr=1e-3,
             qf_lr=1e-3,
             vf_lr=1e-3,
             policy_mean_reg_weight=1e-3,
             gamma=0.99,
             target_update_period=1,
             policy_update_period=1,
             target_smoothing_coef=0.005,
             automatic_entropy_tuning=True,
             reparameterization_trick=True,
             target_entropy=None,
             reward_scale=1,
             gpu=True,
             eval_num_episodes=1,
             record_num_episodes=1,
             log_period=1000):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.frame_stack = frame_stack
    self.learning_starts = learning_starts
    self.update_period = update_period
    self.batch_size = batch_size

    # Round the target/policy update periods down to multiples of
    # update_period (but never below it).
    if target_update_period < self.update_period:
        self.target_update_period = self.update_period
    else:
        self.target_update_period = target_update_period - (
            target_update_period % self.update_period)
    if policy_update_period < self.update_period:
        self.policy_update_period = self.update_period
    else:
        self.policy_update_period = policy_update_period - (
            policy_update_period % self.update_period)

    self.rsample = reparameterization_trick
    self.reward_scale = reward_scale
    self.target_smoothing_coef = target_smoothing_coef
    self.log_period = log_period
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    eval_env = VecFrameStack(self.env, self.frame_stack)
    self.pi = policy_fn(eval_env)
    self.qf1 = qf_fn(eval_env)
    self.qf2 = qf_fn(eval_env)
    self.vf = vf_fn(eval_env)
    self.target_vf = vf_fn(eval_env)
    self.pi.to(self.device)
    self.qf1.to(self.device)
    self.qf2.to(self.device)
    self.vf.to(self.device)
    self.target_vf.to(self.device)

    self.opt_pi = optimizer(self.pi.parameters(), lr=policy_lr)
    self.opt_qf1 = optimizer(self.qf1.parameters(), lr=qf_lr)
    self.opt_qf2 = optimizer(self.qf2.parameters(), lr=qf_lr)
    self.opt_vf = optimizer(self.vf.parameters(), lr=vf_lr)
    self.policy_mean_reg_weight = policy_mean_reg_weight

    # The target value network starts as a copy of the online one.
    self.target_vf.load_state_dict(self.vf.state_dict())

    self.buffer = ReplayBuffer(buffer_size, frame_stack)
    self.data_manager = ReplayBufferDataManager(self.buffer,
                                                self.env,
                                                SACActor(self.pi),
                                                self.device,
                                                self.learning_starts,
                                                self.update_period)

    self.discrete = self.env.action_space.__class__.__name__ == 'Discrete'
    self.automatic_entropy_tuning = automatic_entropy_tuning
    if self.automatic_entropy_tuning:
        if target_entropy:
            self.target_entropy = target_entropy
        else:
            # heuristic value from Tuomas
            if self.discrete:
                self.target_entropy = np.log(1.5)
            else:
                self.target_entropy = -np.prod(
                    self.env.action_space.shape).item()
        self.log_alpha = torch.zeros(1, requires_grad=True,
                                     device=self.device)
        self.opt_alpha = optimizer([self.log_alpha], lr=policy_lr)
    else:
        self.target_entropy = None
        self.log_alpha = None
        self.opt_alpha = None

    self.qf_criterion = torch.nn.MSELoss()
    self.vf_criterion = torch.nn.MSELoss()
    self.t = 0
def __init__(self,
             logdir,
             env_fn,
             policy_fn,
             value_fn,
             rnd_net,
             ide_embedding_net,
             ide_prediction_net,
             ide_loss,
             nenv=1,
             opt_pi=torch.optim.Adam,
             opt_vf=torch.optim.Adam,
             opt_rnd=torch.optim.Adam,
             opt_ide=torch.optim.Adam,
             batch_size=32,
             rollout_length=128,
             gamma_ext=0.999,
             gamma_int=0.99,
             lambda_=0.95,
             ent_coef=0.01,
             ngu_coef=0.5,
             ngu_buffer_capacity=1024,
             ngu_subsample_freq=32,
             ngu_updates=4,
             ngu_batch_size=64,
             policy_training_starts=500000,
             buffer_size=100000,
             norm_advantages=False,
             epochs_pi=10,
             epochs_vf=10,
             max_grad_norm=None,
             kl_target=0.01,
             alpha=1.5,
             eval_num_episodes=1,
             record_num_episodes=1,
             gpu=True):
    """Init."""
    self.logdir = logdir
    self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts'))
    self.env_fn = env_fn
    self.nenv = nenv
    self.eval_num_episodes = eval_num_episodes
    self.record_num_episodes = record_num_episodes
    self.ent_coef = ent_coef
    self.ngu_coef = ngu_coef
    self.ngu_updates = ngu_updates
    self.ngu_batch_size = ngu_batch_size
    self.ngu_subsample_freq = ngu_subsample_freq
    self.epochs_pi = epochs_pi
    self.epochs_vf = epochs_vf
    self.max_grad_norm = max_grad_norm
    self.norm_advantages = norm_advantages
    self.kl_target = kl_target
    self.initial_kl_weight = 0.2
    self.kl_weight = self.initial_kl_weight
    self.policy_training_starts = policy_training_starts
    self.alpha = alpha
    self.device = torch.device(
        'cuda:0' if gpu and torch.cuda.is_available() else 'cpu')

    # self.ngu_coefs = (1 + np.tanh(np.arange(-4, 4, 8/nenv))) / 2. * ngu_coef
    # self.ngu_coefs = torch.from_numpy(self.ngu_coefs).to(self.device)

    # Intrinsic motivation: RND and an inverse-dynamics embedding,
    # combined by NGU.
    self.env = VecEpisodeLogger(env_fn(nenv=nenv))
    self.rnd = RND(rnd_net, opt_rnd, gamma_int,
                   self.env.observation_space.shape, self.device)
    self.ide = InverseDynamicsEmbedding(self.env, ide_embedding_net,
                                        ide_prediction_net, ide_loss,
                                        opt_ide, self.device)
    self.ngu = NGU(self.rnd, self.ide, ngu_buffer_capacity, self.device,
                   gamma=gamma_int)
    # As the wrapper names suggest, NGU rewards are injected and the action
    # and two-component reward are appended to observations.
    self.env = VecActionRewardInObWrapper(NGUVecEnv(self.env, self.ngu),
                                          reward_shape=(2,))

    self.pi = policy_fn(self.env).to(self.device)
    self.vf = value_fn(self.env).to(self.device)
    self.opt_pi = opt_pi(self.pi.parameters())
    self.opt_vf = opt_vf(self.vf.parameters())

    # Separate discount factors for the extrinsic and intrinsic returns.
    self.gamma = torch.Tensor([gamma_ext, gamma_int]).to(self.device)
    self.data_manager = RolloutDataManager(self.env,
                                           PPOActor(self.pi, self.vf),
                                           self.device,
                                           batch_size=batch_size,
                                           rollout_length=rollout_length,
                                           gamma=self.gamma,
                                           lambda_=lambda_,
                                           norm_advantages=False)
    self.buffer_size = buffer_size
    self.buffer = ReplayBuffer(buffer_size, 1)
    self.mse = nn.MSELoss()
    self.t = 0