def __init__(self, env, logger, device="cpu"):
    self.out_dim = out_dim = env.action_space.n
    self.device = device
    self.logger = logger
    self.hyperparameters = hyperparameters = default_hyperparameters
    self.model = hyperparameters['model_constructor'](env).to(device)
    writer = DummyWriter()
    self.epsilon = LinearScheduler(
        self.hyperparameters['initial_exploration'],
        self.hyperparameters['final_exploration'],
        self.hyperparameters['replay_start_size'],
        self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
        name="exploration",
        writer=writer
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps / self.hyperparameters['n_envs']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        writer=writer
    )
    policy = ParallelGreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"],
            name="exploration",
            writer=writer
        )
    )
    return VQN(q, policy, discount_factor=self.hyperparameters['discount_factor'])
def _vqn(envs, writer=DummyWriter()):
    action_repeat = 4
    final_exploration_timestep = final_exploration_frame / action_repeat
    env = envs[0]
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        writer=writer
    )
    policy = ParallelGreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_timestep,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, discount_factor=discount_factor),
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QNetwork(
        self.model,
        optimizer,
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=self.hyperparameters['discount_factor'],
        minibatch_size=self.hyperparameters['minibatch_size'],
        replay_start_size=self.hyperparameters['replay_start_size'],
        update_frequency=self.hyperparameters['update_frequency'],
    )
def _ddqn(env, writer=DummyWriter()):
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    return DDQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _dqn(env, writer=DummyWriter()):
    model = fc_relu_q(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=mse_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DQN(
        q,
        policy,
        replay_buffer,
        discount_factor=discount_factor,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        minibatch_size=minibatch_size
    )
def _c51(env, writer=DummyWriter()):
    model = fc_relu_dist_q(env, atoms=atoms).to(device)
    optimizer = Adam(model.parameters(), lr=lr)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return C51(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer,
        ),
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer
    )
def _vqn(envs, writer=DummyWriter()):
    env = envs[0]
    model = nature_ddqn(env).to(device)
    optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, gamma=discount_factor),
    )
def agent_constructor(writer):
    policy = GreedyPolicy(
        q,
        n_actions,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            loss=smooth_l1_loss,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'])
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return C51(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
            name="epsilon",
            writer=writer,
        ),
        discount_factor=self.hyperparameters["discount_factor"],
        minibatch_size=self.hyperparameters["minibatch_size"],
        replay_start_size=self.hyperparameters["replay_start_size"],
        update_frequency=self.hyperparameters["update_frequency"],
        writer=writer
    )
def _dqn(env, writer=DummyWriter()):
    _model = nature_dqn(env).to(device)
    _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
    )
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    # Update epochs * minibatches times per rollout,
    # but we only update once per n_steps,
    # with n_envs environments and 4 frames per step
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )
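# Hedged arithmetic sketch (the numbers below are illustrative, not values taken from the
# presets in this file): PPO performs epochs * minibatches optimizer steps per rollout, while
# one rollout consumes n_steps * n_envs agent steps at 4 frames per step. That ratio is how
# the final_anneal_step horizon used for CosineAnnealingLR and the clip schedule is derived.
last_frame = 40_000_000
epochs, minibatches, n_steps, n_envs, frames_per_step = 4, 4, 128, 8, 4
final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * frames_per_step)
print(final_anneal_step)  # 156250.0 optimizer steps available for annealing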
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (
        self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    feature_optimizer = Adam(
        self.feature_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(),
        lr=self.hyperparameters["lr"],
        eps=self.hyperparameters["eps"]
    )
    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters["clip_initial"],
                self.hyperparameters["clip_final"],
                0,
                n_updates,
                name='clip',
                writer=writer
            ),
            epochs=self.hyperparameters["epochs"],
            minibatches=self.hyperparameters["minibatches"],
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            lam=self.hyperparameters["lam"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer,
        )
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    final_anneal_step = last_frame / (n_steps * n_envs)
    value_model = value_head().to(device)
    policy_model = policy_head(env).to(device)
    feature_model = conv_features().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    return FrameStack(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=LinearScheduler(
                entropy_loss_scaling,
                0.,
                0,
                final_anneal_step,
                name="entropy_loss_scaling",
                writer=writer
            ),
            writer=writer
        ),
        size=4
    )
def _ppo(envs, writer=DummyWriter()):
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs)
    env = envs[0]
    feature_model, value_model, policy_model = fc_actor_critic(env)
    feature_model.to(device)
    value_model.to(device)
    policy_model.to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = GaussianPolicy(
        policy_model,
        policy_optimizer,
        env.action_space,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (
        self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    value_optimizer = Adam(
        self.value_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    policy_optimizer = Adam(
        self.policy_model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    features = Identity(self.device)
    v = VNetwork(
        self.value_model,
        value_optimizer,
        loss_scaling=self.hyperparameters['value_loss_scaling'],
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
    )
    policy = GaussianPolicy(
        self.policy_model,
        policy_optimizer,
        self.action_space,
        clip_grad=self.hyperparameters['clip_grad'],
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
    )
    return TimeFeature(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters['clip_initial'],
                self.hyperparameters['clip_final'],
                0,
                n_updates,
                name='clip',
                writer=writer
            ),
            epochs=self.hyperparameters['epochs'],
            minibatches=self.hyperparameters['minibatches'],
            n_envs=self.hyperparameters['n_envs'],
            n_steps=self.hyperparameters['n_steps'],
            discount_factor=self.hyperparameters['discount_factor'],
            lam=self.hyperparameters['lam'],
            entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
            writer=writer,
        )
    )
def _dqn(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = nature_dqn(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            loss=smooth_l1_loss,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
def _ddqn(env, writer=DummyWriter()):
    action_repeat = 1
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = model_constructor(env).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            replay_buffer_size,
            alpha=alpha,
            beta=beta,
            device=device
        )
    else:
        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )
    return DDQN(
        q,
        policy,
        replay_buffer,
        loss=weighted_smooth_l1_loss,
        discount_factor=discount_factor,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )
    return DeepmindAtariBody(
        Rainbow(
            q_dist,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                train_steps - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            ),
            discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
            writer=writer,
        ),
        lazy_frames=True,
        episodic_lives=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            loss=smooth_l1_loss,
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        ),
        lazy_frames=True
    )
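# Hedged arithmetic sketch (illustrative numbers, not values from the preset above): the
# CosineAnnealingLR horizon n_updates is the number of gradient updates the agent will make,
# i.e. the timesteps remaining after the replay warm-up divided by the update frequency.
train_steps, replay_start_size, update_frequency = 250_000, 5_000, 4
n_updates = (train_steps - replay_start_size) / update_frequency
print(n_updates)  # 61250.0 gradient updates over the run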
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
                name="epsilon",
                writer=writer,
            ),
            discount_factor=self.hyperparameters["discount_factor"],
            minibatch_size=self.hyperparameters["minibatch_size"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            update_frequency=self.hyperparameters["update_frequency"],
            writer=writer
        ),
        lazy_frames=True,
        episodic_lives=True
    )
def _c51(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = nature_c51(env, atoms=atoms).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        scheduler=CosineAnnealingLR(optimizer, last_update),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                0,
                last_timestep,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        ),
        lazy_frames=True
    )
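# Hedged sketch (this is the standard C51 support construction, not code lifted from the
# snippets above): QDist models the return distribution over `atoms` fixed values spaced
# evenly between v_min and v_max, which is what the v_min/v_max/atoms arguments control.
import torch

atoms, v_min, v_max = 51, -10.0, 10.0
support = torch.linspace(v_min, v_max, atoms)  # the fixed return values ("atoms")
delta_z = (v_max - v_min) / (atoms - 1)        # spacing between adjacent atoms
print(support[:3], delta_z)                    # tensor([-10.0000, -9.6000, -9.2000]) 0.4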
def _rainbow(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = PrioritizedReplayBuffer(
        replay_buffer_size,
        alpha=alpha,
        beta=beta,
        device=device
    )
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    agent = Rainbow(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            last_timestep,
            name='exploration',
            writer=writer
        ),
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
    return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
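# Hedged side note as a runnable sketch (this mirrors the usual n-step return construction;
# it is not code from the snippets above): with an n-step replay buffer, the rewards inside
# each window are already accumulated with discounting, so the learner's bootstrap term uses
# gamma ** n_steps. That is why Rainbow above is handed discount_factor ** n_steps.
def n_step_return(rewards, bootstrap_value, gamma):
    """r_0 + gamma * r_1 + ... + gamma^(n-1) * r_{n-1} + gamma^n * V(s_n)."""
    g = 0.0
    for i, r in enumerate(rewards):
        g += (gamma ** i) * r
    return g + (gamma ** len(rewards)) * bootstrap_value

print(n_step_return([1.0, 0.0, 2.0], bootstrap_value=5.0, gamma=0.99))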
class DQNPolicy:
    def __init__(self, env, logger, device="cpu"):
        self.out_dim = out_dim = env.action_space.n
        self.device = device
        self.logger = logger
        self.hyperparameters = hyperparameters = default_hyperparameters
        self.model = hyperparameters['model_constructor'](env).to(device)
        writer = DummyWriter()
        self.epsilon = LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )

    def calc_action(self, observations):
        with torch.no_grad():
            observations = torch.tensor(observations, device=self.device)
            greedy = torch.argmax(self.model(observations), axis=1)  # .detach().cpu().numpy()
            rand_vals = torch.randint(self.out_dim, size=(len(observations),), device=self.device)
            epsilon = self.epsilon._get_value()
            self.logger.record("epsilon", epsilon)
            pick_rand = torch.rand((len(observations),), device=self.device) < epsilon
            actions = torch.where(pick_rand, rand_vals, greedy)
        return actions.cpu().detach().numpy()

    def get_params(self):
        return [param.cpu().detach().numpy() for param in self.model.parameters()]

    def set_params(self, params):
        for source, dest in zip(params, self.model.parameters()):
            dest.data = torch.tensor(source, device=self.device)
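# Minimal, self-contained sketch of the epsilon-greedy selection inside calc_action above,
# written against plain torch only (no env, no model; the q_values below are made up).
import torch

def epsilon_greedy(q_values, epsilon):
    """Take the argmax action, but replace it with a random action with probability epsilon."""
    batch_size, n_actions = q_values.shape
    greedy = torch.argmax(q_values, dim=1)
    random_actions = torch.randint(n_actions, size=(batch_size,))
    explore = torch.rand(batch_size) < epsilon
    return torch.where(explore, random_actions, greedy)

q_values = torch.tensor([[0.1, 0.9, 0.0], [0.5, 0.2, 0.3]])
print(epsilon_greedy(q_values, epsilon=0.1))  # usually tensor([1, 0]), occasionally random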
def _c51(env, writer=DummyWriter()):
    # use the shared `atoms` setting rather than a hard-coded 51 so the model matches QDist
    model = nature_c51(env, atoms=atoms).to(device)
    optimizer = Adam(
        model.parameters(),
        lr=lr,
        eps=eps
    )
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        replay_buffer_size,
        device=device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_frame,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        )
    )
def agent_constructor(writer):
    return DeepmindAtariBody(
        Rainbow(
            q_dist,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                train_steps - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            ),
            discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
            writer=writer,
        ),
        lazy_frames=True,
        episodic_lives=True
    )
def test_linear_scheduler(self):
    obj = Obj()
    obj.attr = LinearScheduler(10, 0, 3, 13)
    expected = [10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0]
    actual = [obj.attr for _ in expected]
    np.testing.assert_allclose(actual, expected)
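# Hedged sketch (an assumed reimplementation, not the library's LinearScheduler): one way to
# read the constructor above is LinearScheduler(initial, final, decay_start, decay_end),
# holding `initial` until `decay_start` and annealing linearly to `final` by step `decay_end`.
# The helper below reproduces the `expected` sequence asserted in the test.
def linear_value(initial, final, decay_start, decay_end, step):
    if step <= decay_start:
        return initial
    if step >= decay_end:
        return final
    alpha = (step - decay_start) / (decay_end - decay_start)
    return (1 - alpha) * initial + alpha * final

assert [linear_value(10, 0, 3, 13, t) for t in range(16)] == \
    [10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0]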
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)
    feature_optimizer = Adam(
        feature_model.parameters(),
        lr=lr,
        eps=eps
    )
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(
            feature_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale
        ),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(
            value_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale
        ),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(
            policy_optimizer,
            final_anneal_step,
            eta_min=lr * min_lr_scale
        ),
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                clip_initial,
                clip_final,
                0,
                final_anneal_step,
                name='clip',
                writer=writer
            ),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
        )
    )