def agent(self, writer=DummyWriter(), train_steps=float('inf')): n_updates = train_steps / self.hyperparameters["min_batch_size"] feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr_v"], eps=self.hyperparameters["eps"]) policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr_pi"], eps=self.hyperparameters["eps"]) features = FeatureNetwork( self.feature_model, feature_optimizer, scheduler=CosineAnnealingLR(feature_optimizer, n_updates), clip_grad=self.hyperparameters["clip_grad"], writer=writer ) v = VNetwork( self.value_model, value_optimizer, scheduler=CosineAnnealingLR(value_optimizer, n_updates), loss_scaling=self.hyperparameters["value_loss_scaling"], clip_grad=self.hyperparameters["clip_grad"], writer=writer ) policy = SoftmaxPolicy( self.policy_model, policy_optimizer, scheduler=CosineAnnealingLR(policy_optimizer, n_updates), clip_grad=self.hyperparameters["clip_grad"], writer=writer ) return DeepmindAtariBody( VPG(features, v, policy, discount_factor=self.hyperparameters["discount_factor"], min_batch_size=self.hyperparameters["min_batch_size"]), )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps / self.hyperparameters['n_envs']
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'], eps=self.hyperparameters['eps'])
    q = QNetwork(self.model, optimizer, scheduler=CosineAnnealingLR(optimizer, n_updates), writer=writer)
    policy = ParallelGreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            0,
            self.hyperparameters["final_exploration_step"] / self.hyperparameters["n_envs"],
            name="exploration",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, discount_factor=self.hyperparameters['discount_factor']),
    )
def test_agent(self):
    q = QNetwork(copy.deepcopy(self.model))
    return DeepmindAtariBody(
        DDQNTestAgent(q, self.n_actions, exploration=self.hyperparameters['test_exploration'])
    )
def _vac(envs, writer=DummyWriter()):
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(envs[0]).to(device)
    feature_model = feature_model_constructor().to(device)
    value_optimizer = Adam(value_model.parameters(), lr=lr_v, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi, eps=eps)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr_pi, eps=eps)
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
    )
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    return DeepmindAtariBody(
        VAC(features, v, policy, discount_factor=discount_factor),
    )
def _model_predictive_dqn(env, writer=None):
    # models
    feature_model = shared_feature_layers().to(device)
    value_model = value_head().to(device)
    reward_model = reward_head(env).to(device)
    generator_model = Generator(env).to(device)

    # optimizers
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    reward_optimizer = Adam(reward_model.parameters(), lr=lr, eps=eps)
    generator_optimizer = Adam(generator_model.parameters(), lr=lr, eps=eps)

    # approximators
    f = FeatureNetwork(feature_model, feature_optimizer, writer=writer)
    v = VNetwork(value_model, value_optimizer, writer=writer)
    r = QNetwork(reward_model, reward_optimizer, name='reward', writer=writer)
    g = Approximation(generator_model, generator_optimizer, name='generator', writer=writer)

    # replay buffer
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)

    # create agent
    agent = ModelPredictiveDQN(
        f, v, r, g,
        replay_buffer,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size
    )

    # apply agent wrappers for better atari performance
    return DeepmindAtariBody(agent, lazy_frames=True)
def _vqn(envs, writer=DummyWriter()):
    env = envs[0]
    model = nature_ddqn(env).to(device)
    optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        env.action_space.n,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, gamma=discount_factor),
    )
def _vqn(envs, writer=DummyWriter()):
    action_repeat = 4
    final_exploration_timestep = final_exploration_frame / action_repeat
    env = envs[0]
    model = model_constructor(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        writer=writer
    )
    policy = ParallelGreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            final_exploration_timestep,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        VQN(q, policy, discount_factor=discount_factor),
    )
def agent_constructor(writer):
    policy = GreedyPolicy(
        q,
        n_actions,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            loss=smooth_l1_loss,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
def _dqn(env, writer=DummyWriter()):
    _model = model
    _optimizer = optimizer
    if _model is None:
        _model = conv_net(env, frames=agent_history_length).to(device)
    if _optimizer is None:
        _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target_update_frequency=target_update_frequency,
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        annealing_start=replay_start_size,
        annealing_time=final_exploration_frame - replay_start_size,
        initial_epsilon=initial_exploration,
        final_epsilon=final_exploration
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        env,
        action_repeat=action_repeat,
        frame_stack=agent_history_length,
        noop_max=noop_max
    )
def _dqn(env, writer=DummyWriter()):
    _model = nature_dqn(env).to(device)
    _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        _model,
        _optimizer,
        env.action_space.n,
        target=FixedTarget(target_update_frequency),
        loss=smooth_l1_loss,
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_frame,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
    )
def parallel_test_agent(self):
    q = QNetwork(copy.deepcopy(self.model))
    policy = ParallelGreedyPolicy(q, self.n_actions, epsilon=self.hyperparameters['test_exploration'])
    return DeepmindAtariBody(VQNTestAgent(policy))
def _ppo(envs, writer=DummyWriter()):
    env = envs[0]
    # Update epochs * minibatches times per update,
    # but we only update once per n_steps,
    # with n_envs and 4 frames per step
    final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs * 4)
    value_model = value_model_constructor().to(device)
    policy_model = policy_model_constructor(env).to(device)
    feature_model = feature_model_constructor().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        clip_grad=clip_grad,
        writer=writer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(clip_initial, clip_final, 0, final_anneal_step, name='clip', writer=writer),
            epochs=epochs,
            minibatches=minibatches,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            lam=lam,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer,
        )
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = train_steps * self.hyperparameters['epochs'] * self.hyperparameters['minibatches'] / (
        self.hyperparameters['n_steps'] * self.hyperparameters['n_envs'])
    feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    policy_optimizer = Adam(self.policy_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"])
    features = FeatureNetwork(
        self.feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    v = VNetwork(
        self.value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        loss_scaling=self.hyperparameters["value_loss_scaling"],
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    policy = SoftmaxPolicy(
        self.policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        clip_grad=self.hyperparameters["clip_grad"],
        writer=writer
    )
    return DeepmindAtariBody(
        PPO(
            features,
            v,
            policy,
            epsilon=LinearScheduler(
                self.hyperparameters["clip_initial"],
                self.hyperparameters["clip_final"],
                0,
                n_updates,
                name='clip',
                writer=writer
            ),
            epochs=self.hyperparameters["epochs"],
            minibatches=self.hyperparameters["minibatches"],
            n_envs=self.hyperparameters["n_envs"],
            n_steps=self.hyperparameters["n_steps"],
            discount_factor=self.hyperparameters["discount_factor"],
            lam=self.hyperparameters["lam"],
            entropy_loss_scaling=self.hyperparameters["entropy_loss_scaling"],
            writer=writer,
        )
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    final_anneal_step = last_frame / (n_steps * n_envs * 4)
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
            entropy_loss_scaling=entropy_loss_scaling,
            writer=writer
        ),
    )
def agent_constructor():
    q_dist = QDist(
        self.model,
        None,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
    )
    return DeepmindAtariBody(RainbowTestAgent(q_dist, self.n_actions, self.hyperparameters["test_exploration"]))
def test_agent(self):
    q_dist = QDist(
        copy.deepcopy(self.model),
        None,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
    )
    return DeepmindAtariBody(C51TestAgent(q_dist, self.n_actions, self.hyperparameters["test_exploration"]))
def _dqn(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    final_exploration_step = final_exploration_frame / action_repeat
    model = nature_dqn(env).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QNetwork(
        model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        target=FixedTarget(target_update_frequency),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        env.action_space.n,
        epsilon=LinearScheduler(
            initial_exploration,
            final_exploration,
            replay_start_size,
            final_exploration_step - replay_start_size,
            name="epsilon",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=discount_factor,
            loss=smooth_l1_loss,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
        ),
        lazy_frames=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QNetwork(
        self.model,
        optimizer,
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer
    )
    policy = GreedyPolicy(
        q,
        self.n_actions,
        epsilon=LinearScheduler(
            self.hyperparameters['initial_exploration'],
            self.hyperparameters['final_exploration'],
            self.hyperparameters['replay_start_size'],
            self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
            name="exploration",
            writer=writer
        )
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            loss=smooth_l1_loss,
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        ),
        lazy_frames=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(self.model.parameters(), lr=self.hyperparameters['lr'], eps=self.hyperparameters['eps'])
    q_dist = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        writer=writer,
    )
    replay_buffer = NStepReplayBuffer(
        self.hyperparameters['n_steps'],
        self.hyperparameters['discount_factor'],
        PrioritizedReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            alpha=self.hyperparameters['alpha'],
            beta=self.hyperparameters['beta'],
            device=self.device
        )
    )
    return DeepmindAtariBody(
        Rainbow(
            q_dist,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                train_steps - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            ),
            discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
            writer=writer,
        ),
        lazy_frames=True,
        episodic_lives=True
    )
def agent(self, writer=DummyWriter(), train_steps=float('inf')):
    n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']
    optimizer = Adam(
        self.model.parameters(),
        lr=self.hyperparameters['lr'],
        eps=self.hyperparameters['eps']
    )
    q = QDist(
        self.model,
        optimizer,
        self.n_actions,
        self.hyperparameters['atoms'],
        v_min=self.hyperparameters['v_min'],
        v_max=self.hyperparameters['v_max'],
        target=FixedTarget(self.hyperparameters['target_update_frequency']),
        scheduler=CosineAnnealingLR(optimizer, n_updates),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(
        self.hyperparameters['replay_buffer_size'],
        device=self.device
    )
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
                name="epsilon",
                writer=writer,
            ),
            discount_factor=self.hyperparameters["discount_factor"],
            minibatch_size=self.hyperparameters["minibatch_size"],
            replay_start_size=self.hyperparameters["replay_start_size"],
            update_frequency=self.hyperparameters["update_frequency"],
            writer=writer
        ),
        lazy_frames=True,
        episodic_lives=True
    )
def _a2c(envs, writer=DummyWriter()):
    env = envs[0]
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(envs[0]).to(device)
    feature_model = nature_features().to(device)
    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    return DeepmindAtariBody(
        A2C(
            features,
            v,
            policy,
            n_envs=n_envs,
            n_steps=n_steps,
            discount_factor=discount_factor,
        ),
    )
def _c51(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = nature_c51(env, atoms=atoms).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        scheduler=CosineAnnealingLR(optimizer, last_update),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                0,
                last_timestep,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        ),
        lazy_frames=True
    )
def main(): parser = argparse.ArgumentParser(description="Run an Atari benchmark.") parser.add_argument("env", help="Name of the Atari game (e.g. Pong)") parser.add_argument("dir", help="Directory where the agent's model was saved.") parser.add_argument( "--device", default="cpu", help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)", ) parser.add_argument( "--fps", default=60, help="Playback speed", ) args = parser.parse_args() env = AtariEnvironment(args.env, device=args.device) agent = DeepmindAtariBody(GreedyAgent.load(args.dir, env)) watch(agent, env, fps=args.fps)
def _vpg_atari(env, writer=DummyWriter()):
    feature_model = nature_features().to(device)
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_optimizer = RMSprop(feature_model.parameters(), alpha=alpha, lr=lr * feature_lr_scaling, eps=eps)
    value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    policy_optimizer = RMSprop(policy_model.parameters(), alpha=alpha, lr=lr, eps=eps)
    features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad, writer=writer)
    v = VNetwork(value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer)
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        env.action_space.n,
        entropy_loss_scaling=entropy_loss_scaling,
        clip_grad=clip_grad,
        writer=writer,
    )
    return DeepmindAtariBody(
        VPG(features, v, policy, gamma=discount_factor, min_batch_size=min_batch_size),
    )
def _vpg_atari(env, writer=DummyWriter()):
    value_model = nature_value_head().to(device)
    policy_model = nature_policy_head(env).to(device)
    feature_model = nature_features().to(device)
    feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
    value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
    policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
    features = FeatureNetwork(
        feature_model,
        feature_optimizer,
        scheduler=CosineAnnealingLR(feature_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    v = VNetwork(
        value_model,
        value_optimizer,
        scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        loss_scaling=value_loss_scaling,
        clip_grad=clip_grad,
        writer=writer
    )
    policy = SoftmaxPolicy(
        policy_model,
        policy_optimizer,
        scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        clip_grad=clip_grad,
        writer=writer
    )
    return DeepmindAtariBody(
        VPG(features, v, policy, discount_factor=discount_factor, min_batch_size=min_batch_size),
        episodic_lives=True
    )
def _rainbow(env, writer=DummyWriter()):
    action_repeat = 4
    last_timestep = last_frame / action_repeat
    last_update = (last_timestep - replay_start_size) / update_frequency
    model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        scheduler=CosineAnnealingLR(optimizer, last_update),
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = PrioritizedReplayBuffer(replay_buffer_size, alpha=alpha, beta=beta, device=device)
    replay_buffer = NStepReplayBuffer(n_steps, discount_factor, replay_buffer)
    agent = Rainbow(
        q,
        replay_buffer,
        exploration=LinearScheduler(
            initial_exploration,
            final_exploration,
            0,
            last_timestep,
            name='exploration',
            writer=writer
        ),
        discount_factor=discount_factor ** n_steps,
        minibatch_size=minibatch_size,
        replay_start_size=replay_start_size,
        update_frequency=update_frequency,
        writer=writer,
    )
    return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
def _c51(env, writer=DummyWriter()):
    model = nature_c51(env, atoms=atoms).to(device)
    optimizer = Adam(model.parameters(), lr=lr, eps=eps)
    q = QDist(
        model,
        optimizer,
        env.action_space.n,
        atoms,
        v_min=v_min,
        v_max=v_max,
        target=FixedTarget(target_update_frequency),
        writer=writer,
    )
    replay_buffer = ExperienceReplayBuffer(replay_buffer_size, device=device)
    return DeepmindAtariBody(
        C51(
            q,
            replay_buffer,
            exploration=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_frame,
                name="epsilon",
                writer=writer,
            ),
            discount_factor=discount_factor,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer
        )
    )
def agent_constructor(writer):
    return DeepmindAtariBody(
        Rainbow(
            q_dist,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                train_steps - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            ),
            discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
            writer=writer,
        ),
        lazy_frames=True,
        episodic_lives=True
    )
def agent(self, writer=DummyWriter(), train_steps=float("inf")): # optimizers feature_optimizer = Adam(self.feature_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) value_optimizer = Adam(self.value_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) reward_optimizer = Adam(self.reward_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) generator_optimizer = Adam(self.generator_model.parameters(), lr=self.hyperparameters["lr"], eps=self.hyperparameters["eps"]) # approximators f = FeatureNetwork(self.feature_model, feature_optimizer, writer=writer) v = VNetwork(self.value_model, value_optimizer, writer=writer) r = QNetwork(self.reward_model, reward_optimizer, name="reward", writer=writer) g = Approximation(self.generator_model, generator_optimizer, name="generator", writer=writer) # replay buffer replay_buffer = ExperienceReplayBuffer(self.hyperparameters["replay_buffer_size"], device=self.device) # create agent agent = ModelBasedDQN(f, v, r, g, replay_buffer, minibatch_size=self.hyperparameters["minibatch_size"], replay_start_size=self.hyperparameters["replay_start_size"] ) # apply atari wrappers for better performance return DeepmindAtariBody(agent, lazy_frames=True)
def test_agent(self):
    features = FeatureNetwork(copy.deepcopy(self.feature_model))
    policy = SoftmaxPolicy(copy.deepcopy(self.policy_model))
    return DeepmindAtariBody(VACTestAgent(features, policy))