class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):

    def __call__(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q = self.model(*model_inputs)
        return q.cpu()

    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        super().initialize(env_spaces, share_memory,
            global_B=global_B, env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
            **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)
        self.target_model.to(self.device)

    def state_dict(self):
        return dict(model=self.model.state_dict(),
            target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self, tau=1):
        update_state_dict(self.target_model, self.model.state_dict(), tau)
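To see how these three entry points divide the work, here is a minimal sketch (not rlpyt's actual algorithm code) of how a DQN-style loss might call into the agent: step() is used during sampling (it runs under @torch.no_grad()), while __call__ provides gradient-carrying Q-values and target() provides the bootstrap values. The field names on `samples` below are illustrative assumptions.

import torch
import torch.nn.functional as F

def dqn_loss_sketch(agent, samples, discount=0.99):
    """Illustrative only: how a DQN algorithm might use agent.__call__ / agent.target().
    The field names on `samples` are assumptions, not rlpyt's exact batch layout."""
    # Q(s, a) for the actions actually taken -- uses agent.__call__, so grads flow.
    qs = agent(samples.observation, samples.prev_action, samples.prev_reward)
    q = qs.gather(-1, samples.action.long().unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # Bootstrap values from the target network -- uses agent.target(), no grads.
        target_qs = agent.target(samples.next_observation, samples.action,
            samples.reward)
        y = samples.reward + discount * (1 - samples.done.float()) * \
            target_qs.max(dim=-1).values
    return F.mse_loss(q, y)  # simple MSE for illustration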
class AtariDqnAgent(EpsilonGreedyAgentMixin, BaseAgent):
    """
    Standard agent for DQN algorithms with epsilon-greedy exploration.
    """

    def __init__(
            self,
            ModelCls=AtariDqnModel,
            model_kwargs=None,
            load_conv=False,
            load_all=False,
            state_dict_filename=None,
            store_latent=False,
            **kwargs):
        if model_kwargs is None:
            model_kwargs = dict()
        assert not (load_conv and load_all)
        save__init__args(locals())
        super().__init__(ModelCls=ModelCls, **kwargs)

    def __call__(self, observation, prev_action, prev_reward):
        """Returns Q-values for states/observations (with grad)."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q, _conv = self.model(*model_inputs)
        return q.cpu()

    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        """Along with standard initialization, creates vector-valued epsilon
        for exploration, if applicable, with a different epsilon for each
        environment instance."""
        self.model = self.ModelCls(
            image_shape=env_spaces.observation.shape,
            action_size=env_spaces.action.n,
            **self.model_kwargs,
        )
        if self.load_conv:
            logger.log("Agent loading state dict: " + self.state_dict_filename)
            loaded_state_dict = torch.load(self.state_dict_filename,
                map_location=torch.device('cpu'))
            # From UL, saves snapshot: params["algo_state_dict"]["encoder"]
            loaded_state_dict = loaded_state_dict.get("algo_state_dict",
                loaded_state_dict)
            loaded_state_dict = loaded_state_dict.get("encoder",
                loaded_state_dict)
            # A bit onerous, but ensures that state dicts match:
            conv_state_dict = OrderedDict(
                [(k.replace("conv.", "", 1), v)
                    for k, v in loaded_state_dict.items()
                    if k.startswith("conv.")]
            )
            self.model.conv.load_state_dict(conv_state_dict)
            logger.log("Agent loaded CONV state dict.")
        elif self.load_all:
            # From RL, saves snapshot: params["agent_state_dict"]
            loaded_state_dict = torch.load(self.state_dict_filename,
                map_location=torch.device('cpu'))
            self.load_state_dict(loaded_state_dict["agent_state_dict"])
            logger.log("Agent loaded FULL state dict.")
        else:
            logger.log("Agent NOT loading state dict.")

        self.target_model = copy.deepcopy(self.model)
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)
        if share_memory:
            self.model.share_memory()
            self.shared_model = self.model
        if self.initial_model_state_dict is not None:
            raise NotImplementedError
        self.env_spaces = env_spaces
        self.share_memory = share_memory

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)
        self.target_model.to(self.device)

    def state_dict(self):
        return dict(model=self.model.state_dict(),
            target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        """Computes Q-values for states/observations and selects actions by
        epsilon-greedy (no grad)."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q, conv = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfoConv(q=q,
            conv=conv if self.store_latent else None)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        """Returns the target Q-values for states/observations."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        target_q, _conv = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self, tau=1):
        """Copies the model parameters into the target model."""
        update_state_dict(self.target_model, self.model.state_dict(), tau)
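For orientation, a hedged usage sketch of the class above: constructing the agent so that only the convolutional encoder is restored from a pretrained (e.g. unsupervised-learning) checkpoint. The checkpoint path is a placeholder, not a real file.

# Hypothetical usage sketch; the checkpoint path is a placeholder.
agent = AtariDqnAgent(
    load_conv=True,                                 # restore only the "conv." weights
    state_dict_filename="path/to/ul_snapshot.pkl",  # placeholder path
    store_latent=False,                             # don't record conv features in AgentInfoConv
)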
class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):

    def __call__(self, observation, prev_action, prev_reward):
        """
        __call__ makes an instance of this class callable like a function: if `agent`
        is a DqnAgent object, then agent(observation, prev_action, prev_reward) is
        equivalent to agent.__call__(observation, prev_action, prev_reward).
        """
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        # self.model is an instance of a torch.nn.Module subclass; calling it goes
        # through torch.nn.Module.__call__, i.e. it computes the model output (a Tensor).
        q = self.model(*model_inputs)
        return q.cpu()  # Move the tensor to the CPU (host memory).

    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        """
        Initializes the agent. Called from the Sampler's initialize() (e.g. SerialSampler).
        :param env_spaces: see Env.spaces(); an EnvSpaces namedtuple holding the
            observation space and the action space.
        :param share_memory: if True, model parameters can be shared across processes;
            if False, they are not shared.
        :param global_B: in BatchSpec, the number of independent trajectories, i.e.
            the number of environment instances; here global_B refers to the total
            number of envs across all processes.
        :param env_ranks: see my earlier article for what this means:
            https://www.codelast.com/?p=10932
        """
        super().initialize(env_spaces, share_memory,
            global_B=global_B, env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
            **self.model_kwargs)  # A torch.nn.Module subclass instance.
        # Load the main network's parameters, so the target network starts out
        # identical to the main network.
        self.target_model.load_state_dict(self.model.state_dict())
        # Explore via epsilon-greedy; n is the number of discrete actions.
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)

    def to_device(self, cuda_idx=None):
        """
        Chooses which device (CPU/GPU) the model data (parameters and buffers) live on.
        The parent class moves self.model; this subclass additionally moves
        self.target_model.
        :param cuda_idx: GPU index.
        """
        super().to_device(cuda_idx)
        # self.device defaults to CPU at construction; super().to_device() switches it
        # to the given GPU when cuda_idx is provided, and the target model follows.
        self.target_model.to(self.device)

    def state_dict(self):
        """
        Returns the state of both the main network and the target network,
        e.g. their weights and biases.
        :return: a dict.
        """
        return dict(model=self.model.state_dict(),
            target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        """
        Takes one step in the environment. The environment classes (e.g. AtariEnv)
        also have a step(); the main differences are that this step() selects an
        action with the policy network, whereas AtariEnv.step() receives an
        already-chosen action, computes the reward, and records some statistics;
        this step() does not compute any reward.
        This function is called from the Collector's collect_batch().
        A forward pass of the policy network happens here (a relatively
        compute-heavy operation), i.e. the next action is computed from the inputs
        (e.g. the observation).
        :param observation: self-explanatory.
        :param prev_action: the previous action.
        :param prev_reward: the previous reward.
        :return: the action to take (a torch.Tensor), plus agent info (e.g. Q-values).
        """
        prev_action = self.distribution.to_onehot(prev_action)  # Returns a torch.Tensor.
        # Inputs to the policy network (torch.Tensor), moved to self.device.
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        # self.model is a torch.nn.Module subclass instance; feeding it the inputs
        # computes the network output, i.e. a forward pass happens here.
        q = self.model(*model_inputs)
        q = q.cpu()  # Move the tensor to CPU (host memory); still a torch.Tensor.
        action = self.distribution.sample(q)  # Pick an action (torch.Tensor) via epsilon-greedy.
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        """
        Computes the target network's Q-values.
        :param observation: as named.
        :param prev_action: the previous action.
        :param prev_reward: the previous reward.
        :return: the corresponding Q-value tensor, on the CPU (host memory).
        """
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        # Compute the Q-values; self.target_model is a torch.nn.Module subclass
        # instance, invoked through torch.nn.Module.__call__, which produces the
        # model output (a Tensor).
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()  # Move the tensor to the CPU (host memory).

    def update_target(self, tau=1):
        """
        Updates the target network, i.e. copies the main network's (self.model)
        parameters into the target network (self.target_model). With the default
        tau=1 this is a hard copy; with 0 < tau < 1 it becomes a soft update.
        To keep learning stable and efficient, the target network is not updated
        at every step but only periodically; e.g. in DQN.optimize_agent() this
        function is only invoked once every fixed number of iterations.
        :param tau: the tau parameter of the soft-update rule.
        """
        update_state_dict(self.target_model, self.model.state_dict(), tau)
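As a concrete reference for the soft-update behaviour described in update_target() above, the following is a minimal illustration of what a tau-weighted update of the target network amounts to; it is a sketch of the rule, not rlpyt's update_state_dict() implementation.

def soft_update_sketch(target_model, new_state_dict, tau=1.0):
    """Illustration of the tau-weighted target update (not rlpyt's exact helper).
    tau=1 is a plain hard copy; 0 < tau < 1 blends new and old parameters
    (Polyak averaging). Assumes floating-point parameters/buffers."""
    if tau == 1:
        target_model.load_state_dict(new_state_dict)
        return
    old_state_dict = target_model.state_dict()
    blended = {k: tau * v + (1 - tau) * old_state_dict[k]
               for k, v in new_state_dict.items()}
    target_model.load_state_dict(blended)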
class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):
    """
    Standard agent for DQN algorithms with epsilon-greedy exploration.
    """

    def __call__(self, observation, prev_action, prev_reward):
        """Returns Q-values for states/observations (with grad)."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q = self.model(*model_inputs)
        return q.cpu()

    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        """Along with standard initialization, creates vector-valued epsilon
        for exploration, if applicable, with a different epsilon for each
        environment instance."""
        super().initialize(env_spaces, share_memory,
            global_B=global_B, env_ranks=env_ranks)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
            **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        if env_ranks is not None:
            self.make_vec_eps(global_B, env_ranks)

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)
        self.target_model.to(self.device)

    def state_dict(self):
        return dict(model=self.model.state_dict(),
            target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        """Computes Q-values for states/observations and selects actions by
        epsilon-greedy (no grad)."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        """Returns the target Q-values for states/observations."""
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self, tau=1):
        """Copies the model parameters into the target model."""
        update_state_dict(self.target_model, self.model.state_dict(), tau)
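All of these versions delegate action selection to EpsilonGreedy.sample(). As an illustration of the rule it implements (not the class's actual code), a batched epsilon-greedy selection over Q-values can be sketched as follows.

import torch

def epsilon_greedy_sample_sketch(q, epsilon=0.1):
    """Illustrative epsilon-greedy over a batch of Q-values of shape [B, A]
    (the rule EpsilonGreedy.sample() implements, not its actual code)."""
    greedy = q.argmax(dim=-1)                          # exploit: best action per row
    random = torch.randint(q.shape[-1], greedy.shape)  # explore: uniform random action
    explore = torch.rand(greedy.shape) < epsilon       # explore with probability epsilon
    return torch.where(explore, random, greedy)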
class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):

    def __call__(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q = self.model(*model_inputs)
        return q.cpu()

    def initialize(self, env_spaces, share_memory=False):
        env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
        self.model = self.ModelCls(**env_model_kwargs, **self.model_kwargs)
        if share_memory:
            self.model.share_memory()
            self.shared_model = self.model
        if self.initial_model_state_dict is not None:
            self.model.load_state_dict(self.initial_model_state_dict)
        self.target_model = self.ModelCls(**env_model_kwargs, **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        self.distribution = EpsilonGreedy(dim=env_spaces.action.n)
        self.env_spaces = env_spaces
        self.env_model_kwargs = env_model_kwargs
        self.share_memory = share_memory
        super().initialize(env_spaces, share_memory)

    def initialize_cuda(self, cuda_idx=None, ddp=False):
        if cuda_idx is None:
            return  # CPU
        if self.shared_model is not None:
            self.model = self.ModelCls(**self.env_model_kwargs,
                **self.model_kwargs)
            self.model.load_state_dict(self.shared_model.state_dict())
        self.device = torch.device("cuda", index=cuda_idx)
        self.model.to(self.device)
        if ddp:
            self.model = DDP(self.model, device_ids=[cuda_idx],
                output_device=cuda_idx)
            logger.log("Initialized DistributedDataParallel agent model "
                f"on device: {self.device}.")
        else:
            logger.log(f"Initialized agent model on device: {self.device}.")
        self.target_model.to(self.device)

    def make_env_to_model_kwargs(self, env_spaces):
        raise NotImplementedError

    def state_dict(self):
        return dict(model=self.model.state_dict(),
            target=self.target_model.state_dict())

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        q = self.model(*model_inputs)
        q = q.cpu()
        action = self.distribution.sample(q)
        agent_info = AgentInfo(q=q)
        # action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def target(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        target_q = self.target_model(*model_inputs)
        return target_q.cpu()

    def update_target(self):
        self.target_model.load_state_dict(self.model.state_dict())
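This older API delegates model construction to make_env_to_model_kwargs(), which subclasses must override. As a hedged sketch, an Atari subclass would plausibly map the env spaces to model constructor kwargs along the lines of the AtariDqnAgent.initialize() shown earlier; the kwarg names below are assumptions, not necessarily the library's exact signature.

# Hedged sketch of the subclass hook; kwarg names mirror the Atari agent above
# and are assumptions, not necessarily the library's exact model signature.
class MyAtariDqnAgent(DqnAgent):
    def make_env_to_model_kwargs(self, env_spaces):
        return dict(
            image_shape=env_spaces.observation.shape,  # stacked grayscale frames
            action_size=env_spaces.action.n,           # number of discrete actions
        )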