class AtariDreamerModel(AgentModel):

    def forward(self, observation: torch.Tensor, prev_action: torch.Tensor = None,
                prev_state: RSSMState = None):
        lead_dim, T, B, img_shape = infer_leading_dims(observation, 3)
        observation = observation.reshape(T * B, *img_shape).type(
            self.dtype) / 255.0 - 0.5
        prev_action = to_onehot(prev_action.reshape(T * B, ), self.action_size,
            dtype=self.dtype)
        if prev_state is None:
            prev_state = self.representation.initial_state(
                prev_action.size(0), device=prev_action.device, dtype=self.dtype)
        state = self.get_state_representation(observation, prev_action, prev_state)
        action, action_dist = self.policy(state)
        action = from_onehot(action)
        return_spec = ModelReturnSpec(action, state)
        return_spec = buffer_func(return_spec, restore_leading_dims, lead_dim, T, B)
        return return_spec


ModelReturnSpec = namedarraytuple('ModelReturnSpec', ['action', 'state'])
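# --- Illustration (not part of the source above) ---------------------------
# A minimal sketch of the leading-dims round trip that
# AtariDreamerModel.forward relies on: infer_leading_dims flattens any
# [T,B] / [B] / [] leading dims into one batch dim for the network, and
# restore_leading_dims puts them back on the outputs.  Assumes rlpyt is
# importable; shapes are arbitrary examples.
import torch
from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims

obs = torch.zeros(7, 4, 3, 64, 64)                       # [T=7, B=4, C, H, W]
lead_dim, T, B, img_shape = infer_leading_dims(obs, 3)   # image itself has 3 dims
flat = obs.reshape(T * B, *img_shape)                    # [T*B, C, H, W] for the network
out = flat.sum(dim=(1, 2, 3))                            # stand-in for a network output
out = restore_leading_dims(out, lead_dim, T, B)          # back to [T, B]
assert out.shape == (7, 4)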
import torch
import math

from rlpyt.distributions.base import Distribution
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import valid_mean

EPS = 1e-8

DistInfo = namedarraytuple("DistInfo", ["mean"])
DistInfoStd = namedarraytuple("DistInfoStd", ["mean", "log_std"])


class Gaussian(Distribution):
    """Multivariate Gaussian with independent variables (diagonal covariance).
    Standard deviation can be provided, as scalar or value per dimension, or it
    will be drawn from the dist_info (possibly learnable), where it is expected
    to have a value per dimension.  Noise clipping or sample clipping optional
    during sampling, but not accounted for in formulas (e.g. entropy).  Clipping
    of standard deviation optional and accounted for in formulas.  Squashing of
    samples to squash * tanh(sample) is optional and accounted for in the
    log_likelihood formula but not entropy.
    """

    def __init__(
            self,
            dim,
            std=None,
            clip=None,
            noise_clip=None,
            min_std=None,
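# --- Illustration (not part of the source above) ---------------------------
# A small sketch, in plain torch, of the diagonal-Gaussian log-likelihood a
# class like Gaussian above computes (before any clipping or tanh squashing).
# The names mean / log_std / x are local to this sketch, not the class's API.
import math
import torch

def diag_gaussian_log_likelihood(x, mean, log_std):
    # log N(x; mean, diag(exp(log_std))^2), summed over the last dimension.
    z = (x - mean) / torch.exp(log_std)
    return -0.5 * torch.sum(z ** 2 + 2 * log_std + math.log(2 * math.pi), dim=-1)

mean = torch.zeros(3)
log_std = torch.zeros(3)   # std = 1
x = torch.zeros(3)
print(diag_gaussian_log_likelihood(x, mean, log_std))  # -1.5 * log(2 * pi) ~ -2.757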
import torch
from collections import namedtuple

from rlpyt.algos.base import RlAlgorithm
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.logging import logger
from rlpyt.replays.non_sequence.frame import (UniformReplayFrameBuffer,
    PrioritizedReplayFrameBuffer, AsyncUniformReplayFrameBuffer,
    AsyncPrioritizedReplayFrameBuffer)
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import select_at_indexes, valid_mean
from rlpyt.algos.utils import valid_from_done

OptInfo = namedtuple("OptInfo", ["loss", "gradNorm", "tdAbsErr"])
SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ["observation", "action", "reward", "done"])


class DQN(RlAlgorithm):
    """
    DQN algorithm training from a replay buffer, with options for double-dqn,
    n-step returns, and prioritized replay.
    """

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=32,
            min_steps_learn=int(5e4),
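# --- Illustration (not part of the source above) ---------------------------
# Hedged sketch of the Double-DQN n-step target the options above refer to:
# the online network picks the argmax action at s_{t+n}, the target network
# evaluates it.  The helper name and tensors are placeholders, not DQN's API.
import torch

def double_dqn_target(return_n, done_n, q_online_next, q_target_next,
        discount, n_step):
    # return_n: n-step discounted reward sum; q_*_next: [B, A] values at s_{t+n}.
    next_a = torch.argmax(q_online_next, dim=-1)
    next_q = q_target_next.gather(-1, next_a.unsqueeze(-1)).squeeze(-1)
    return return_n + (1. - done_n.float()) * (discount ** n_step) * next_q

q_on = torch.tensor([[1.0, 2.0], [3.0, 0.5]])
q_tg = torch.tensor([[0.5, 1.5], [2.0, 0.0]])
print(double_dqn_target(torch.tensor([1.0, 1.0]), torch.tensor([False, True]),
    q_on, q_tg, discount=0.99, n_step=3))  # ~[2.455, 1.000]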
from collections import namedtuple

from torch.nn.parallel import DistributedDataParallel as DDP
# from torch.nn.parallel import DistributedDataParallelCPU as DDPC  # Deprecated

from rlpyt.agents.base import BaseAgent, AgentStep
from rlpyt.models.qpg.mlp import QofMuMlpModel, PiMlpModel
from rlpyt.utils.quick_args import save__init__args
from rlpyt.distributions.gaussian import Gaussian, DistInfoStd
from rlpyt.utils.buffer import buffer_to
from rlpyt.utils.logging import logger
from rlpyt.models.utils import update_state_dict
from rlpyt.utils.collections import namedarraytuple

MIN_LOG_STD = -20
MAX_LOG_STD = 2

AgentInfo = namedarraytuple("AgentInfo", ["dist_info"])
Models = namedtuple("Models", ["pi", "q1", "q2", "v"])


class SacAgent(BaseAgent):
    """Agent for SAC algorithm, including action-squashing, using twin Q-values."""

    def __init__(
            self,
            ModelCls=PiMlpModel,  # Pi model.
            QModelCls=QofMuMlpModel,
            model_kwargs=None,  # Pi model.
            q_model_kwargs=None,
            v_model_kwargs=None,
            initial_model_state_dict=None,  # All models.
            pretrain_std=0.75,  # With squash 0.75 is near uniform.
            ):
import torch
import torch.nn.functional as F

from rlpyt.models.mlp import MlpModel
from rlpyt.ul.models.dmlab_conv2d import DmlabConv2dModel
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.logging import logger
from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims

RnnState = namedarraytuple("RnnState", ["h", "c"])  # For downstream namedarraytuples to work


def weight_init(m):
    if isinstance(m, (torch.nn.Linear, torch.nn.Conv2d)):
        torch.nn.init.kaiming_normal_(m.weight, mode="fan_in", nonlinearity="relu")
        torch.nn.init.zeros_(m.bias)


class DmlabPgLstmModel(torch.nn.Module):

    def __init__(
            self,
            image_shape,
            output_size,
            lstm_size,
            skip_connections=True,
            hidden_sizes=None,
            kiaming_init=True,
            stop_conv_grad=False,
import torch

from rlpyt.agents.base import (AgentStep, RecurrentAgentMixin,
    AlternatingRecurrentAgentMixin)
from rlpyt.agents.dqn.dqn_agent import DqnAgent
from rlpyt.utils.buffer import buffer_to, buffer_func, buffer_method
from rlpyt.utils.collections import namedarraytuple

AgentInfo = namedarraytuple("AgentInfo", ["q", "prev_rnn_state"])


class R2d1AgentBase(DqnAgent):
    """Base agent for recurrent DQN (to add recurrent mixin)."""

    def __call__(self, observation, prev_action, prev_reward, init_rnn_state):
        # Assume init_rnn_state already shaped: [N,B,H]
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward,
            init_rnn_state), device=self.device)
        output = self.model(*model_inputs)  # q, rnn_state
        return output  # Leave rnn state on device.

    def to_agent_step(self, output):
        """Convert the output of the NN model into step info for the agent."""
        q, rnn_state = output
        # q = q.cpu()
        action = self.distribution.sample(q)
        prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state, torch.zeros_like)
""" Methods to overwrite for the saved replay buffer, to return different samples than was used by the replay buffer object used to collect the samples. """ import numpy as np from rlpyt.utils.buffer import torchify_buffer, buffer_func from rlpyt.utils.misc import extract_sequences from rlpyt.utils.collections import namedarraytuple from rlpyt.utils.logging import logger SamplesFromReplay = namedarraytuple("SamplesFromReplay", ["observation", "action", "reward", "done", "prev_action", "prev_reward"]) SamplesFromReplayPC = namedarraytuple("SamplesFromReplayPC", SamplesFromReplay._fields + ("pixctl_return",)) class UlForRlReplayBuffer: def __init__( self, replay_buffer, replay_T=1, validation_split=0.0, pixel_control_buffer=None, ): self.load_replay(replay_buffer, pixel_control_buffer) self.replay_T = replay_T self.validation_t = int((self.T - replay_T) * (1 - validation_split)) if pixel_control_buffer is not None:
from abc import ABC

from rlpyt.algos.pg.ppo import PPO
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.tensor import valid_mean
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import buffer_to, buffer_method
from rlpyt.utils.misc import iterate_mb_idxs
from rlpyt.utils.collections import namedarraytuple

from intrinsic_rl.algos.pg.base import IntrinsicPolicyGradientAlgo

import cv2  ###

LossInputs = namedarraytuple("LossInputs", [
    "agent_inputs", "action", "next_obs", "ext_return", "ext_adv",
    "int_return", "int_adv", "valid", "old_dist_info"
])
OptInfo = namedarraytuple("OptInfo", [
    "loss", "policyLoss", "valueLoss", "entropyLoss", "bonusLoss",
    "extrinsicValue", "intrinsicValue", "intrinsicReward",
    "discountedIntrinsicReturn", "gradNorm", "entropy", "perplexity",
    "meanObsRmsModel", "varObsRmsModel", "meanIntRetRmsModel", "varIntRetRmsModel"
])


class IntrinsicPPO(PPO, IntrinsicPolicyGradientAlgo, ABC):
    """
    Abstract base class for PPO using an intrinsic bonus model.
    Must override abstract method ``extract_bonus_inputs`` based on specific
    intrinsic bonus model / algorithm to be used.
from collections import namedtuple

from gpytorch.mlls import ExactMarginalLogLikelihood

from rlpyt.algos.base import RlAlgorithm
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.logging import logger
from rlpyt.replays.model_based import ModelBasedBuffer
from rlpyt.utils.collections import namedarraytuple
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.tensor import valid_mean
from rlpyt.utils.visom import VisdomLinePlotter
from rlpyt.algos.utils import valid_from_done

OptInfo = namedtuple("OptInfo", ["muLoss", "dLoss", "muGradNorm", "dGradNorm"])
SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ["observation", "prev_observation", "action", "reward", "done", "timeout"])


class GP_Mlp(RlAlgorithm):
    """Model-based algorithm that uses a Gaussian Process to predict the model
    and a deep neural network for control."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=500,
            buffer_size=int(1e6),
            min_steps_learn=int(1e1),  # very efficient
            target_update_tau=0.9,
            target_update_interval=5,
from rlpyt.utils.collections import namedarraytuple

AgentInfo = namedarraytuple("AgentInfo", ["dist_info", "value"])
AgentInfoRnn = namedarraytuple("AgentInfoRnn",
    ["dist_info", "value", "prev_rnn_state"])
""" 这个class已经抽象到和具体的environment(例如Atari)无关,而它的子类还是有可能和具体的environment相关的。 """ import torch from rlpyt.agents.base import BaseAgent, AgentStep from rlpyt.agents.dqn.epsilon_greedy import EpsilonGreedyAgentMixin from rlpyt.distributions.epsilon_greedy import EpsilonGreedy from rlpyt.models.utils import strip_ddp_state_dict from rlpyt.utils.buffer import buffer_to from rlpyt.utils.collections import namedarraytuple from rlpyt.models.utils import update_state_dict AgentInfo = namedarraytuple("AgentInfo", "q") class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent): def __call__(self, observation, prev_action, prev_reward): """ __call__使得一个class可以像一个method一样调用,即:假设agent为DqnAgent的一个对象,那么agent(observation, prev_action, prev_reward)就等同于调用agent.__call__(observation, prev_action, prev_reward) """ prev_action = self.distribution.to_onehot(prev_action) model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device) q = self.model(*model_inputs) # torch.nn.Module子类的实例,使用torch.nn.Module里定义的__call__调用,相当于计算模型输出(一个Tensor) return q.cpu() # 将tensor移动到CPU(内存) def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None): """
import math
import numpy as np

from rlpyt.replays.n_step import BaseNStepReturnBuffer
from rlpyt.utils.buffer import buffer_from_example, buffer_func, torchify_buffer
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import extract_sequences

SamplesFromReplay = namedarraytuple(
    "SamplesFromReplay",
    [
        "all_observation",
        "all_action",
        "all_reward",
        "return_",
        "done",
        "done_n",
        "init_rnn_state",
    ],
)
SamplesToBuffer = None


class SequenceNStepReturnBuffer(BaseNStepReturnBuffer):
    """Base n-step return buffer for sequence replays.  Includes storage of
    the agent's recurrent (RNN) state.

    Use of ``rnn_state_interval>1`` only periodically stores RNN state, to
    save memory.  The replay mechanism must account for the
from rlpyt.utils.collections import namedarraytuple

AgentInfo = namedarraytuple("AgentInfo", ["dist_info", "value"])
AgentInfoTwin = namedarraytuple("AgentInfoTwin",
    ["dist_info", "dist_int_info", "value", "int_value"])

AgentInfoRnn = namedarraytuple("AgentInfoRnn",
    ["dist_info", "value", "prev_rnn_state"])
AgentInfoRnnTwin = namedarraytuple("AgentInfoRnnTwin", [
    "dist_info", "dist_int_info", "value", "int_value",
    "prev_rnn_state", "prev_int_rnn_state"])

IcmInfo = namedarraytuple("IcmInfo", [])
NdigoInfo = namedarraytuple("NdigoInfo", ["prev_gru_state"])
RndInfo = namedarraytuple("RndInfo", [])
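# --- Illustration (not part of the source above) ---------------------------
# Why agent/env info structures are declared as namedarraytuple rather than
# plain namedtuple: indexing or slicing the tuple is applied to every leaf
# array at once.  Assumes rlpyt is importable; field values are arbitrary.
import numpy as np
from rlpyt.utils.collections import namedarraytuple

Info = namedarraytuple("Info", ["value", "reward"])
info = Info(value=np.arange(10), reward=np.arange(10) * 2.0)
first_five = info[:5]                                   # slices every field at once
print(first_five.value.shape, first_five.reward.shape)  # (5,) (5,)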
import multiprocessing as mp

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.parallel import DistributedDataParallelCPU as DDPC

from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.synchronize import RWLock
from rlpyt.utils.logging import logger
from rlpyt.models.utils import strip_ddp_state_dict

AgentInputs = namedarraytuple("AgentInputs",
    ["observation", "prev_action", "prev_reward"])
AgentStep = namedarraytuple("AgentStep", ["action", "agent_info"])


class BaseAgent:

    recurrent = False
    alternating = False

    def __init__(self, ModelCls=None, model_kwargs=None, initial_model_state_dict=None):
        save__init__args(locals())
        self.model = None  # type: torch.nn.Module
        self.shared_model = None
        self.distribution = None
        self.device = torch.device("cpu")
        self._mode = None
        if self.model_kwargs is None:
            self.model_kwargs = dict()
        # The rest only for async operations:
import numpy as np
from collections import namedtuple

from rlpyt.utils.collections import namedarraytuple, AttrDict

Samples = namedarraytuple("Samples", ["agent", "env"])

AgentSamples = namedarraytuple("AgentSamples",
    ["action", "prev_action", "agent_info"])
AgentSamplesBsv = namedarraytuple("AgentSamplesBsv",
    ["action", "prev_action", "agent_info", "bootstrap_value"])
EnvSamples = namedarraytuple("EnvSamples", [
    "reward", "prev_reward", "observation", "next_observation", "done", "env_info"
])


class BatchSpec(namedtuple("BatchSpec", "T B")):
    """
    T: int  Number of time steps, >=1.
    B: int  Number of separate trajectory segments (i.e. # env instances), >=1.
    """
    __slots__ = ()

    @property
    def size(self):
        return self.T * self.B


class TrajInfo(AttrDict):
from collections import namedtuple

from rlpyt.algos.base import RlAlgorithm
from rlpyt.replays.non_sequence.uniform import (UniformReplayBuffer,
    AsyncUniformReplayBuffer)
from rlpyt.replays.non_sequence.time_limit import (TlUniformReplayBuffer,
    AsyncTlUniformReplayBuffer)
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.buffer import buffer_to
from rlpyt.distributions.gaussian import Gaussian
from rlpyt.distributions.gaussian import DistInfo as GaussianDistInfo
from rlpyt.utils.tensor import valid_mean
from rlpyt.algos.utils import valid_from_done

OptInfo = namedtuple("OptInfo",
    ["q1Loss", "q2Loss", "piLoss",
     "q1GradNorm", "q2GradNorm", "piGradNorm",
     "q1", "q2", "piMu", "piLogStd", "qMeanDiff", "alpha"])
SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ["observation", "action", "reward", "done"])
SamplesToBufferTl = namedarraytuple("SamplesToBufferTl",
    SamplesToBuffer._fields + ("timeout",))


class SAC(RlAlgorithm):
    """Soft actor critic algorithm, training from a replay buffer."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=256,
            min_steps_learn=int(1e4),
            replay_size=int(1e6),
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
# from torch.nn.parallel import DistributedDataParallelCPU as DDPC  # Deprecated

from rlpyt.agents.base import BaseAgent, AgentStep
from rlpyt.utils.quick_args import save__init__args
from rlpyt.distributions.gaussian import Gaussian, DistInfo
from rlpyt.utils.buffer import buffer_to
from rlpyt.utils.logging import logger
from rlpyt.models.qpg.mlp import MuMlpModel, QofMuMlpModel
from rlpyt.models.utils import update_state_dict
from rlpyt.utils.collections import namedarraytuple

AgentInfo = namedarraytuple("AgentInfo", ["mu"])


class DdpgAgent(BaseAgent):
    """Agent for deep deterministic policy gradient algorithm."""

    shared_mu_model = None

    def __init__(
            self,
            ModelCls=MuMlpModel,  # Mu model.
            QModelCls=QofMuMlpModel,
            model_kwargs=None,  # Mu model.
            q_model_kwargs=None,
            initial_model_state_dict=None,  # Mu model.
            initial_q_model_state_dict=None,
import os
import argparse

import torch

from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims
from rlpyt.utils.launching.affinity import (encode_affinity,
    affinity_from_code, prepend_run_slot)

from ul_gen.algos.discrete_sac_ae import DiscreteSACAE
from ul_gen.configs.discrete_sac_ae_config import configs
from ul_gen.agents.discrete_sac_ae_agent import DiscreteSacAEAgent

parser = argparse.ArgumentParser()
parser.add_argument("--savepath", type=str, default="./ae_data/")
args = parser.parse_args()
os.makedirs(args.savepath, exist_ok=True)

EmptyAgentInfo = namedarraytuple("EmptyAgentInfo", [])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

affinity_code = encode_affinity(
    n_cpu_core=4,
    n_gpu=1,
    n_socket=1,
)
affinity = affinity_from_code(prepend_run_slot(0, affinity_code))

# Get Params
config = configs["discrete_sac_ae"]

# Setup the data collection pipeline
# Edit the sampler kwargs to get a larger batch size
config["sampler"]["batch_T"] = 24
import numpy as np

from rlpyt.replays.n_step import BaseNStepReturnBuffer
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.buffer import torchify_buffer

SamplesFromReplay = namedarraytuple("SamplesFromReplay",
    ["agent_inputs", "action", "return_", "done", "done_n", "target_inputs"])


class NStepReturnBuffer(BaseNStepReturnBuffer):
    """Definition of what fields are replayed from basic n-step return buffer."""

    def extract_batch(self, T_idxs, B_idxs):
        """From buffer locations `[T_idxs,B_idxs]`, extract data needed for
        training, including target values at `T_idxs + n_step_return`.  Returns
        namedarraytuple of torch tensors (see file for all fields).  Each tensor
        has leading batch dimension ``len(T_idxs)==len(B_idxs)``, but individual
        samples are drawn, so no leading time dimension."""
        s = self.samples
        target_T_idxs = (T_idxs + self.n_step_return) % self.T
        batch = SamplesFromReplay(
            agent_inputs=AgentInputs(
                observation=self.extract_observation(T_idxs, B_idxs),
                prev_action=s.action[T_idxs - 1, B_idxs],
                prev_reward=s.reward[T_idxs - 1, B_idxs],
            ),
            action=s.action[T_idxs, B_idxs],
            return_=self.samples_return_[T_idxs, B_idxs],
            done=self.samples.done[T_idxs, B_idxs],
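# --- Illustration (not part of the source above) ---------------------------
# Hedged sketch of the n-step return stored in `return_` above, computed
# directly on toy data so the indexing in extract_batch (targets taken at
# T_idxs + n_step_return) is easier to follow.
import numpy as np

def n_step_return(reward, done, n, discount):
    # reward, done: [T] arrays; n-step return starting at t=0, truncated at
    # the first done.
    ret, running_discount = 0.0, 1.0
    for k in range(n):
        ret += running_discount * reward[k]
        if done[k]:
            break
        running_discount *= discount
    return ret

reward = np.array([1.0, 1.0, 1.0, 1.0])
done = np.array([False, False, False, False])
print(n_step_return(reward, done, n=3, discount=0.99))  # 1 + 0.99 + 0.99**2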
import torch

from rlpyt.algos.base import RlAlgorithm
from rlpyt.utils.collections import namedarraytuple

from dreamer.utils.module import get_parameters, FreezeParameters

torch.autograd.set_detect_anomaly(True)  # used for debugging gradients

loss_info_fields = ['model_loss', 'actor_loss', 'value_loss', 'prior_entropy',
                    'post_entropy', 'divergence', 'reward_loss', 'image_loss',
                    'bisim_loss', 'pcont_loss']
LossInfo = namedarraytuple('LossInfo', loss_info_fields)
OptInfo = namedarraytuple("OptInfo",
    ['loss', 'grad_norm_model', 'grad_norm_actor', 'grad_norm_value'] + loss_info_fields)


class Dreamer(RlAlgorithm):

    def __init__(
            self,
            # Hyper-parameters
            batch_size=50,
            batch_length=50,
            train_every=1000,
            train_steps=100,
import torch
import torch.distributions as td
import torch.nn as nn
import torch.nn.functional as tf

from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.buffer import buffer_method

from dreamer.utils.module import FreezeParameters

RSSMState = namedarraytuple('RSSMState', ['mean', 'std', 'stoch', 'deter'])


def stack_states(rssm_states: list, dim):
    return RSSMState(
        torch.stack([state.mean for state in rssm_states], dim=dim),
        torch.stack([state.std for state in rssm_states], dim=dim),
        torch.stack([state.stoch for state in rssm_states], dim=dim),
        torch.stack([state.deter for state in rssm_states], dim=dim),
    )


def get_feat(rssm_state: RSSMState):
    return torch.cat((rssm_state.stoch, rssm_state.deter), dim=-1)


def get_dist(rssm_state: RSSMState):
    return td.independent.Independent(
        td.Normal(rssm_state.mean, rssm_state.std), 1)


class TransitionBase(nn.Module):
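# --- Illustration (not part of the source above) ---------------------------
# Hedged usage sketch for the RSSMState helpers defined above: stack a short
# rollout of states along a new time dimension, then read features and the
# latent distribution.  Shapes here are arbitrary examples.
import torch

states = [RSSMState(mean=torch.zeros(4, 30), std=torch.ones(4, 30),
                    stoch=torch.zeros(4, 30), deter=torch.zeros(4, 200))
          for _ in range(5)]
seq = stack_states(states, dim=0)    # leading time dim of length 5
feat = get_feat(seq)                 # cat(stoch, deter) -> [5, 4, 230]
dist = get_dist(seq)                 # Independent Normal over the last dim
print(feat.shape, dist.batch_shape)  # torch.Size([5, 4, 230]) torch.Size([5, 4])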
import torch

from rlpyt.distributions.base import Distribution
from rlpyt.distributions.discrete import DiscreteMixin
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import valid_mean, select_at_indexes

EPS = 1e-8

DistInfo = namedarraytuple("DistInfo", ["prob"])


class Categorical(DiscreteMixin, Distribution):

    def kl(self, old_dist_info, new_dist_info):
        p = old_dist_info.prob
        q = new_dist_info.prob
        return torch.sum(p * (torch.log(p + EPS) - torch.log(q + EPS)), dim=-1)

    def mean_kl(self, old_dist_info, new_dist_info, valid=None):
        return valid_mean(self.kl(old_dist_info, new_dist_info), valid)

    def sample(self, dist_info):
        p = dist_info.prob
        sample = torch.multinomial(p.view(-1, self.dim), num_samples=1)
        return sample.view(p.shape[:-1]).type(self.dtype)  # Returns indexes.

    def entropy(self, dist_info):
        p = dist_info.prob
        return -torch.sum(p * torch.log(p + EPS), dim=-1)

    def log_likelihood(self, indexes, dist_info):
import math

from rlpyt.replays.sequence.n_step import (SequenceNStepReturnBuffer,
    SamplesFromReplay)
from rlpyt.replays.async_ import AsyncReplayBufferMixin
from rlpyt.replays.sum_tree import SumTree, AsyncSumTree
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import torchify_buffer, numpify_buffer

SamplesFromReplayPri = namedarraytuple("SamplesFromReplayPri",
    SamplesFromReplay._fields + ("is_weights",))


class PrioritizedSequenceReplay:
    """Prioritized experience replay of sequences using sum-tree prioritization.
    The size of the sum-tree is based on the number of RNN states stored, since
    valid sequences must start with an RNN state.  Hence using periodic storage
    with ``rnn_state_interval>1`` results in a faster tree using less memory.
    Replay buffer priorities are indexed to the start of the whole sequence to
    be returned, regardless of whether the initial part is used only as RNN
    warmup.

    Requires ``batch_T`` to be set and fixed at instantiation, so that the
    priority tree has a fixed scheme for which samples are temporarily invalid
    due to the looping cursor (the tree must set and propagate 0-priorities for
    those samples, so dynamic ``batch_T`` could require additional tree
    operations for every sampling event).

    Parameter ``input_priority_shift`` is used to assign input priorities to a
    starting time-step which is shifted from the samples input to
    ``append_samples()``.  For example, in R2D1, using replay sequences of 120
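# --- Illustration (not part of the source above) ---------------------------
# Toy sketch of proportional prioritized sampling (not the SumTree class):
# sample indices with probability p_i^alpha / sum_j p_j^alpha and compute
# importance-sampling weights (1 / (N * P(i)))^beta, normalized by the max.
import numpy as np

def sample_prioritized(priorities, batch_size, alpha=0.6, beta=0.4, rng=None):
    rng = rng if rng is not None else np.random.default_rng()
    probs = priorities ** alpha
    probs = probs / probs.sum()
    idxs = rng.choice(len(priorities), size=batch_size, p=probs)
    is_weights = (len(priorities) * probs[idxs]) ** (-beta)
    return idxs, is_weights / is_weights.max()

idxs, w = sample_prioritized(np.array([1.0, 2.0, 4.0, 8.0]), batch_size=2)
print(idxs, w)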
import torch
from recordclass import recordclass

from rlpyt.replays.sequence.frame import (AsyncUniformSequenceReplayFrameBuffer,
    PrioritizedSequenceReplayFrameBuffer)
from rlpyt.utils.buffer import torchify_buffer, numpify_buffer
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import extract_sequences
import traceback

Transition = recordclass('Transition',
    ('timestep', 'state', 'action', 'reward', 'value', 'policy', 'nonterminal'))
# TODO: Set appropriate default policy value
blank_trans = Transition(0, torch.zeros(84, 84, dtype=torch.uint8), 0, 0., 0., 0, False)
blank_batch_trans = Transition(0, torch.zeros(1, 84, 84, dtype=torch.uint8), 0, 0., 0., 0, False)

PrioritizedSamples = namedarraytuple("PrioritizedSamples",
    ["samples", "priorities"])
SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ["observation", "action", "reward", "done", "policy_probs", "value"])

EPS = 1e-6


def samples_to_buffer(observation, action, reward, done, policy_probs, value,
        priorities=None):
    samples = SamplesToBuffer(observation=observation, action=action,
import torch

from rlpyt.algos.pg.base import PolicyGradientAlgo, OptInfo
from rlpyt.agents.base import AgentInputs, AgentInputsRnn
from rlpyt.utils.tensor import valid_mean
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import buffer_to, buffer_method
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import iterate_mb_idxs

LossInputs = namedarraytuple("LossInputs", [
    "agent_inputs", "action", "return_", "advantage", "valid", "old_dist_info"
])


class PPO(PolicyGradientAlgo):
    """
    Proximal Policy Optimization algorithm.  Trains the agent by taking
    multiple epochs of gradient steps on minibatches of the training data at
    each iteration, with advantages computed by generalized advantage
    estimation.  Uses clipped likelihood ratios in the policy loss.
    """

    def __init__(
            self,
            discount=0.99,
            learning_rate=0.001,
            value_loss_coeff=1.,
            entropy_loss_coeff=0.01,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=1.,
import multiprocessing as mp

from rlpyt.agents.base import AgentInputs
from rlpyt.samplers.parallel.base import ParallelSamplerBase
from rlpyt.samplers.parallel.gpu.action_server import ActionServer
from rlpyt.samplers.parallel.gpu.collectors import (GpuResetCollector,
    GpuEvalCollector)
from rlpyt.utils.collections import namedarraytuple, AttrDict
from rlpyt.utils.synchronize import drain_queue
from rlpyt.utils.buffer import buffer_from_example, torchify_buffer

StepBuffer = namedarraytuple("StepBuffer",
    ["observation", "action", "reward", "done", "agent_info"])


class GpuSamplerBase(ParallelSamplerBase):
    """Base class for parallel samplers which use worker processes to execute
    environment steps on CPU resources but the master process to execute agent
    forward passes for action selection, presumably on GPU.  Use GPU-based
    collector classes.

    In addition to the usual batch buffer for data samples, allocates a step
    buffer over shared memory, which is used for communication with workers.
    The step buffer includes `observations`, which the workers write and the
    master reads, and `actions`, which the master writes and the workers read.
    (The step buffer has leading dimension [`batch_B`], for the number of
    parallel environments, and each worker gets its own slice along that
    dimension.)  The step buffer object is held in both numpy array and torch
    tensor forms over the same memory; e.g. workers write to the numpy array
    form, and the agent is able to read the torch tensor form.
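# --- Illustration (not part of the source above) ---------------------------
# The "same memory in numpy and torch form" idea the step buffer above relies
# on: torch.from_numpy shares storage with the numpy array, so a worker
# writing the numpy view is immediately visible through the torch view the
# master/agent reads.  Shapes are arbitrary examples.
import numpy as np
import torch

np_obs = np.zeros((4, 3, 84, 84), dtype=np.float32)  # [batch_B, C, H, W]
torch_obs = torch.from_numpy(np_obs)                 # shares memory, no copy
np_obs[2] = 1.0                                      # worker-side write
print(torch_obs[2].sum().item())                     # 21168.0 (3 * 84 * 84), master-side read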
import torch
import torch.nn as nn

from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims
from rlpyt.utils.collections import namedarraytuple
from rlpyt.models.conv2d import Conv2dHeadModel
from rlpyt.models.mlp import MlpModel
from rlpyt.models.dqn.dueling import DuelingHeadModel

RnnState = namedarraytuple("RnnState", ["h"])


class GRUModel(torch.nn.Module):
    """2D convolutional neural network (for multiple video frames per
    observation) feeding into a GRU and MLP output for Q-value outputs for the
    action set.  Able to track intermediate variables."""

    def __init__(
            self,
            image_shape,
            output_size,
            fc_size=512,  # Between conv and lstm.
            lstm_size=512,
            head_size=512,
            use_recurrence=True,
            dueling=False,
            use_maxpool=False,
            channels=None,  # None uses default.
            kernel_sizes=None,
            strides=None,
            paddings=None,
            ):
import torch

from qec.vmpo.v_mpo import VMPO, OptInfo
from rlpyt.agents.base import AgentInputs, AgentInputsRnn
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import buffer_to, buffer_method
from rlpyt.utils.collections import namedarraytuple, namedtuple
from qec.vmpo.on_policy_replay import AsyncUniformSequenceReplayBuffer

LossInputs = namedarraytuple("LossInputs",
    ["dist_info", "value", "action", "return_", "advantage", "valid", "old_dist_info"])
SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ['agent_inputs', "action", "reward", "done", "dist_info"])
SamplesToBufferTl = namedarraytuple("SamplesToBufferTl",
    SamplesToBuffer._fields + ("timeout",))
SamplesToBufferRnn = namedarraytuple("SamplesToBufferRnn",
    SamplesToBuffer._fields + ("prev_rnn_state",))
OptInfo = namedarraytuple("OptInfo", OptInfo._fields + ("optim_buffer_wait_time",))


class AsyncVMPO(VMPO):

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            batch_B=64,
            batch_T=40,
            **kwargs
            ):
        super().__init__(**kwargs)
from collections import namedtuple

from rlpyt.algos.dqn.dqn import DQN, SamplesToBuffer
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.logging import logger
from rlpyt.utils.collections import namedarraytuple
from rlpyt.replays.sequence.frame import (UniformSequenceReplayFrameBuffer,
    PrioritizedSequenceReplayFrameBuffer, AsyncUniformSequenceReplayFrameBuffer,
    AsyncPrioritizedSequenceReplayFrameBuffer)
from rlpyt.utils.tensor import select_at_indexes, valid_mean
from rlpyt.algos.utils import valid_from_done, discount_return_n_step
from rlpyt.utils.buffer import buffer_to, buffer_method, torchify_buffer

OptInfo = namedtuple("OptInfo", ["loss", "gradNorm", "tdAbsErr", "priority"])
SamplesToBufferRnn = namedarraytuple("SamplesToBufferRnn",
    SamplesToBuffer._fields + ("prev_rnn_state",))
PrioritiesSamplesToBuffer = namedarraytuple("PrioritiesSamplesToBuffer",
    ["priorities", "samples"])


class R2D1(DQN):
    """Recurrent-replay DQN with options for: Double-DQN, Dueling Architecture,
    n-step returns, prioritized replay."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.997,
            batch_T=80,
            batch_B=64,
import math

from rlpyt.replays.base import BaseReplayBuffer
from rlpyt.replays.non_sequence.frame import (
    PrioritizedReplayFrameBuffer,
    UniformReplayFrameBuffer,
)
from rlpyt.replays.sum_tree import SumTree
from rlpyt.utils.buffer import (
    buffer_from_example,
    buffer_func,
    get_leading_dims,
    torchify_buffer,
)
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import extract_sequences

SamplesFromReplay = namedarraytuple(
    "SamplesFromReplay", ["observation", "action", "reward", "done"]
)


class RlWithUlUniformReplayBuffer(BaseReplayBuffer):

    def __init__(self, example, size, B, replay_T):
        self.T = T = math.ceil(size / B)
        self.B = B
        self.size = T * B
        self.t = 0  # cursor
        self.replay_T = replay_T
        self.samples = buffer_from_example(example, (T, B),
            share_memory=self.async_)
        self._buffer_full = False

    def append_samples(self, samples):
        T, B = get_leading_dims(samples, n_dim=2)