def __init__(self, env,
             n_samples=10,
             gamma=0.95,
             horizon=None,
             epsilon=1e-6,
             **kwargs):
    # initialize base class
    assert env.is_generative(), \
        "MBQVI requires a generative model."
    assert isinstance(env.observation_space, Discrete), \
        "MBQVI requires a finite state space."
    assert isinstance(env.action_space, Discrete), \
        "MBQVI requires a finite action space."
    Agent.__init__(self, env, **kwargs)

    #
    self.n_samples = n_samples
    self.gamma = gamma
    self.horizon = horizon
    self.epsilon = epsilon

    # empirical MDP, created in fit()
    self.R_hat = None
    self.P_hat = None

    # value functions
    self.V = None
    self.Q = None
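# --- Usage sketch (illustrative, not from the original source) ---
# A minimal sketch of how this constructor might be called, assuming it belongs
# to rlberry's MBQVIAgent and that GridWorld exposes a generative model with
# finite state and action spaces; the import paths and the fit()/policy() calls
# reflect rlberry's usual interface and are assumptions here.
from rlberry.agents.mbqvi import MBQVIAgent
from rlberry.envs.finite import GridWorld

env = GridWorld(nrows=5, ncols=5)        # finite spaces, generative model
agent = MBQVIAgent(env, n_samples=20, gamma=0.95, epsilon=1e-6)
agent.fit()                              # build R_hat/P_hat and run value iteration
state = env.reset()
action = agent.policy(state)             # greedy action w.r.t. the estimated Q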
def __init__(self, env, **kwargs):
    """
    Parameters
    ----------
    env : Model
        Environment used to fit the agent.
    """
    Agent.__init__(self, env, **kwargs)
def __init__(self, env,
             n_episodes=1000,
             horizon=100,
             gamma=0.99,
             batch_size=16,
             percentile=70,
             learning_rate=0.01,
             optimizer_type='ADAM',
             policy_net_fn=None,
             **kwargs):
    Agent.__init__(self, env, **kwargs)

    # check environment
    assert isinstance(self.env.observation_space, spaces.Box)
    assert isinstance(self.env.action_space, spaces.Discrete)

    # parameters
    self.gamma = gamma
    self.batch_size = batch_size
    self.n_episodes = n_episodes
    self.percentile = percentile
    self.learning_rate = learning_rate
    self.horizon = horizon

    # random number generator
    self.rng = seeding.get_rng()

    #
    self.policy_net_fn = policy_net_fn \
        or (lambda: default_policy_net_fn(self.env))

    self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                             'lr': learning_rate}

    # policy net
    self.policy_net = self.policy_net_fn().to(device)

    # loss function and optimizer
    self.loss_fn = nn.CrossEntropyLoss()
    self.optimizer = optimizer_factory(
        self.policy_net.parameters(),
        **self.optimizer_kwargs)

    # memory
    self.memory = CEMMemory(self.batch_size)

    # default writer
    self.writer = PeriodicWriter(self.name,
                                 log_every=5*logger.getEffectiveLevel())
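# --- Sketch of a custom `policy_net_fn` (illustrative, not from the original source) ---
# The constructor calls `policy_net_fn()` with no arguments and expects a torch
# module mapping observations to action logits (CrossEntropyLoss is applied to
# the output). The layer sizes and dimensions below are arbitrary assumptions.
import torch.nn as nn

def my_policy_net_fn():
    obs_dim, n_actions = 4, 2   # e.g. CartPole-like dimensions (assumption)
    return nn.Sequential(
        nn.Linear(obs_dim, 64),
        nn.ReLU(),
        nn.Linear(64, n_actions))   # raw logits; softmax is left to the loss/sampling

# agent = CEMAgent(env, policy_net_fn=my_policy_net_fn)   # agent class name is an assumption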
def __init__(self,
             env,
             policy,
             learning_rate=7e-4,
             n_steps: int = 5,
             gamma: float = 0.99,
             gae_lambda: float = 1.0,
             ent_coef: float = 0.0,
             vf_coef: float = 0.5,
             max_grad_norm: float = 0.5,
             rms_prop_eps: float = 1e-5,
             use_rms_prop: bool = True,
             use_sde: bool = False,
             sde_sample_freq: int = -1,
             normalize_advantage: bool = False,
             tensorboard_log=None,
             create_eval_env=False,
             policy_kwargs=None,
             verbose: int = 0,
             seed=None,
             device="auto",
             _init_setup_model: bool = True,
             **kwargs):
    # Generate seed for A2CStableBaselines using rlberry seeding
    self.rng = seeding.get_rng()
    seed = self.rng.integers(2**32).item()

    # init stable baselines class
    self.wrapped = A2CStableBaselines(
        policy,
        env,
        learning_rate,
        n_steps,
        gamma,
        gae_lambda,
        ent_coef,
        vf_coef,
        max_grad_norm,
        rms_prop_eps,
        use_rms_prop,
        use_sde,
        sde_sample_freq,
        normalize_advantage,
        tensorboard_log,
        create_eval_env,
        policy_kwargs,
        verbose,
        seed,
        device,
        _init_setup_model)

    # init rlberry base class
    Agent.__init__(self, env, **kwargs)
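# --- Usage sketch (illustrative, not from the original source) ---
# The wrapper forwards its arguments to the wrapped A2CStableBaselines
# (stable-baselines3's A2C, judging by the matching parameter names).
# "MlpPolicy" is a standard stable-baselines3 policy identifier; the agent
# class name `A2CAgent` is an assumption, so its construction is shown as a
# comment only.
import gym

env = gym.make("CartPole-v1")
# agent = A2CAgent(env, policy="MlpPolicy", learning_rate=7e-4, n_steps=5)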
def __init__(self,
             env,
             n_episodes=1000,
             gamma=0.95,
             horizon=None,
             lp_metric=2,
             kernel_type="epanechnikov",
             scaling=None,
             bandwidth=0.05,
             min_dist=0.1,
             max_repr=1000,
             bonus_scale_factor=1.0,
             beta=0.01,
             bonus_type="simplified_bernstein",
             **kwargs):
    # init base class
    Agent.__init__(self, env, **kwargs)

    self.n_episodes = n_episodes
    self.gamma = gamma
    self.horizon = horizon
    self.lp_metric = lp_metric
    self.kernel_type = kernel_type
    self.bandwidth = bandwidth
    self.min_dist = min_dist
    self.bonus_scale_factor = bonus_scale_factor
    self.beta = beta
    self.bonus_type = bonus_type

    # check environment
    assert self.env.is_online()
    assert isinstance(self.env.observation_space, spaces.Box)
    assert isinstance(self.env.action_space, spaces.Discrete)

    # other checks
    assert gamma >= 0 and gamma <= 1.0
    if self.horizon is None:
        assert gamma < 1.0, \
            "If no horizon is given, gamma must be smaller than 1."
        self.horizon = int(np.ceil(1.0 / (1.0 - gamma)))

    # state dimension
    self.state_dim = self.env.observation_space.shape[0]

    # compute scaling, if it is None
    if scaling is None:
        # if high and low are bounded
        if (self.env.observation_space.high == np.inf).sum() == 0 \
                and (self.env.observation_space.low == -np.inf).sum() == 0:
            scaling = self.env.observation_space.high \
                - self.env.observation_space.low
        # if high or low are unbounded
        else:
            scaling = np.ones(self.state_dim)
    else:
        assert scaling.ndim == 1
        assert scaling.shape[0] == self.state_dim
    self.scaling = scaling

    # maximum value
    r_range = self.env.reward_range[1] - self.env.reward_range[0]
    if r_range == np.inf:
        logger.warning("{}: Reward range is infinity. ".format(self.name)
                       + "Clipping it to 1.")
        r_range = 1.0

    if self.gamma == 1.0:
        self.v_max = r_range * horizon
    else:
        self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon))\
            / (1.0 - self.gamma)

    # number of representative states and number of actions
    if max_repr is None:
        max_repr = int(np.ceil(
            (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim))
    self.max_repr = max_repr

    # current number of representative states
    self.M = None
    self.A = self.env.action_space.n

    # declaring variables
    self.episode = None                  # current episode
    self.representative_states = None    # coordinates of all repr states
    self.N_sa = None                     # sum of weights at (s, a)
    self.B_sa = None                     # bonus at (s, a)
    self.R_hat = None                    # reward estimate
    self.P_hat = None                    # transitions estimate
    self.Q = None                        # Q function
    self.V = None                        # V function
    self.Q_policy = None                 # Q function for recommended policy

    # initialize
    self.reset()
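# Worked example (illustrative): with the defaults gamma=0.95 and horizon=None,
# the effective horizon becomes ceil(1 / (1 - 0.95)) = 20, and for a bounded
# reward range r_range the value upper bound used above is
#   v_max = r_range * (1 - 0.95**20) / (1 - 0.95) ≈ 12.83 * r_range.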
def __init__(self,
             env,
             horizon,
             feature_map_fn,
             feature_map_kwargs=None,
             n_episodes=100,
             gamma=0.99,
             bonus_scale_factor=1.0,
             reg_factor=0.1,
             **kwargs):
    Agent.__init__(self, env, **kwargs)

    self.horizon = horizon
    self.n_episodes = n_episodes
    self.gamma = gamma
    self.bonus_scale_factor = bonus_scale_factor
    self.reg_factor = reg_factor
    feature_map_kwargs = feature_map_kwargs or {}
    self.feature_map = feature_map_fn(self.env, **feature_map_kwargs)

    #
    if self.bonus_scale_factor == 0.0:
        self.name = 'LSVI-Random-Expl'

    # maximum value
    r_range = self.env.reward_range[1] - self.env.reward_range[0]
    if r_range == np.inf:
        logger.warning("{}: Reward range is infinity. ".format(self.name)
                       + "Clipping it to 1.")
        r_range = 1.0

    if self.gamma == 1.0:
        self.v_max = r_range * horizon
    else:
        self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon))\
            / (1.0 - self.gamma)

    #
    assert isinstance(self.env.action_space, Discrete), \
        "LSVI-UCB requires discrete actions."

    #
    assert len(self.feature_map.shape) == 1
    self.dim = self.feature_map.shape[0]

    # attributes initialized in reset()
    self.episode = None
    self.lambda_mat = None          # lambda matrix
    self.lambda_mat_inv = None      # inverse of lambda matrix
    self.w_vec = None               # vector representation of Q
    self.w_policy = None            # representation of Q for final policy
    self.reward_hist = None         # reward history
    self.state_hist = None          # state history
    self.action_hist = None         # action history
    self.nstate_hist = None         # next state history

    self.feat_hist = None            # feature history
    self.feat_ns_all_actions = None  # next state features for all actions

    #
    # aux variables (init in reset() too)
    self._rewards = None

    # default writer
    self.writer = PeriodicWriter(self.name,
                                 log_every=15)  # 5*logger.getEffectiveLevel()

    #
    self.reset()
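# --- Sketch of a `feature_map_fn` (illustrative, not from the original source) ---
# The constructor only requires `feature_map_fn(env, **feature_map_kwargs)` to
# return an object whose 1-D `shape` gives the feature dimension; the
# `map(observation, action)` method below (one-hot features over a discrete
# state-action space) is an assumption about how features are later evaluated.
import numpy as np

class OneHotFeatureMap:
    def __init__(self, env):
        self.n_states = env.observation_space.n
        self.n_actions = env.action_space.n
        self.shape = (self.n_states * self.n_actions,)

    def map(self, observation, action):
        # one-hot vector indexed by the (state, action) pair
        feat = np.zeros(self.shape[0])
        feat[observation * self.n_actions + action] = 1.0
        return feat

# agent = LSVIUCBAgent(env, horizon=10, feature_map_fn=OneHotFeatureMap)
# (agent class name is an assumption)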
def __init__(self,
             env,
             horizon,
             pd_kernel_fn,
             pd_kernel_kwargs=None,
             n_episodes=100,
             gamma=0.99,
             bonus_scale_factor=1.0,
             reg_factor=0.1,
             **kwargs):
    Agent.__init__(self, env, **kwargs)

    self.use_jit = True

    self.horizon = horizon
    self.n_episodes = n_episodes
    self.gamma = gamma
    self.bonus_scale_factor = bonus_scale_factor
    self.reg_factor = reg_factor
    self.total_time_steps = 0

    pd_kernel_kwargs = pd_kernel_kwargs or {}
    self.pd_kernel = pd_kernel_fn

    #
    if self.bonus_scale_factor == 0.0:
        self.name = 'KOVI-Random-Expl'

    # maximum value
    r_range = self.env.reward_range[1] - self.env.reward_range[0]
    if r_range == np.inf:
        logger.warning("{}: Reward range is infinity. ".format(self.name)
                       + "Clipping it to 1.")
        r_range = 1.0

    if self.gamma == 1.0:
        self.v_max = r_range * horizon
    else:
        self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon))\
            / (1.0 - self.gamma)

    #
    assert isinstance(self.env.action_space, Discrete), \
        "KOVI requires discrete actions."

    # attributes initialized in reset()
    self.episode = None
    self.gram_mat = None            # Gram matrix
    self.gram_mat_inv = None        # inverse of Gram matrix
    self.alphas = None              # vector representations of Q
    self.reward_hist = None         # reward history
    self.state_hist = None          # state history
    self.action_hist = None         # action history
    self.nstate_hist = None         # next state history
    self.rkhs_norm_hist = None      # norm history

    self.feat_hist = None            # feature history
    self.feat_ns_all_actions = None  # next state features for all actions

    self.new_gram_mat = None
    self.new_gram_mat_inv = None

    #
    # aux variables (init in reset() too)
    self._rewards = None

    # default writer
    self.writer = PeriodicWriter(self.name,
                                 log_every=15)  # 5*logger.getEffectiveLevel()

    #
    self.reset()
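# --- Sketch of a positive-definite kernel for `pd_kernel_fn` (illustrative) ---
# The constructor stores the function unchanged, so the expected call signature
# is an assumption; a Gaussian (RBF) kernel on stacked state-action vectors is
# a common choice for this kind of kernelized optimistic value iteration.
import numpy as np

def rbf_kernel(x, y, bandwidth=1.0):
    # k(x, y) = exp(-||x - y||^2 / (2 * bandwidth^2)), positive definite for bandwidth > 0
    diff = np.asarray(x) - np.asarray(y)
    return np.exp(-np.dot(diff, diff) / (2.0 * bandwidth ** 2))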