def __init__(self, observation_space, action_space, name="MBIE Agent", params={}, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name, params=dict(MBIE_DEFAULTS, **params)) # Policy Setup if starting_policy: self.predict_policy = starting_policy else: self.predict_policy = DiscreteTabularPolicy(self.observation_space, self.action_space, default_value=1 / (1 - self.gamma)) self.backup_lim = int( np.log(1 / (self.params['epsilon_one'] * (1 - self.gamma))) / (1 - self.gamma)) self.policy_iterations = 0 # Model Setup self.model = DiscreteTabularModel( observation_space, action_space, default_reward=self.params['max_reward'], limit=self.params['known_threshold']) self.learn_policy = self.predict_policy
def end_of_episode(self):
    iterate_policy(self.policy, self.model, states=self.model.get_known_states(), num_steps=self.episodic_backup_steps, gamma=self.gamma)
    BaseAgent.end_of_episode(self)
def __init__(self, observation_space, action_space, name="RMax Agent", params=None, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name) # Hyper-parameters self.params = dict(RMAX_DEFAULTS) if params: for key, value in params: self.params[key] = value self.max_reward = self.params['max_reward'] self.epsilon_one = self.params['epsilon_one'] self.known_threshold = self.params['known_threshold'] self.gamma = self.params['gamma'] #self.max_reward = 1 / (1 - self.gamma) # Policy Setup self.starting_policy = starting_policy self.backup_lim = int( np.log(1 / (self.epsilon_one * (1 - self.gamma))) / (1 - self.gamma)) self.stepwise_backup_steps = 1 # self.backup_lim self.episodic_backup_steps = min(self.backup_lim, 5) # Model Setup self.model = KnownTabularModel(action_space.n, self.max_reward, self.known_threshold) self.reset()
def __init__(self, observation_space, action_space, name="Q-Learning Agent", parameters={}, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name, params=dict(QLEARNING_CONSTS, **parameters)) # Policy Setup if starting_policy: self.predict_policy = starting_policy else: self.predict_policy = DiscreteTabularPolicy(self.observation_space, self.action_space, default_value=1/(1-self.gamma)) self.learn_policy = EpsilonGreedy( action_space=self.action_space, policy=self.predict_policy, epsilon=self.epsilon )
def learn(self, state, reward, done=False):
    """
    Incorporate the latest transition and choose the next action.

    :param state: current observation
    :param reward: reward received for the previous action
    :param done: whether the episode has terminated
    :return: the next action to take
    """
    action = self.policy.get_max_action(state)  # Exploit learned values
    self.update(self.prev_state, self.prev_action, reward, state)
    self.prev_action = action
    BaseAgent.learn(self, state, reward, done)
    return action
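# The update call above is defined elsewhere; for a tabular Q-learning agent it
# is conventionally the one-step temporal-difference rule sketched below (a
# standalone illustration over a plain dict of Q-values, not the repo's policy
# class):
def q_learning_update(q, alpha, gamma, n_actions, state, action, reward, next_state):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    best_next = max(q.get((next_state, a), 0.0) for a in range(n_actions))
    old = q.get((state, action), 0.0)
    q[(state, action)] = old + alpha * (reward + gamma * best_next - old)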
def __init__(self, observation_space, action_space, name="UCBVI Agent", params=None, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name) # Hyper-parameters self.params = dict(UCBVI_DEFAULTS) if params: for key, value in params: self.params[key] = value self.max_reward = self.params['max_reward'] self.epsilon_one = self.params['epsilon_one'] self.known_threshold = self.params['known_threshold'] self.gamma = self.params['gamma'] #self.max_reward = 1 / (1 - self.gamma) self.delta = self.params['delta'] # Policy Setup self.starting_policy = starting_policy self.backup_lim = int( np.log(1 / (self.epsilon_one * (1 - self.gamma))) / (1 - self.gamma)) self.stepwise_backup_steps = 0 self.episodic_backup_steps = self.backup_lim self.policy_iterations = 0 # Model Setup self.model = DiscreteTabularModel(observation_space, action_space, default_reward=self.max_reward, limit=self.known_threshold) # Experience Tracking self.last_episode = [] # self.last_episode_model = KnownTabularModel(action_space.n, self.max_reward, 1) self.reset()
def end_of_episode(self):
    self.policy.reset_values()
    if self.episode_learn_steps:
        self.vectorized_iterate_policy(num_steps=len(self.last_episode))
        self.policy_iterations += len(self.last_episode)
    BaseAgent.end_of_episode(self)