Example #1
    def __init__(self,
                 observation_space,
                 action_space,
                 name="MBIE Agent",
                 params={},
                 starting_policy=None):
        BaseAgent.__init__(self,
                           observation_space,
                           action_space,
                           name,
                           params=dict(MBIE_DEFAULTS, **params))

        # Policy Setup
        if starting_policy:
            self.predict_policy = starting_policy
        else:
            self.predict_policy = DiscreteTabularPolicy(self.observation_space,
                                                        self.action_space,
                                                        default_value=1 /
                                                        (1 - self.gamma))
        self.backup_lim = int(
            np.log(1 / (self.params['epsilon_one'] * (1 - self.gamma))) /
            (1 - self.gamma))
        self.policy_iterations = 0

        # Model Setup
        self.model = DiscreteTabularModel(
            observation_space,
            action_space,
            default_reward=self.params['max_reward'],
            limit=self.params['known_threshold'])

        self.learn_policy = self.predict_policy
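
The backup_lim expression above caps the number of value-iteration backups needed for an epsilon_one-accurate value function under discount gamma. A minimal worked example of the same formula, with illustrative hyper-parameter values (the numbers below are assumptions, not values taken from MBIE_DEFAULTS):

import numpy as np

# Illustrative values only; MBIE_DEFAULTS is not shown in these snippets.
epsilon_one = 0.01
gamma = 0.95

# Same expression as in the constructor above.
backup_lim = int(np.log(1 / (epsilon_one * (1 - gamma))) / (1 - gamma))
print(backup_lim)  # 152 backups for these values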
Example #2
    def end_of_episode(self):
        iterate_policy(self.policy,
                       self.model,
                       states=self.model.get_known_states(),
                       num_steps=self.episodic_backup_steps,
                       gamma=self.gamma)
        BaseAgent.end_of_episode(self)
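
Here iterate_policy runs a bounded number of value-iteration backups restricted to the states the model has marked as known. A minimal sketch of such a backup is below; the model accessors get_reward and get_transition_probs are assumptions for illustration, not the repository's actual API:

def iterate_policy_sketch(values, model, states, actions, num_steps, gamma):
    # Repeatedly apply the Bellman optimality backup over the known states.
    for _ in range(num_steps):
        for s in states:
            values[s] = max(
                model.get_reward(s, a) + gamma * sum(
                    p * values[s2]
                    for s2, p in model.get_transition_probs(s, a).items())
                for a in actions)
    return values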
Example #3
    def __init__(self,
                 observation_space,
                 action_space,
                 name="RMax Agent",
                 params=None,
                 starting_policy=None):
        BaseAgent.__init__(self, observation_space, action_space, name)

        # Hyper-parameters
        self.params = dict(RMAX_DEFAULTS)
        if params:
            for key, value in params.items():
                self.params[key] = value
        self.max_reward = self.params['max_reward']
        self.epsilon_one = self.params['epsilon_one']
        self.known_threshold = self.params['known_threshold']
        self.gamma = self.params['gamma']
        #self.max_reward = 1 / (1 - self.gamma)

        # Policy Setup
        self.starting_policy = starting_policy
        self.backup_lim = int(
            np.log(1 / (self.epsilon_one * (1 - self.gamma))) /
            (1 - self.gamma))
        self.stepwise_backup_steps = 1  # self.backup_lim
        self.episodic_backup_steps = min(self.backup_lim, 5)

        # Model Setup
        self.model = KnownTabularModel(action_space.n, self.max_reward,
                                       self.known_threshold)

        self.reset()
Example #4
    def __init__(self, observation_space, action_space, name="Q-Learning Agent", parameters={}, starting_policy=None):
        BaseAgent.__init__(self, observation_space, action_space, name, params=dict(QLEARNING_CONSTS, **parameters))

        # Policy Setup
        if starting_policy:
            self.predict_policy = starting_policy
        else:
            self.predict_policy = DiscreteTabularPolicy(self.observation_space, self.action_space, default_value=1/(1-self.gamma))
        self.learn_policy = EpsilonGreedy(
                action_space=self.action_space,
                policy=self.predict_policy,
                epsilon=self.epsilon
            )
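
The learning policy wraps the greedy tabular policy in epsilon-greedy exploration. A minimal sketch of the behaviour the EpsilonGreedy wrapper presumably provides (the method names here are assumptions, not the repository's confirmed API):

import random

class EpsilonGreedySketch:
    def __init__(self, action_space, policy, epsilon):
        self.action_space = action_space
        self.policy = policy
        self.epsilon = epsilon

    def get_action(self, state):
        # With probability epsilon pick a random action, otherwise act greedily.
        if random.random() < self.epsilon:
            return self.action_space.sample()
        return self.policy.get_max_action(state)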
Example #5
    def learn(self, state, reward, done=False):
        """

        :param state:
        :param reward:
        :param done:
        :return:
        """
        action = self.policy.get_max_action(state)  # Exploit learned values

        self.update(self.prev_state, self.prev_action, reward, state)

        self.prev_action = action
        BaseAgent.learn(self, state, reward, done)
        return action
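
The update call above is not shown in these snippets; for a Q-learning agent it would typically apply the standard tabular Bellman update. A minimal sketch under that assumption (the policy accessors and self.alpha are illustrative, not confirmed by the code above):

    def update(self, state, action, reward, next_state):
        # Standard tabular Q-learning target: r + gamma * max_a' Q(s', a').
        old_q = self.policy.get_q_value(state, action)
        target = reward + self.gamma * self.policy.get_max_q_value(next_state)
        self.policy.set_q_value(state, action,
                                old_q + self.alpha * (target - old_q))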
Example #6
    def __init__(self,
                 observation_space,
                 action_space,
                 name="UCBVI Agent",
                 params=None,
                 starting_policy=None):
        BaseAgent.__init__(self, observation_space, action_space, name)

        # Hyper-parameters
        self.params = dict(UCBVI_DEFAULTS)
        if params:
            for key, value in params.items():
                self.params[key] = value
        self.max_reward = self.params['max_reward']
        self.epsilon_one = self.params['epsilon_one']
        self.known_threshold = self.params['known_threshold']
        self.gamma = self.params['gamma']
        #self.max_reward = 1 / (1 - self.gamma)
        self.delta = self.params['delta']

        # Policy Setup
        self.starting_policy = starting_policy
        self.backup_lim = int(
            np.log(1 / (self.epsilon_one * (1 - self.gamma))) /
            (1 - self.gamma))
        self.stepwise_backup_steps = 0
        self.episodic_backup_steps = self.backup_lim
        self.policy_iterations = 0

        # Model Setup
        self.model = DiscreteTabularModel(observation_space,
                                          action_space,
                                          default_reward=self.max_reward,
                                          limit=self.known_threshold)

        # Experience Tracking
        self.last_episode = []
        # self.last_episode_model = KnownTabularModel(action_space.n, self.max_reward, 1)

        self.reset()
Example #7
    def end_of_episode(self):
        self.policy.reset_values()
        if self.episode_learn_steps:
            self.vectorized_iterate_policy(num_steps=len(self.last_episode))
            self.policy_iterations += len(self.last_episode)
        BaseAgent.end_of_episode(self)