예제 #1
0
class Strategy:
    def __init__(self, γ, α, λ, ε, ε_decay, actions):
        self.γ = γ
        self.α = α
        self.λ = λ
        self.ε = ε
        self.ε_decay = ε_decay
        self.actions = actions
        self.eligibility_traces = None
        self.q_values = QValues(actions)
        self.scores = []  # TODO
        self.episode = 0
        self.episode_reward = 0
        self.episode_reward_total = 0  # TODO

    def new_episode(self):
        self.eligibility_traces = EligibilityTraces(1 - self.γ * self.λ)
        self.ε *= self.ε_decay
        self.episode += 1
        self.episode_reward = 0

    def next_action(self, state, ε=None):
        return self.q_values.get_greedy_action(state,
                                               self.ε if ε is None else ε)

    def update(self, state_before, action, reward, state_after):
        expected_reward = self.q_values.get_expected_reward(
            state_before, action)
        next_action = self.q_values.get_greedy_action(state_after, self.ε)
        next_expected_reward = self.q_values.get_expected_reward(
            state_after, next_action)

        td_error = reward - expected_reward + self.γ * next_expected_reward

        self.eligibility_traces.increment(state_before, action)
        self.q_values.ensure_exists(state_before, action)

        def update_q_values(state, action):
            old_expected_reward = self.q_values.get_expected_reward(
                state, action)
            new_expected_reward = old_expected_reward + self.α * td_error * self.eligibility_traces.get(
                state, action)
            self.q_values.set_expected_reward(state, action,
                                              new_expected_reward)
            self.eligibility_traces.decay(state, action)

        self.q_values.for_each(update_q_values)
        self.episode_reward += reward

    def load(self, values):
        self.q_values.set_all_values(values['q'])
        self.ε = values['ε']
        self.scores = values['scores']
        self.episode = values['episode']

    def dump(self):
        return {
            'q': self.q_values.get_all_values(),
            'ε': self.ε,
            'scores': self.scores,
            'episode': self.episode
        }