def update(self, reward_calculator, next_actions, **kwargs):
    time_step = kwargs['time_step']
    # Bootstrap from the greedy next action; 0 at a terminal state.
    evaluated_action_value = 0
    if next_actions:
        next_action = GreedyPolicy().pick_action(next_actions)
        evaluated_action_value = next_action.evaluate()
    # The calculator stored for this time step supersedes the argument.
    reward_calculator = self.reward_calculators[time_step]
    g = (reward_calculator.get_reward()
         + reward_calculator.get_next_discount() * evaluated_action_value)
    self.learn(g)
    del self.reward_calculators[time_step]
def update(self, reward_calculator, next_actions, **kwargs):
    time_step = kwargs['time_step']
    # Bootstrap from the greedy next action; 0 at a terminal state.
    evaluated_action_value = 0
    if next_actions:
        next_action = GreedyPolicy().pick_action(next_actions)
        evaluated_action_value = next_action.evaluate()
    # The calculator stored for this time step supersedes the argument.
    reward_calculator = self.reward_calculators[time_step]
    g = (reward_calculator.get_reward()
         + reward_calculator.get_next_discount() * evaluated_action_value)
    log.debug('g: {}'.format(g))
    # Off-policy variant: weight the learning step by the
    # importance-sampling ratio.
    self._learn(g, reward_calculator.get_importance_sampling_ratio())
    del self.reward_calculators[time_step]
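# The two update() variants above build the same target g; the second also
# weights its learning step by an importance-sampling ratio. Below is a
# minimal sketch of that ratio under the usual off-policy definition
# pi(a|s) / b(a|s); the function name and arguments are illustrative
# assumptions, not part of the surrounding codebase.
def importance_sampling_ratio(target_policy_prob, behavior_policy_prob):
    """Weight for an update made from behavior-policy data: pi(a|s) / b(a|s)."""
    return target_policy_prob / behavior_policy_prob

# Example: a greedy target policy assigns probability 1 to its chosen action
# and a uniform-random behavior policy over n actions assigns 1/n, so the
# ratio is n (matching the `w = w * n` step in the Monte Carlo runner below).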
def test_should_return_correct_probabilities(self):
    action1 = Mock()
    action2 = Mock()
    action1.evaluate = MagicMock(return_value=1)
    action2.evaluate = MagicMock(return_value=2)
    actual = GreedyPolicy(0.1).action_to_probability([action1, action2])
    self.assertAlmostEqual(1, actual[action2])
    self.assertAlmostEqual(0, actual[action1])
def show_one_episode(self):
    state = self.env.reset()
    for t in itertools.count():
        self.env.render()
        # Act greedily with respect to the learned action values.
        action_state = state.get_next_action_state(
            GreedyPolicy(self.action_evaluator))
        next_state, reward, done, _ = self.env.step(
            action_state.get_gym_action())
        if done:
            self.env.render()
            break
        state = next_state
def run(self, num_episodes, discount_factor=0.8, epsilon=0.1):
    self.env = Env(self.gym_env, discount_factor, epsilon,
                   action_type=McOfflineAction)
    n = self.env.env.action_space.n
    for _ in tqdm(range(num_episodes)):
        # Behavior policy is uniform random; the target policy is greedy.
        action_states = self.generate_one_episode_action_states_by_policy(
            RandomPolicy())
        w = 1  # importance-sampling weight
        g = 0  # return
        for state, action_state, reward in reversed(action_states):
            g = discount_factor * g + reward
            action_state.update_c(w)
            action_state.update_q(g, w)
            # Stop once the greedy target policy disagrees with the behavior
            # action: the weight for all earlier steps would be 0.
            action = state.get_next_action_state(GreedyPolicy())
            if action != action_state:
                break
            # b(a|s) = 1/n under the random policy, so w *= pi/b = n.
            w = w * n
    return state
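# The loop above is off-policy Monte Carlo control with weighted importance
# sampling (Sutton & Barto, Sec. 5.7): C accumulates the weights and Q moves
# toward the return G by a step of W / C. A self-contained sketch of what
# update_c / update_q plausibly do; the attribute names `c` and `q` are
# assumptions for illustration, not the repo's actual fields.
class WeightedIsActionValue:
    def __init__(self):
        self.c = 0.0  # cumulative sum of importance weights for this (s, a)
        self.q = 0.0  # weighted-average return estimate

    def update_c(self, w):
        self.c += w

    def update_q(self, g, w):
        # Incremental weighted average: Q <- Q + (W / C) * (G - Q).
        # Assumes update_c(w) was called first, as in the loop above.
        self.q += (w / self.c) * (g - self.q)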
def update(self, reward, next_actions, **kwargs):
    # Q-learning target: r + gamma * max_a' Q(s', a').
    next_q_value = GreedyPolicy().pick_action(next_actions).evaluate()
    # Equivalent explicit form:
    # self.q = self.q + self.learning_rate * (
    #     reward + self.discount_factor * next_q_value - self.q)
    g = reward + self.discount_factor * next_q_value
    self.learn(g)
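# The commented-out line above is the classic tabular Q-learning rule that
# self.learn(g) presumably applies internally. A standalone sketch of the
# same backup on a plain dict Q-table; every name here is an illustrative
# assumption, not part of the surrounding code.
from collections import defaultdict

def q_learning_step(q_table, state, action, reward, next_state, next_actions,
                    learning_rate=0.1, discount_factor=0.9):
    """One backup: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    best_next = max((q_table[(next_state, a)] for a in next_actions),
                    default=0.0)  # 0 at a terminal state
    target = reward + discount_factor * best_next
    q_table[(state, action)] += learning_rate * (target - q_table[(state, action)])

# Usage: q_table = defaultdict(float), then call q_learning_step per transition.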
def get_result(self):
    return self.state, GreedyPolicy().pick_action(
        self.available_actions).get_gym_action()
def update_q2(self, reward, next_actions):
    # Double Q-learning: select the next action with q2, then evaluate it
    # with q1, so selection and evaluation stay decoupled.
    next_action_state = GreedyPolicy(
        lambda action: action.q2).pick_action(next_actions)
    self.q2 = self.q2 + self.learning_rate * (
        reward + self.discount_factor * next_action_state.q1 - self.q2)
    self.learning_rate = self.anneal()
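# Double Q-learning keeps two value tables to curb maximization bias: one
# table selects the argmax next action, the other evaluates it, and the
# roles swap at random. A minimal sketch of the paired updates on dict
# Q-tables keyed by (state, action); all names here are illustrative
# assumptions, not the surrounding code's API.
import random

def double_q_step(q1, q2, state, action, reward, next_state, next_actions,
                  learning_rate=0.1, discount_factor=0.9):
    if random.random() < 0.5:
        # Update q1: select with q1, evaluate with q2.
        best = max(next_actions, key=lambda a: q1[(next_state, a)])
        target = reward + discount_factor * q2[(next_state, best)]
        q1[(state, action)] += learning_rate * (target - q1[(state, action)])
    else:
        # Update q2: select with q2, evaluate with q1 (mirrors update_q2 above).
        best = max(next_actions, key=lambda a: q2[(next_state, a)])
        target = reward + discount_factor * q1[(next_state, best)]
        q2[(state, action)] += learning_rate * (target - q2[(state, action)])

# Assumes next_actions is non-empty; at a terminal state use plain `reward`
# as the target instead.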