def _learn(self, g, importance_sampling_ratio):
    log.debug('isr: {}'.format(importance_sampling_ratio))
    if importance_sampling_ratio == 0:
        # The behaviour trajectory has zero weight under the target policy;
        # there is nothing to learn from this sample.
        return
    # Accumulate the cumulative importance-sampling weight, then move the
    # action value toward the return g with a weight-dependent step size.
    self.c += importance_sampling_ratio
    self.q = self.q + self.get_learning_rate(
        importance_sampling_ratio, self.c) * (g - self.q)
    log.debug('q: {}'.format(self.q))
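# A minimal standalone sketch of the incremental update _learn appears to
# perform, assuming get_learning_rate returns isr / c (the cumulative-weight
# step size of weighted importance sampling). The name wis_update and its
# arguments are illustrative, not part of this codebase.
def wis_update(q, c, g, isr):
    """Return the updated (q, c) pair for one weighted-IS sample."""
    if isr == 0:
        return q, c           # zero-weight trajectory; skip the update
    c += isr                  # accumulate the cumulative weight C
    q += (isr / c) * (g - q)  # move Q toward the return G with step W/C
    return q, c

# e.g. starting from q=0, c=0: wis_update(0.0, 0.0, g=1.0, isr=0.9) -> (1.0, 0.9)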
def cache_reward(self, reward, time_step=9e20, **kwargs):
    # The default time_step acts as an "always cache" sentinel: any real
    # time step is smaller than 9e20.
    one_step_importance_sampling_ratio = kwargs[
        'one_step_importance_sampling_ratio']
    log.debug('isr: {:.2f} ->'.format(self.importance_sampling_ratio))
    self.importance_sampling_ratio *= one_step_importance_sampling_ratio
    log.debug('isr: {:.2f}'.format(self.importance_sampling_ratio))
    if self.initial_time_step <= time_step:
        # Discount the reward by how many rewards have been cached so far,
        # accumulating the truncated return sum(gamma^k * r_k).
        self.reward_cache += reward * (self.discount_factor **
                                       self.reward_cache_count)
        self.reward_cache_count += 1
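# An illustrative, self-contained version of the caching arithmetic above:
# each incoming reward is discounted by gamma**k, where k counts the rewards
# cached so far, so the cache builds up sum(gamma^k * r_k). All names here
# are hypothetical.
def cache(reward_cache, count, reward, gamma):
    reward_cache += reward * (gamma ** count)
    return reward_cache, count + 1

g, k = 0.0, 0
for r in [1, 0, 2]:  # three rewards observed in sequence, gamma = 0.9
    g, k = cache(g, k, r, 0.9)
assert abs(g - (1 + 0 * 0.9 + 2 * 0.9 ** 2)) < 1e-12  # g == 2.62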
def update(self, _, next_actions, **kwargs):
    # The first positional argument is unused (callers pass 0); the reward
    # calculator registered for this time step is looked up instead.
    time_step = kwargs['time_step']
    evaluated_action_value = 0
    if next_actions:
        # Bootstrap with the greedy (target-policy) action value.
        next_action = GreedyPolicy().pick_action(next_actions)
        evaluated_action_value = next_action.evaluate()
    reward_calculator = self.reward_calculators[time_step]
    g = (reward_calculator.get_reward() +
         reward_calculator.get_next_discount() * evaluated_action_value)
    log.debug('g: {}'.format(g))
    self._learn(g, reward_calculator.get_importance_sampling_ratio())
    del self.reward_calculators[time_step]
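# A hedged sketch of the target computed in update():
# G = (cached discounted rewards) + (next discount) * max_a Q(s', a),
# where the greedy bootstrap term is dropped when there is no next state.
# n_step_target and its parameters are illustrative names only.
def n_step_target(cached_rewards, next_discount, next_q_values):
    bootstrap = max(next_q_values) if next_q_values else 0.0
    return cached_rewards + next_discount * bootstrap

# e.g. cached return 2.62, next discount gamma**n = 0.729, greedy value 1.5:
# n_step_target(2.62, 0.729, [0.2, 1.5]) == 2.62 + 0.729 * 1.5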
def execute_one_episode(self):
    log.debug('start')
    while True:
        log.debug(self.current_stat)
        is_end, reward = self.play()
        log.debug('reward={}'.format(reward))
        if is_end:
            log.debug('end')
            return reward
def run(self, num_episodes, discount_factor=1, epsilon=0.1,
        learning_rate=0.5):
    self.env = Env(self.gym_env, discount_factor, epsilon,
                   action_type=NStepAction, learning_rate=learning_rate)
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    for i_episode in tqdm(range(num_episodes)):
        states = list()
        state = self.env.reset()
        env_list = list()
        T = 1e10  # effectively infinity until the episode terminates
        update_time = -1
        log.debug('----------------------\n\n')
        for t in itertools.count():
            if t < T:
                # Act with the epsilon-greedy behaviour policy, and compare
                # its action probability against the greedier target policy
                # to get the one-step importance-sampling ratio.
                policy = EGreedyPolicy(epsilon)
                action_state = state.get_next_action_state(policy)
                b = state.get_action_probability(policy, action_state)
                pi = state.get_action_probability(
                    EGreedyPolicy(0.01), action_state)
                ratio = pi / b
                log.debug('s:{}'.format(state))
                log.debug('a:{}'.format(action_state))
                log.debug('b:{0:.2f} pi:{1:.2f} ra:{2:.2f}'.format(
                    b, pi, ratio))
                action_state.add_reward_calculator(t)
                # self.env.render()
                next_state, reward, done, _ = self.env.step(
                    action_state.get_gym_action())
                log.debug('done: {} reward: {}'.format(done, reward))
                env_list.append((state, action_state))
                states.append(next_state)
                # Only action states that have not been updated yet still
                # accumulate rewards and importance-sampling ratios.
                accumulated_time = max(update_time + 1, 0)
                for s, a_s in env_list[accumulated_time:]:
                    log.debug('cache for s:{} a:{}'.format(s, a_s))
                    a_s.cache_reward(
                        reward, time_step=t,
                        one_step_importance_sampling_ratio=ratio)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                if done:
                    T = t + 1
                else:
                    state = next_state
            # The step whose estimate can be updated now: tau = t - n + 1.
            update_time = t - self.n + 1
            if update_time >= 0:
                action_state_update_time = env_list[update_time][1]
                evaluated_state_index = update_time + self.n - 1
                if evaluated_state_index < len(states):
                    # A full n-step window exists; bootstrap from S_{tau+n}.
                    log.debug('=n')
                    state_update_time = states[evaluated_state_index]
                    action_state_update_time.update(
                        0, state_update_time.get_actions(),
                        time_step=update_time)
                else:
                    # The episode ended inside the window; no bootstrapping.
                    log.debug('<n')
                    action_state_update_time.update(
                        0, None, time_step=update_time)
            if update_time == T - 1:
                # Final update of the episode: drop all cached calculators.
                a_ss = [a_s for _, a_s in env_list]
                for a_s in a_ss:
                    a_s.clear_reward_calculator()
                break
    return stats
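# A tiny standalone sketch of the update-time bookkeeping in run(): at step t
# the estimate updated is tau = t - n + 1, and the episode's final update
# fires when tau == T - 1. Purely illustrative; it does not depend on the
# classes above.
import itertools

def update_schedule(n, T):
    """Yield (t, tau) pairs showing which step gets updated at each time t."""
    for t in itertools.count():
        tau = t - n + 1
        if tau >= 0:
            yield t, tau
        if tau == T - 1:
            return

# For n=3 and an episode terminating at T=5:
# list(update_schedule(3, 5)) == [(2, 0), (3, 1), (4, 2), (5, 3), (6, 4)]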
def choose_next_policy(self):
    self.next_policy = self._choose_next_action()
    log.debug('choose:{}'.format(self.next_policy))
def get_reward(self) -> int:
    dealer_sum = self.get_dealer_score()
    log.debug('dealer:{} --- player:{}'.format(dealer_sum, self.player_sum))
    return self._get_reward(self.player_sum, dealer_sum)
def _get_next_action(self):
    action = self._choose_next_action()
    log.debug('choose:{}'.format(action))
    return action
def choose_random_policy(self):
    # Pick uniformly among the available actions.
    self.next_policy = random.choice(self.available_actions)
    log.debug('choose random:{}'.format(self.next_policy))
def update_reward(self, reward):
    log.debug('start update')
    for state in self.state_gone_through:
        state.update_reward(reward)
        log.debug(state)
    log.debug('end update')