def _run_one_iteration(self):
        """
        Runs one iteration on the Random MDPs benchmark, so iterates through different baseline and data set parameters
        and then starts the computation for each algorithm.
        """
        path_config = configparser.ConfigParser()
        path_config.read(os.path.join(directory, 'paths.ini'))
        spibb_path = path_config['PATHS']['spibb_path']
        sys.path.append(spibb_path)
        import garnets

        for baseline_target_perf_ratio in self.baseline_target_perf_ratios:
            print(f'Process with seed {self.seed} starting with baseline_target_perf_ratio {baseline_target_perf_ratio}'
                  f' out of {self.baseline_target_perf_ratios}')
            self.garnet = garnets.Garnets(self.nb_states, self.nb_actions, self.nb_next_state_transition,
                                          env_type=self.env_type, self_transitions=self.self_transitions)

            softmax_target_perf_ratio = (baseline_target_perf_ratio + 1) / 2
            self.to_append_run_one_iteration = self.to_append_run + [softmax_target_perf_ratio,
                                                                     baseline_target_perf_ratio]
            self.pi_b, self._q_pi_b, self.pi_star_perf, self.pi_b_perf, self.pi_rand_perf = \
                self.garnet.generate_baseline_policy(self.gamma,
                                                     softmax_target_perf_ratio=softmax_target_perf_ratio,
                                                     baseline_target_perf_ratio=baseline_target_perf_ratio,
                                                     log=self.log)

            self.R_state_state = self.garnet.compute_reward()
            self.P = self.garnet.transition_function
            if self.env_type == 2:  # easter egg environment with reward +1
                self._set_easter_egg(reward=1)
            elif self.env_type == 3:  # easter egg environment with reward -1
                self._set_easter_egg(reward=-1)
            else:
                self.easter_egg = None
                self.R_state_action = compute_r_state_action(self.P, self.R_state_state)
            self.to_append_run_one_iteration += [self.pi_b_perf, self.pi_rand_perf, self.pi_star_perf]

            for nb_trajectories in self.nb_trajectories_list:
                print(
                    f'Process with seed {self.seed} starting with nb_trajectories {nb_trajectories} out of '
                    f'{self.nb_trajectories_list}')
                # Generate the batch, stored both as full trajectories and as (s, a, s', r) transition samples
                self.data, batch_traj = self.generate_batch(nb_trajectories, self.garnet, self.pi_b,
                                                            easter_egg=self.easter_egg)

                self.to_append = self.to_append_run_one_iteration + [nb_trajectories]
                self._run_algorithms()
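# The helper compute_r_state_action used above is not shown in this excerpt.
# A minimal sketch of the expected-reward computation it presumably performs,
# assuming P has shape (nb_states, nb_actions, nb_states) and R_state_state has
# shape (nb_states, nb_states): R(s, a) = sum_{s'} P(s' | s, a) * R(s, s').
import numpy as np

def compute_r_state_action_sketch(P, R_state_state):
    # Expected immediate reward per (state, action) pair.
    return np.einsum('sat,st->sa', P, R_state_state)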
Q_baseline = np.load(npy_filename)

# Compute the baseline policy:
pi_b = spibb_utils.compute_baseline(Q_baseline)

# Uniform random behavioural policy:
pi_behavioural = np.ones(pi_b.shape) / nb_actions


# The batch sizes:
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
# The N_wedge hyperparameter values for SPIBB:
N_wedges = [5, 7, 10, 15, 20, 30, 50, 70, 100]
v = np.zeros(nb_states)

# Pre-compute the true reward function as a function of (s, a):
current_proba = maze.transition_function
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)


# Compute the baseline policy performance:
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print("baseline_perf: " + str(pi_b_perf))


# Create a mask that is always True, as used by classical RL and other non-policy-based SPIBB algorithms
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0

pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0, current_proba, r_reshaped, 'default')
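# With the all-True mask, spibb.spibb behaves as an unconstrained solver, so the
# fitted policy plays the role of pi_star. A sketch of fitting and evaluating it
# (assumes the solver exposes a fit() method and a pi attribute, as in the SPIBB
# reference code; not confirmed by this excerpt):
pi_star.fit()
pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))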
# Example #3
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0
rand_pi = np.ones((nb_states, nb_actions)) / nb_actions

filename = 'results/' + expname + '/results_' + str(index)

results = []
if not os.path.isdir('results'):
    os.mkdir('results')
if not os.path.isdir('results/' + expname):
    os.mkdir('results/' + expname)

while True:
    for ratio in ratios:
        garnet = garnets.Garnets(nb_states,
                                 nb_actions,
                                 nb_next_state_transition,
                                 self_transitions=0)

        softmax_target_perf_ratio = (ratio + 1) / 2
        baseline_target_perf_ratio = ratio
        pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = \
              garnet.generate_baseline_policy(gamma,
                      softmax_target_perf_ratio=softmax_target_perf_ratio,
                      baseline_target_perf_ratio=baseline_target_perf_ratio)

        reward_current = garnet.compute_reward()
        current_proba = garnet.transition_function
        r_reshaped = spibb_utils.get_reward_model(current_proba,
                                                  reward_current)

        for nb_trajectories in nb_trajectories_list:
# Example #4
        for action in range(nb_actions):
            if count_state_action[state, action] == 0:
                # State-action pairs never seen in the batch get the default `unvisited` error.
                errors[state, action] = unvisited
            else:
                # Concentration bound used by SPIBB:
                # e(s, a) = sqrt(2 * log(2 * |S| * |A| * 2^|A| / delta) / N(s, a))
                errors[state, action] = np.sqrt(
                    2 * (np.log(2 * (nb_states * nb_actions * 2**nb_actions) /
                                delta)) / count_state_action[state, action])
    return errors
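# For context, a sketch (not part of the original snippet) of how the
# count_state_action matrix consumed above could be built from a batch of
# transition samples; the helper name and the (state, action, next_state, reward)
# tuple layout are illustrative assumptions.
import numpy as np

def count_state_action_from_batch(batch, nb_states, nb_actions):
    # Tally how often each (state, action) pair occurs in the batch.
    counts = np.zeros((nb_states, nb_actions))
    for state, action, _next_state, _reward in batch:
        counts[state, action] += 1
    return counts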


results = []

for ratio in ratios:
    garnet = garnets.Garnets(nb_states,
                             nb_actions,
                             nb_next_state_transition,
                             env_type=env_type,
                             self_transitions=self_transitions)

    softmax_target_perf_ratio = (ratio + 1) / 2
    baseline_target_perf_ratio = ratio
    pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = \
        garnet.generate_baseline_policy(gamma,
                                        softmax_target_perf_ratio=softmax_target_perf_ratio,
                                        baseline_target_perf_ratio=baseline_target_perf_ratio, log=False)

    reward_current = garnet.compute_reward()
    current_proba = garnet.transition_function
    r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)
    results_traj = []