Example #1
0
    def _find_farther_state(self, gamma):
        """Pick the candidate final state that is hardest to reach from state 0.

        Every non-initial state is tried in turn as the temporary goal; an
        optimal policy is computed for each and evaluated exactly. The state
        whose optimal start-state value is smallest — while still above
        gamma**50, i.e. still reachable in a reasonable horizon (TODO confirm
        the intended cutoff) — is selected.

        Returns:
            (best_state, best_perf, best_q, best_rand_perf): chosen goal
            state index (-1 if none qualified), its optimal start-state
            value, the optimal Q-function copy, and the uniform-random
            policy's start-state value for that goal.
        """
        best_state = -1
        best_perf = 1
        best_rand_perf = 0
        best_q = 0
        # All-True mask (after inversion) so spibb behaves as unconstrained RL.
        allowed_mask, _thres = spibb.compute_mask(self.nb_states,
                                                  self.nb_actions, 1, 1, [])
        allowed_mask = ~allowed_mask
        uniform_pi = np.ones((self.nb_states, self.nb_actions)) / self.nb_actions

        for candidate in range(1, self.nb_states):
            p, r = self._set_temporary_final_state(candidate)
            reward_model = spibb_utils.get_reward_model(p, r)

            # Solve the MDP with this candidate as the absorbing goal.
            solver = spibb.spibb(gamma, self.nb_states, self.nb_actions,
                                 allowed_mask, allowed_mask, p, reward_model,
                                 'default', 0, None, None, None)
            solver.fit()
            v_opt, q_opt = spibb.policy_evaluation_exact(
                solver.pi, reward_model, p, gamma)
            v_unif, _ = spibb.policy_evaluation_exact(
                uniform_pi, reward_model, p, gamma)

            perf_opt = v_opt[0]
            # Keep the lowest optimal value that still exceeds gamma**50.
            if gamma ** 50 < perf_opt < best_perf:
                best_perf = perf_opt
                best_state = candidate
                best_rand_perf = v_unif[0]
                best_q = q_opt.copy()

        # V(0) = gamma**t for a t-step goal, so t = log(V) / log(gamma).
        avg_time_to_goal = np.log(best_perf) / np.log(gamma)
        avg_time_to_goal_rand = np.log(best_rand_perf) / np.log(gamma)
        print(f"Optimal performance : {best_perf}")
        print(f"Optimal average time to goal: {avg_time_to_goal}")
        print(f"Random policy performance : {best_rand_perf}")
        print(f"Random policy average time to goal: {avg_time_to_goal_rand}")

        return best_state, best_perf, best_q, best_rand_perf
# Pre-compute the true reward function in function of SxA:
# the maze's true dynamics are injected into a Garnets instance so its
# reward can be reshaped into a state-action reward model.
current_proba = maze.transition_function
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)


# Exact evaluation of the baseline policy; [0][0] picks V(start state 0).
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print(f"baseline_perf: {pi_b_perf}")


# Mask that is always True (after inversion), so classical RL and other
# non-policy-based SPIBB algorithms are unconstrained.
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0

# Unconstrained optimal policy and its exact start-state performance.
pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0, current_proba, r_reshaped, 'default')
pi_star.fit()
pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print(f"pi_star_perf: {pi_star_perf}")

# Place to save the results for this experiment index.
filename = f'results/{expname}/results_{index}'

results = []
# Create results/<expname>/ (and results/ itself) if missing.
os.makedirs('results/' + expname, exist_ok=True)
Example #3
0
# Hyper-parameter grids swept by the experiment.
N_wedges = [5, 7, 10, 15, 20]
delta = 1
epsilons = [0.1, 0.2, 0.5, 1, 2, 5]
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000]

ratios = [0.1, 0.9]

# One deterministic stream per experiment index.
seed = index
np.random.seed(seed)

# MDP dimensions shared by every generated garnet.
gamma = 0.95
nb_states = 50
nb_actions = 4
nb_next_state_transition = 4

# All-True mask (after inversion) for the unconstrained baselines.
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0
# Uniform random policy over actions.
rand_pi = np.full((nb_states, nb_actions), 1 / nb_actions)

# Place to save the results for this experiment index.
filename = f'results/{expname}/results_{index}'

results = []
# Create results/<expname>/ (and results/ itself) if missing.
os.makedirs('results/' + expname, exist_ok=True)

while True:
	for ratio in ratios:
		garnet = garnets.Garnets(nb_states, nb_actions, nb_next_state_transition, self_transitions=0)