pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))

# Place to save the results
filename = 'results/' + expname + '/results_' + str(index)

results = []
if not os.path.isdir('results'):
	os.mkdir('results')
if not os.path.isdir('results/' + expname):
	os.mkdir('results/' + expname)

while True:
	for nb_trajectories in nb_trajectories_list:
		# Generate trajectories, both stored as trajectories and (s,a,s',r) transition samples
		trajectories, batch_traj = spibb_utils.generate_batch(nb_trajectories, garnet, pi_behavioural)
		spibb_utils.prt("GENERATED A DATASET OF " + str(nb_trajectories) + " TRAJECTORIES")
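		# Note (an assumption about the layout, matching the comment above): batch_traj is expected
		# to be the flat list of all per-step (s, a, s', r) samples pooled across trajectories, i.e.
		# roughly equivalent to: batch_traj = [step for trajectory in trajectories for step in trajectory]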

		# Compute the maximum-likelihood model of the transitions and rewards.
		# NB: for ease of implementation, the true reward function is used here; this is valid
		# because the reward is deterministic in our environment. When the reward is stochastic,
		# it must be estimated from the batch samples instead.
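		# Minimal sketch (not part of spibb_utils; shown only for illustration): with a stochastic
		# reward, an empirical reward model could be built by averaging the observed rewards per
		# (s, a) pair. This assumes numpy is imported as np and that each batch entry is laid out as
		# [action, state, next_state, reward], the same layout ModelTransitions consumes.
		# reward_sums = np.zeros((nb_states, nb_actions))
		# reward_counts = np.zeros((nb_states, nb_actions))
		# for action, state, next_state, reward in batch_traj:
		# 	reward_sums[state, action] += reward
		# 	reward_counts[state, action] += 1
		# empirical_reward_model = reward_sums / np.maximum(reward_counts, 1)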
		model = modelTransitions.ModelTransitions(batch_traj, nb_states, nb_actions)
		reward_model = spibb_utils.get_reward_model(model.transitions, reward_current)

		# Computes the RL policy
		rl = spibb.spibb(gamma, nb_states, nb_actions, pi_b, mask_0, model.transitions, reward_model, 'default')
		rl.fit()
		# Evaluates the RL policy performance
		perfrl = spibb.policy_evaluation_exact(rl.pi, r_reshaped, current_proba, gamma)[0][0]
		print("perf RL: " + str(perfrl))
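		# Sketch of what exact policy evaluation presumably computes (an assumption about
		# spibb.policy_evaluation_exact, shown here only for reference): it solves the Bellman
		# equation V^pi = (I - gamma * P^pi)^{-1} r^pi, and the trailing [0][0] reads off the value
		# of the initial state. Assuming numpy as np, rl.pi of shape (nb_states, nb_actions),
		# current_proba of shape (nb_states, nb_actions, nb_states) and r_reshaped of shape
		# (nb_states, nb_actions), that would be:
		# p_pi = np.einsum('sa,sat->st', rl.pi, current_proba)
		# r_pi = np.einsum('sa,sa->s', rl.pi, r_reshaped)
		# v_pi = np.linalg.solve(np.eye(nb_states) - gamma * p_pi, r_pi)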
            # Evaluate the optimal policy's performance on the easter egg environment
            pi_star_perf = spibb.policy_evaluation_exact(
                true_rl.pi, r_reshaped, current_proba, gamma)[0][0]
            print("Optimal perf in easter egg environment:\t\t\t" +
                  str(pi_star_perf))
            pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped,
                                                      current_proba,
                                                      gamma)[0][0]
            print("Baseline perf in easter egg environment:\t\t\t" +
                  str(pi_b_perf))
        else:
            easter_egg = None
            r_reshaped = spibb_utils.get_reward_model(current_proba,
                                                      reward_current)

        for nb_trajectories in nb_trajectories_list:
            # Generate trajectories, both stored as trajectories and (s,a,s',r) transition samples
            trajectories, batch_traj = spibb_utils.generate_batch(
                nb_trajectories, garnet, pi_b, easter_egg)
            print("GENERATED A DATASET OF " + str(nb_trajectories) +
                  " TRAJECTORIES")

            # Compute the maximum-likelihood model of the transitions and rewards.
            # NB: for ease of implementation, the true reward function is used here; this is valid
            # because the reward is deterministic in our environment. When the reward is stochastic,
            # it must be estimated from the batch samples instead.
            model = modelTransitions.ModelTransitions(batch_traj, nb_states,
                                                      nb_actions)
            reward_model = spibb_utils.get_reward_model(
                model.transitions, reward_current)

            # Estimates the values of the baseline policy with a Monte Carlo estimation from the batch data:
            # q_pib_est = spibb_utils.compute_q_pib_est(gamma, nb_states, nb_actions, trajectories)
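            # Sketch of an every-visit Monte Carlo estimate (an assumption mirroring what
            # compute_q_pib_est is expected to do): accumulate the discounted return-to-go at each
            # step of every trajectory and average it per (state, action) pair. This assumes numpy
            # is imported as np and that each step is stored as [action, state, next_state, reward].
            # q_sums = np.zeros((nb_states, nb_actions))
            # q_counts = np.zeros((nb_states, nb_actions))
            # for trajectory in trajectories:
            #     discounted_return = 0.
            #     for action, state, next_state, reward in reversed(trajectory):
            #         discounted_return = reward + gamma * discounted_return
            #         q_sums[state, action] += discounted_return
            #         q_counts[state, action] += 1
            # q_pib_est = q_sums / np.maximum(q_counts, 1)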

            # Computes the RL policy