def _compute_grad_exact():
            """Exact-gradient computation for the 1-D LQG problem.

            Returns the mixed theta-sigma gradient and the gradient of the expected
            improvement with respect to w (sigma is parametrized as exp(w)).
            Relies on `self` and `pol` from the enclosing scope.
            """
            # Lazily build the LQG environment used for the exact computations
            if 'env' not in self.__dict__:
                import lqg1d
                self.env = lqg1d.LQG1D()

            sigma = pol.sigma

            M = self.env.max_pos                    # bound on the state magnitude
            ENV_GAMMA = self.env.gamma
            ENV_VOLUME = 2 * self.env.max_action    # volume of the action space
            ENV_R = np.asarray(self.env.R).item()   # np.asscalar was removed from NumPy
            ENV_Q = np.asarray(self.env.Q).item()
            ENV_B = np.asarray(self.env.B).item()
            ENV_MAX_ACTION = self.env.max_action

            # Upper bound on the magnitude of the quadratic reward
            MAX_REWARD = ENV_Q * M**2 + ENV_R * ENV_MAX_ACTION**2

            # Constants of the closed-form step-size bound alpha*(sigma)
            C1 = (1 - ENV_GAMMA)**3 * math.sqrt(2 * math.pi)
            C2 = ENV_GAMMA * math.sqrt(2 * math.pi) * MAX_REWARD * M**2
            C3 = 2 * (1 - ENV_GAMMA) * ENV_VOLUME * MAX_REWARD * M**2

            m = 1  # dimensionality term in the bound (scalar problem)

            # c = utils.computeLoss(MAX_REWARD, M, ENV_GAMMA, ENV_VOLUME, sigma)
            # Penalty coefficient of the quadratic performance-improvement bound
            c = pol.penaltyCoeff(MAX_REWARD, M, ENV_GAMMA, ENV_VOLUME)
            # d = utils.computeLossSigma(MAX_REWARD, M, ENV_GAMMA, ENV_VOLUME, sigma)

            # Step size that maximizes the quadratic lower bound on the improvement
            alphaStar = 1 / (2 * c)

            # Exact LQG gradients: gradient w.r.t. theta and the mixed theta-sigma gradient
            gradK = self.env.grad_K(np.asarray(pol.theta_mat).item(), sigma)
            gradMixed = self.env.grad_mixed(np.asarray(pol.theta_mat).item(), sigma)

            # d(alpha*)/d(sigma) for alpha*(sigma) = C1 * sigma^3 / (m * (C2 * sigma + C3))
            # (see the note after this function)
            grad_sigma_alpha_star = sigma**2 * (
                2 * C1 * C2 * sigma + 3 * C1 * C3) / (m * (C2 * sigma + C3)**2)
            # d(||grad_theta J||^2)/d(sigma) = 2 * gradK * gradMixed (chain rule)
            grad_sigma_norm_grad_theta = 2 * gradK * gradMixed

            # Gradient of the expected improvement w.r.t. sigma: a local term from the
            # step size and a far-sighted term from the gradient norm
            grad_local_step = 0.5 * gradK**2 * grad_sigma_alpha_star
            grad_far_sighted = 0.5 * alphaStar * grad_sigma_norm_grad_theta

            gradDelta = grad_local_step + grad_far_sighted
            # Chain rule through the parametrization sigma = exp(w)
            gradDeltaW = gradDelta * math.exp(pol.w)

            return gradMixed, gradDeltaW
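            # Added note (hedged): grad_sigma_alpha_star above equals d/dsigma of
            # C1 * sigma**3 / (m * (C2 * sigma + C3)), which appears to be the closed-form
            # step size alpha*(sigma) used here:
            #     d(alpha*)/d(sigma) = sigma**2 * (2*C1*C2*sigma + 3*C1*C3)
            #                          / (m * (C2*sigma + C3)**2)
            # A quick symbolic sanity check (assumes sympy is installed; not part of the
            # original routine):
            #     import sympy as sp
            #     sig, c1, c2, c3, mm = sp.symbols('sigma C1 C2 C3 m', positive=True)
            #     alpha_star = c1 * sig**3 / (mm * (c2 * sig + c3))
            #     rhs = sig**2 * (2*c1*c2*sig + 3*c1*c3) / (mm * (c2*sig + c3)**2)
            #     assert sp.simplify(sp.diff(alpha_star, sig) - rhs) == 0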
    T = states.shape[0]
    exp_bon = np.zeros(T)
    for t in range(T):
        s = states[t]
        a = actions[t]
        # Discretize the state and the action onto the grid and update the visit count
        x = bisect.bisect(S_space, s)
        y = bisect.bisect(S_space, a)
        N[x, y] += 1
        # Count-based exploration bonus: beta / sqrt(N(s, a))
        exp_bon[t] = beta * np.sqrt(1 / N[x, y])
    return exp_bon
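    # Added usage note (hedged): the loop above implements a count-based exploration
    # bonus, beta / sqrt(N(s, a)), over a discretized state-action grid. Such a bonus
    # is typically added to the collected rewards before the policy update, e.g.:
    #     bonus = compute_bonus(states, actions)   # hypothetical wrapper for the code above
    #     rewards = rewards + bonus                # optimism in the face of uncertainty
    # The names `compute_bonus` and `rewards` here are illustrative only.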


#####################################################
# Define the environment and the policy
#####################################################
env = lqg1d.LQG1D(initial_state_type='random')

policy = policy_gaussian
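
# Hedged illustration: `policy_gaussian` is defined elsewhere in this codebase and is
# not shown here. For a 1-D LQG task, a linear-Gaussian policy typically has the shape
# of the minimal sketch below; the class and method names are hypothetical, not the
# actual implementation used above.
import numpy as np


class LinearGaussianPolicySketch:
    """Gaussian policy a ~ N(theta * s, sigma^2) with a linear mean (illustrative only)."""

    def __init__(self, theta=0.0, sigma=0.5):
        self.theta = theta  # linear gain of the mean action
        self.sigma = sigma  # exploration standard deviation

    def draw_action(self, state):
        # Sample an action around the linear mean theta * state
        return np.random.normal(self.theta * state, self.sigma)

    def grad_log(self, state, action):
        # Score function: d log pi(a | s) / d theta for the Gaussian policy
        return (action - self.theta * state) * state / self.sigma ** 2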

#####################################################
# Experiments parameters
#####################################################
# We will collect N trajectories per iteration
N = 60
# Each trajectory will have at most T time steps
T = 100
# Number of policy parameters updates
n_itr = 100
# Number of epochs
epochs = 5
# Set the discount factor for the problem