Example #1
    #     plt.tick_params(labelleft='off')

    # general title
    plt.suptitle("Delta-Iterations comparation",
                 fontsize=13,
                 fontweight=0,
                 color="black",
                 style="italic")
    plt.tight_layout()
    plt.show()
    # Axis title
    # plt.text(0.5, 0.02, 'Time', ha='center', va='center')
    # plt.text(0.06, 0.5, 'Note', ha='center', va='center', rotation='vertical')


env = GridWorldEnv()

print("Policy evaluation using policy evaluation")
state_values, _ = dp.policy_evaluation(env=env, discount_factor=0.9)
print("Value States-> \n", state_values)
# print_state_latex(state_values)

policy, state_values, deltas_value, t = dp.value_iteration(env,
                                                           discount_factor=0.9)

print("Optimal policy found using Value Iteration algorithm  [0.9] found in ")
print("Elapsed time: %.5f [sec]" % (t[0]))
print("CPU elapsed time: %.5f [sec]" % (t[1]))
env.render_policy(policy=policy)
print(state_values)
# print_state_latex(state_values)
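
# A minimal sketch (not part of the original example) of how the per-iteration
# deltas returned by value_iteration could feed the comparison plot set up at the
# top of this snippet, assuming deltas_value is a flat sequence of per-sweep
# maximum value changes:
import matplotlib.pyplot as plt

plt.plot(deltas_value)
plt.xlabel("Iteration")
plt.ylabel("Max delta")
plt.title("Delta-Iterations comparison")
plt.show()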
Example #2
num_iterations = 10000  # @param

initial_collect_steps = 1000  # @param
collect_steps_per_iteration = 1  # @param
replay_buffer_capacity = 100000  # @param

fc_layer_params = (100, )

batch_size = 128  # @param
learning_rate = 1e-5  # @param
log_interval = 200  # @param

num_eval_episodes = 2  # @param
eval_interval = 1000  # @param

train_py_env = wrappers.TimeLimit(GridWorldEnv(), duration=100)
eval_py_env = wrappers.TimeLimit(GridWorldEnv(), duration=100)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

q_net = q_network.QNetwork(train_env.observation_spec(),
                           train_env.action_spec(),
                           fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

# The listing cuts this call off after time_step_spec(); the remaining arguments
# below follow the standard TF-Agents DQN setup and are an assumed completion,
# not necessarily the original file's exact configuration.
tf_agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    train_step_counter=train_step_counter)
tf_agent.initialize()
Example #3
# https://blog.csdn.net/c602273091/article/details/79008755
import gym
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(
    n_width=12,  # number of cells horizontally
    n_height=4,  # number of cells vertically
    u_size=60,  # cell size in pixels; adjust to taste
    default_reward=-1,  # default immediate reward of a cell
    default_type=0)  # by default every cell can be entered
env.action_space = spaces.Discrete(4)  # set the number of actions
# By default the grid-world environment uses 0 for left, 1 for right, 2 for up,
# 3 for down; 4, 5, 6, 7 are diagonal moves. See the definition inside _step.
# The observation space does not need to be set explicitly; it is computed
# automatically from the number of cells passed in.
env.start = (0, 0)
env.ends = [(11, 0)]
for i in range(10):
    env.rewards.append((i + 1, 0, -100))
    env.ends.append((i + 1, 0))
env.types = [(5, 1, 1), (5, 2, 1)]
env.refresh_setting()
env._reset()
env._render()
input("press any key to continue...")
Example #4
        return action

    def learn(self, state, action, reward, state_, done):
        # Tabular Q-learning update: bootstrap from the greedy action in the next
        # state only while the episode is not done.
        q_target = reward + self.gamma * np.max(self.q_table[state_, :]) * (1 - done)
        q_delta = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * q_delta

        # Decay epsilon until it reaches its floor.
        self.epsilon = self.epsilon * self.epsilon_dec if self.epsilon > self.epsilon_min else self.epsilon_min
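        # Worked example with hypothetical numbers (not from the original code):
        # with alpha=0.6, gamma=1.0, Q[state, action] = 0, reward = -1 and
        # max Q[state_, :] = 2 on a non-terminal step, q_target = -1 + 1.0*2 = 1,
        # q_delta = 1 - 0 = 1, and the entry is updated to 0 + 0.6*1 = 0.6.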


from gridworld import GridWorldEnv

if __name__ == "__main__":
    env = GridWorldEnv()
    n_games = 500
    agent = Agent(alpha=0.6,
                  gamma=1.0,
                  epsilon=0.5,
                  n_states=env.stateCount,
                  n_actions=env.actionCount)
    stateDict = env.stateDict
    scores = []

    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(stateDict.get(observation))
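            # The listing truncates the episode loop here. A plausible continuation
            # (a sketch, not the original code), assuming the usual gym-style
            # (observation, reward, done, info) return from env.step() and the same
            # stateDict lookup used above:
            observation_, reward, done, info = env.step(action)
            agent.learn(stateDict.get(observation), action, reward,
                        stateDict.get(observation_), done)
            score += reward
            observation = observation_
        scores.append(score)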
Example #5
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(n_width=12,
                   n_height=4,
                   u_size=60,
                   default_reward=-1,
                   default_type=0,
                   windy=False)

env.action_space = spaces.Discrete(4)

env.start = (0,0)
env.ends = [(11,0)]

for i in range(10):
    env.rewards.append((i+1,0,-100))
    env.ends.append((i+1,0))

env.types = [(5,1,1),(5,2,1)]

env.refresh_setting()

env.reset()

env.render()
input("press any key to continue...")
Example #6
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
    processor = lambda x: x
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds
    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd

    q_star_original = env.value_iteration()
    # pi_prob = gymEnv.extract_policy(q_star_original, temperature=0.05)
    # mu_prob = gymEnv.extract_policy(q_star_original, temperature=1)
    pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.15)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    mu = pi.copy()
    mu[:, 0] = pi[:, 1].copy()
    mu[:, 1] = pi[:, 2].copy()
    mu[:, 2] = pi[:, 3].copy()
    mu[:, 3] = pi[:, 0].copy()

    dpi, dpi_t, v_pi_s, P_pi = exact_calculation(env, pi, cfg.horizon,
                                                 cfg.gamma)
    dmu, dmu_t, vmu_s, P_mu = exact_calculation(env, mu, cfg.horizon,
                                                cfg.gamma)
    #! sanity check the loss objective
    #* verify the claim that L(w*, f) = 0 for all f, where (matching the code below)
    #* L(w,f) = \E_{(s,a,s')\sim d_mu}[ w(s) (gamma*rho(s,a)*f(s') - f(s)) ]
    #*          + 1/h \E_{s\sim d0}[f(s)] - 1/h gamma^horizon \E_{s\sim d_pi,H}[f(s)]
    # determine w_star
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)

    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])
    if RUN_SANITY_CHECK:

        def L(w, f):
            loss = 0
            for s in range(n_states):
                for a in range(n_actions):
                    for sn in range(n_states):
                        loss += w[s] * (-f[s] + gamma * pi[s, a] / mu[s, a] *
                                        f[sn]) * dmu[s] * mu[s, a] * P[s, a,
                                                                       sn]

            loss += 1 / horizon_normalization * np.sum(d0 * f)
            loss -= 1 / horizon_normalization * gamma**horizon * np.sum(
                dpi_H * f)
            return loss

        f = np.random.rand(n_states)
        loss = L(w_star, f)
        assert abs(loss) < 1e-8

        #! sanity check bellman and td error
        R_pi = np.sum(R * pi, axis=-1)
        bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        bellman_new = v_pi_s - np.dot(
            (np.identity(n_states) - np.linalg.matrix_power(
                gamma * P_pi, horizon)), R_pi) - gamma * np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    #* Generate multiple sets of behavior data from mu
    training_data = []
    training_data_processed = []
    for _ in range(cfg.n_experiments):
        print('Experiment:', _)
        print('------------------------')
        np.random.seed(seed_list[_])
        env.seed(seed_list[_])
        # behavior_data = rollout(env, mu, processor, absorbing_state, pi_e = pi, N=n_trajectories, T=horizon, frameskip=1, frameheight=1, path=None, filename='tmp',)
        behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
        behavior_data_processed = prepare_behavior_data(behavior_data)
        training_data.append(behavior_data)
        training_data_processed.append(behavior_data_processed)
        # pdb.set_trace()
    estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
        'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
    squared_error['IS'] = []
    squared_error['STEP IS'] = []
    squared_error['WIS'] = []
    squared_error['STEP WIS'] = []
    squared_error['Mu hat'] = []
    estimate['IH_SN'] = []
    squared_error['IH_SN'] = []
    estimate['IH_no_SN'] = []
    squared_error['IH_no_SN'] = []
    estimate['MB'] = []
    squared_error['MB'] = []

    ###* Loop over the baseline experiments
    for _ in range(cfg.n_experiments):
        behavior_data = training_data[_]
        behavior_data_processed = training_data_processed[_]

        IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
        step_IS = importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        WIS = weighted_importance_sampling_estimator(behavior_data, mu, pi,
                                                     gamma)
        step_WIS = weighted_importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        estimate['IS'].append(float(IS))
        squared_error['IS'].append(float((IS - v_pi)**2))
        estimate['STEP IS'].append(float(step_IS))
        squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
        estimate['WIS'].append(float(WIS))
        squared_error['WIS'].append(float((WIS - v_pi)**2))
        estimate['STEP WIS'].append(float(step_WIS))
        squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))
        MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
        estimate['MB'].append(float(MB))
        squared_error['MB'].append(float((MB - v_pi)**2))
        IH, IH_unnormalized = lihong_infinite_horizon(n_states, behavior_data,
                                                      mu, pi, gamma)
        estimate['IH_SN'].append(float(IH))
        squared_error['IH_SN'].append(float((IH - v_pi)**2))
        estimate['IH_no_SN'].append(float(IH_unnormalized))
        squared_error['IH_no_SN'].append(float((IH_unnormalized - v_pi)**2))

    display((estimate, squared_error))
    print('exp seed:', cfg.initial_seed)

    # pdb.set_trace()
    if RUN_SANITY_CHECK:
        #! Let's run some additional sanity checks
        #* check to see if bias formula checks out
        v_w = 0
        normalization = 0
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                v_w += w_star[s] * pi[s, a] / mu[s, a] * r * discounted_t
                normalization += discounted_t
                discounted_t *= gamma
        v_w = v_w / normalization

        on_policy_data, frequency, avg_reward = roll_out(
            env, pi, 4096, horizon)
        # pdb.set_trace()
        empirical_v_pi = np.zeros(n_states)
        empirical_d_pi = np.zeros(n_states)
        empirical_d0 = np.zeros(n_states)
        empirical_r_pi = np.zeros(n_states)
        empirical_frequency = np.zeros(n_states)
        empirical_P = np.zeros((n_states, n_actions, n_states))
        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        num_traj = len(on_policy_data)
        for trajectory in on_policy_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_v_pi[s] += r * discounted_t
                empirical_d_pi[s] += discounted_t
                # empirical_d0[s] += 1-discounted_t
                discounted_t *= gamma
                empirical_r_pi[s] += r
                empirical_frequency[s] += 1
                empirical_P[s, a, sn] += 1
        empirical_v_pi = empirical_v_pi / num_traj
        empirical_d_pi = empirical_d_pi / horizon_normalization / num_traj
        empirical_P = np.nan_to_num(empirical_P /
                                    np.sum(empirical_P, axis=-1)[:, :, None])
        # T = np.nan_to_num(T/np.sum(T, axis = -1)[:,:,None])
        empirical_r_pi = np.nan_to_num(empirical_r_pi / empirical_frequency)
        empirical_P_pi = np.einsum('san,sa->sn', empirical_P, pi)

        empirical_d_mu = np.zeros(n_states)
        num_traj = len(behavior_data)
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_d_mu[s] += discounted_t
                discounted_t *= gamma
        empirical_d_mu = empirical_d_mu / horizon_normalization / num_traj

        empirical_w = np.nan_to_num(empirical_d_pi / empirical_d_mu)
        empirical_loss = L(empirical_w, empirical_v_pi)

        empirical_bellman_original = 0
        empirical_bellman_new = 0
        empirical_td_error = 0
        num_traj = len(on_policy_data)
        empirical_r_pi_adjusted = np.dot(
            (np.identity(n_states) -
             np.linalg.matrix_power(gamma * empirical_P_pi, horizon)),
            empirical_r_pi)
        for trajectory in on_policy_data:
            discounted_t = 1.0
            for s, a, sn, r in trajectory:
                empirical_bellman_original += discounted_t * (
                    v_pi_s[s] - empirical_r_pi[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_bellman_new += discounted_t * (
                    v_pi_s[s] - empirical_r_pi_adjusted[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_td_error += discounted_t * (v_pi_s[s] - r -
                                                      gamma * v_pi_s[sn])**2
                discounted_t *= gamma
        empirical_td_error = empirical_td_error / horizon_normalization / num_traj
        empirical_bellman_original = empirical_bellman_original / horizon_normalization / num_traj
        empirical_bellman_new = empirical_bellman_new / horizon_normalization / num_traj
        # empirical_bellman_original = empirical_v_pi - empirical_r_pi - gamma*np.dot(empirical_P_pi, empirical_v_pi)

        # bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        # bellman_new = v_pi_s - np.dot((np.identity(n_states) - np.linalg.matrix_power(gamma*P_pi, horizon)),R_pi) - gamma*np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    for objective in cfg.objective:
        estimate[objective] = []
        squared_error[objective] = []
        objective_sn = objective + '-SN'
        estimate[objective_sn] = []
        squared_error[objective_sn] = []

    for i in range(cfg.n_experiments):
        training_set = training_data_processed[i]
        fixed_terminal_value = True
        logging = cfg.logging
        mvm = Tabular_State_MVM_Estimator(training_set,
                                          cfg,
                                          logging=logging,
                                          ground_truth=ground_truth_info)
        penalty = cfg.penalty_input

        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        # penalty_base = 1/mdp_calculator.horizon_normalization#/cfg.n_trajectories
        penalty_base = 1 / horizon_normalization
        mvm.set_random_seed(
            seed_list[i])  #different random seed per experiment
        mvm.solve_closed_form_bias()
        mvm.generate_random_v_class(cfg.v_class_cardinality)
        mvm.generate_random_w_class(cfg.v_class_cardinality)
        # mvm.bias_check()
        for objective in cfg.objective:
            mvm.set_random_seed(seed_list[i])
            # w_estimator = mvm.optimize_finite_class(objective = objective, td_penalty=penalty*penalty_base)
            # w_estimator = mvm.optimize_discrete(objective = objective, td_penalty=penalty*penalty_base)
            w_estimator = mvm.optimize(objective, td_penalty=0.1)
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic()
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic_adam(objective = objective, td_penalty=penalty*penalty_base)
            # w_estimator = mvm.optimize_closed_form()
            estimate[objective].append(float(w_estimator))
            # objective_sn = objective + '-SN'
            # estimate[objective_sn].append(float(w_estimator_sn))
            squared_error[objective].append(float(w_estimator - v_pi)**2)
            # squared_error[objective_sn].append(float(w_estimator_sn-v_pi)**2)
        display((estimate, squared_error))

    display((estimate, squared_error))
Example #7
# -*- coding: utf-8 -*-
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(n_width=12,
                   n_height=4,
                   u_size=60,
                   default_reward=-1,
                   default_type=0,
                   windy=False)
env.action_space = spaces.Discrete(4)  # set the number of actions
env.start = (0, 0)
env.ends = [(11, 0)]
# set the cliff
for i in range(10):
    env.rewards.append((i+1, 0, -100))   # special reward for cliff cells
    env.ends.append((i+1, 0))   # every cliff cell is a terminal state
# make the settings take effect
env.refresh_setting()
# initialize the environment
env.reset()
# show the UI
env.render()
# input("press any key to continue")
for _ in range(20000):
    env.render()
    a = env.action_space.sample()
    state, reward, isdone, info = env.step(a)
    print("{0}, {1}, {2}, {3}".format(a, reward, isdone, info))

print("env closed")
Example #8
import numpy as np
from gridworld import GridWorldEnv

env = GridWorldEnv()


def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI environment. env.P represents the transition probabilities of the environment.
        theta: Stopping threshold. If the value of all states changes less than theta
            in one iteration we are done.
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value of all actions in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value function to use as an estimator; vector of length env.nS.
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
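        # The listing truncates the helper here. A minimal completion in the usual
        # value-iteration style (a sketch, not the original code), assuming
        # env.P[state][action] is a list of (prob, next_state, reward, done) tuples:
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
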
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    # n_trajectories_list = cfg.n_trajectories
    # for n_trajectories in n_trajectories_list:
    # n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (
        1 - gamma) if gamma < 1 else horizon
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds
    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd
    q_star_original = env.value_iteration()
    pi = env.extract_policy(q_star_original, temperature=0.3)
    mu = env.extract_policy(q_star_original, temperature=0.1)
    # pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.3)
    # mu = env.extract_policy(q_star_original, temperature=0.15)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    #* 4 swapped cyclic
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,0].copy()
    #* D swapped with R, L swapped with U
    # mu = pi.copy(); mu[:,0] = pi[:,3].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy(); mu[:,3] = pi[:,0].copy()
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,4].copy();mu[:,4] = pi[:,5].copy();mu[:,5] = pi[:,0].copy()

    dpi, dpi_t, v_pi_s, q_pi_sa, P_pi = exact_calculation(
        env, pi, cfg.horizon, cfg.gamma)
    dmu, dmu_t, vmu_s, qmu_sa, P_mu = exact_calculation(
        env, mu, cfg.horizon, cfg.gamma)
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)

    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'q_pi': torch.tensor(q_pi_sa, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    results = {}
    results['trajectories'] = []
    results['IS'] = []
    results['IH'] = []
    results['MB'] = []
    results['WIS'] = []
    results['STEP WIS'] = []
    results['STEP IS'] = []
    results['True mu'] = []
    for objective in cfg.objective:
        results[objective] = []

    n_trajectories_list = cfg.n_trajectories
    for n_trajectories in n_trajectories_list:
        print('------------------------')
        #* Generate multiple sets of behavior data from mu
        training_data = []
        training_data_processed = []
        for _ in range(cfg.n_experiments):
            # print('Experiment:',_)
            # print('------------------------')
            np.random.seed(seed_list[_])
            env.seed(seed_list[_])
            behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
            behavior_data_processed = prepare_behavior_data(behavior_data)
            training_data.append(behavior_data)
            training_data_processed.append(behavior_data_processed)
        estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
            'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
        squared_error['IS'] = []
        squared_error['STEP IS'] = []
        squared_error['WIS'] = []
        squared_error['STEP WIS'] = []
        squared_error['Mu hat'] = []
        estimate['IH_SN'] = []
        squared_error['IH_SN'] = []
        estimate['IH_no_SN'] = []
        squared_error['IH_no_SN'] = []
        estimate['MB'] = []
        squared_error['MB'] = []
        ###* Loop over the baseline experiments
        for _ in range(cfg.n_experiments):
            behavior_data = training_data[_]
            behavior_data_processed = training_data_processed[_]

            IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
            step_IS = importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            WIS = weighted_importance_sampling_estimator(
                behavior_data, mu, pi, gamma)
            step_WIS = weighted_importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            estimate['IS'].append(float(IS))
            squared_error['IS'].append(float((IS - v_pi)**2))
            estimate['STEP IS'].append(float(step_IS))
            squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
            estimate['WIS'].append(float(WIS))
            squared_error['WIS'].append(float((WIS - v_pi)**2))
            estimate['STEP WIS'].append(float(step_WIS))
            squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))
            MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
            estimate['MB'].append(float(MB))
            squared_error['MB'].append(float((MB - v_pi)**2))
            IH, IH_unnormalized = lihong_infinite_horizon(
                n_states, behavior_data, mu, pi, gamma)
            estimate['IH_SN'].append(float(IH))
            squared_error['IH_SN'].append(float((IH - v_pi)**2))
            estimate['IH_no_SN'].append(float(IH_unnormalized))
            squared_error['IH_no_SN'].append(float(
                (IH_unnormalized - v_pi)**2))

        # display((estimate, squared_error))
        # print('exp seed:', cfg.initial_seed)
        # pdb.set_trace()
        results['trajectories'].append(np.log2(n_trajectories))
        results['IH'].append(
            np.log2(
                sum(squared_error['IH_SN']) / len(squared_error['IH_SN']) /
                v_pi**2))
        results['MB'].append(
            np.log2(
                sum(squared_error['MB']) / len(squared_error['MB']) /
                v_pi**2))
        results['IS'].append(
            np.log2(
                sum(squared_error['IS']) / len(squared_error['IS']) / v_pi**2))
        results['WIS'].append(
            np.log2(
                sum(squared_error['WIS']) / len(squared_error['WIS']) /
                v_pi**2))
        results['STEP WIS'].append(
            np.log2(
                sum(squared_error['STEP WIS']) /
                len(squared_error['STEP WIS']) / v_pi**2))
        results['STEP IS'].append(
            np.log2(
                sum(squared_error['STEP IS']) / len(squared_error['STEP IS']) /
                v_pi**2))
        results['True mu'].append(
            np.log2(
                sum(squared_error['True mu']) / len(squared_error['True mu']) /
                v_pi**2))

        for objective in cfg.objective:
            estimate[objective] = []
            squared_error[objective] = []

        # for i in range(cfg.n_experiments):
        #     training_set = training_data_processed[i]
        #     mvm = Tabular_State_MVM_Estimator(training_set, cfg, ground_truth = ground_truth_info)
        #     for objective in cfg.objective:
        #         mvm.set_random_seed(seed_list[i])
        #         w_estimator = mvm.optimize(objective)
        #         estimate[objective].append(float(w_estimator))
        #         squared_error[objective].append(float(w_estimator-v_pi)**2)
        #     display((estimate, squared_error))

        for i in range(cfg.n_experiments):
            training_set = training_data_processed[i]
            mvm = Tabular_State_MVM_Estimator(training_set,
                                              cfg,
                                              ground_truth=ground_truth_info)
            for objective in cfg.objective:
                mvm.set_random_seed(seed_list[i])
                w_estimator = mvm.optimize(objective)
                estimate[objective].append(float(w_estimator))
                squared_error[objective].append(float(w_estimator - v_pi)**2)
        # display((estimate, squared_error))
        for objective in cfg.objective:
            results[objective].append(
                np.log2(
                    sum(squared_error[objective]) /
                    len(squared_error[objective]) / v_pi**2))
        display((estimate, squared_error), n_trajectories)
        print('\n')
        print('End of one set of experiments')

    # pdb.set_trace()
    df = pd.DataFrame(results)
    # plt.plot(results['trajectories'], results['IH'],marker='o', markerfacecolor='blue', markersize=12, color='blue', linewidth=4)
    # plt.plot(results['trajectories'], results['MB'],marker='o', markerfacecolor='red', markersize=12, color='red', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP WIS'],marker='o', markerfacecolor='aqua', markersize=12, color='aqua', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP IS'],marker='o', markerfacecolor='orange', markersize=12, color='orange', linewidth=4)
    markersize = 8
    linewidth = 4
    plt.plot('trajectories',
             'STEP WIS',
             data=df,
             marker='o',
             markerfacecolor='slategrey',
             markersize=markersize,
             color='slategrey',
             linewidth=linewidth)
    plt.plot('trajectories',
             'STEP IS',
             data=df,
             marker='o',
             markerfacecolor='rosybrown',
             markersize=markersize,
             color='rosybrown',
             linewidth=linewidth)
    plt.plot('trajectories',
             'True mu',
             data=df,
             marker='o',
             markerfacecolor='black',
             markersize=markersize,
             color='black',
             linewidth=linewidth)
    # plt.plot('trajectories', 'MWL', data=df, marker='o', markerfacecolor='green', markersize=markersize, color='green', linewidth=linewidth)
    # plt.plot('trajectories', 'LSTDQ', data=df, marker='o', markerfacecolor='olive', markersize=markersize, color='olive', linewidth=linewidth)
    plt.plot('trajectories',
             'IH',
             data=df,
             marker='o',
             markerfacecolor='purple',
             markersize=markersize,
             color='purple',
             linewidth=linewidth)
    plt.plot('trajectories',
             'MB',
             data=df,
             marker='o',
             markerfacecolor='gold',
             markersize=markersize,
             color='gold',
             linewidth=linewidth)
    plt.plot('trajectories',
             'TD-ball center',
             data=df,
             marker='p',
             markerfacecolor='cadetblue',
             markersize=markersize,
             color='cadetblue',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias',
             data=df,
             marker='s',
             markerfacecolor='skyblue',
             markersize=markersize,
             color='skyblue',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias_td',
             data=df,
             marker='s',
             markerfacecolor='darkred',
             markersize=markersize,
             color='darkred',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias_td_var',
             data=df,
             marker='s',
             markerfacecolor='orange',
             markersize=markersize,
             color='orange',
             linewidth=linewidth)
    # plt.xticks(cfg.n_trajectories)
    plt.xticks(results['trajectories'])
    plt.xlabel('log number of trajectories')
    plt.ylabel('log MSE')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, 1.05),
               ncol=3,
               prop={'size': 8})
    plt.savefig('pi_03_mu_01_grid_misspecified_w.png')
    pdb.set_trace()