Example #1
    #     plt.tick_params(labelleft='off')

    # general title
    plt.suptitle("Delta-Iterations comparation",
                 fontsize=13,
                 fontweight=0,
                 color="black",
                 style="italic")
    plt.tight_layout()
    plt.show()
    # Axis title
    # plt.text(0.5, 0.02, 'Time', ha='center', va='center')
    # plt.text(0.06, 0.5, 'Note', ha='center', va='center', rotation='vertical')


env = GridWorldEnv()

print("Policy evaluation using policy evaluation")
state_values, _ = dp.policy_evaluation(env=env, discount_factor=0.9)
print("Value States-> \n", state_values)
# print_state_latex(state_values)

policy, state_values, deltas_value, t = dp.value_iteration(env,
                                                           discount_factor=0.9)

print("Optimal policy found using Value Iteration algorithm  [0.9] found in ")
print("Elapsed time: %.5f [sec]" % (t[0]))
print("CPU elapsed time: %.5f [sec]" % (t[1]))
env.render_policy(policy=policy)
print(state_values)
# print_state_latex(state_values)
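
# A minimal sketch (not part of the original example) of how the per-iteration
# deltas returned by value_iteration could feed the comparison plot set up at the
# top of this snippet, assuming deltas_value is a flat sequence of per-sweep
# maximum value changes:
import matplotlib.pyplot as plt

plt.plot(deltas_value)
plt.xlabel("Iteration")
plt.ylabel("Max delta")
plt.title("Delta-Iterations comparison")
plt.show()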
Example #2
num_iterations = 10000  # @param

initial_collect_steps = 1000  # @param
collect_steps_per_iteration = 1  # @param
replay_buffer_capacity = 100000  # @param

fc_layer_params = (100, )

batch_size = 128  # @param
learning_rate = 1e-5  # @param
log_interval = 200  # @param

num_eval_episodes = 2  # @param
eval_interval = 1000  # @param

train_py_env = wrappers.TimeLimit(GridWorldEnv(), duration=100)
eval_py_env = wrappers.TimeLimit(GridWorldEnv(), duration=100)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

q_net = q_network.QNetwork(train_env.observation_spec(),
                           train_env.action_spec(),
                           fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

# The listing cuts this call off after time_step_spec(); the remaining arguments
# below follow the standard TF-Agents DQN setup and are an assumed completion,
# not necessarily the original file's exact configuration.
tf_agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    train_step_counter=train_step_counter)
tf_agent.initialize()
Example #3
# https://blog.csdn.net/c602273091/article/details/79008755
import gym
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(
    n_width=12,  # number of cells horizontally
    n_height=4,  # number of cells vertically
    u_size=60,  # cell size in pixels; adjust to taste
    default_reward=-1,  # default immediate reward of a cell
    default_type=0)  # by default every cell can be entered
env.action_space = spaces.Discrete(4)  # set the number of actions
# By default the grid-world environment uses 0 for left, 1 for right, 2 for up,
# 3 for down; 4, 5, 6, 7 are diagonal moves. See the definition inside _step.
# The observation space does not need to be set explicitly; it is computed
# automatically from the number of cells passed in.
env.start = (0, 0)
env.ends = [(11, 0)]
for i in range(10):
    env.rewards.append((i + 1, 0, -100))
    env.ends.append((i + 1, 0))
env.types = [(5, 1, 1), (5, 2, 1)]
env.refresh_setting()
env._reset()
env._render()
input("press any key to continue...")
Example #4
        return action

    def learn(self, state, action, reward, state_, done):
        # Tabular Q-learning update: bootstrap from the greedy action in the next
        # state only while the episode is not done.
        q_target = reward + self.gamma * np.max(self.q_table[state_, :]) * (1 - done)
        q_delta = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * q_delta

        # Decay epsilon until it reaches its floor.
        self.epsilon = self.epsilon * self.epsilon_dec if self.epsilon > self.epsilon_min else self.epsilon_min
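        # Worked example with hypothetical numbers (not from the original code):
        # with alpha=0.6, gamma=1.0, Q[state, action] = 0, reward = -1 and
        # max Q[state_, :] = 2 on a non-terminal step, q_target = -1 + 1.0*2 = 1,
        # q_delta = 1 - 0 = 1, and the entry is updated to 0 + 0.6*1 = 0.6.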


from gridworld import GridWorldEnv

if __name__ == "__main__":
    env = GridWorldEnv()
    n_games = 500
    agent = Agent(alpha=0.6,
                  gamma=1.0,
                  epsilon=0.5,
                  n_states=env.stateCount,
                  n_actions=env.actionCount)
    stateDict = env.stateDict
    scores = []

    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(stateDict.get(observation))
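            # The listing truncates the episode loop here. A plausible continuation
            # (a sketch, not the original code), assuming the usual gym-style
            # (observation, reward, done, info) return from env.step() and the same
            # stateDict lookup used above:
            observation_, reward, done, info = env.step(action)
            agent.learn(stateDict.get(observation), action, reward,
                        stateDict.get(observation_), done)
            score += reward
            observation = observation_
        scores.append(score)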
Example #5
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(n_width=12,
                   n_height=4,
                   u_size=60,
                   default_reward=-1,
                   default_type=0,
                   windy=False)

env.action_space = spaces.Discrete(4)

env.start = (0,0)
env.ends = [(11,0)]

for i in range(10):
    env.rewards.append((i+1,0,-100))
    env.ends.append((i+1,0))

env.types = [(5,1,1),(5,2,1)]

env.refresh_setting()

env.reset()

env.render()
input("press any key to continue...")
Example #6
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
    processor = lambda x: x
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds
    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd

    q_star_original = env.value_iteration()
    # pi_prob = gymEnv.extract_policy(q_star_original, temperature=0.05)
    # mu_prob = gymEnv.extract_policy(q_star_original, temperature=1)
    pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.15)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    mu = pi.copy()
    mu[:, 0] = pi[:, 1].copy()
    mu[:, 1] = pi[:, 2].copy()
    mu[:, 2] = pi[:, 3].copy()
    mu[:, 3] = pi[:, 0].copy()

    dpi, dpi_t, v_pi_s, P_pi = exact_calculation(env, pi, cfg.horizon,
                                                 cfg.gamma)
    dmu, dmu_t, vmu_s, P_mu = exact_calculation(env, mu, cfg.horizon,
                                                cfg.gamma)
    #! sanity check the loss objective
    #* verify the claim that L(w*, f) = 0 for all f, where (matching the code below)
    #* L(w,f) = \E_{(s,a,s')\sim d_mu}[ w(s) (gamma*rho(s,a)*f(s') - f(s)) ]
    #*          + 1/h \E_{s\sim d0}[f(s)] - 1/h gamma^horizon \E_{s\sim d_pi,H}[f(s)]
    # determine w_star
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)

    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])
    if RUN_SANITY_CHECK:

        def L(w, f):
            loss = 0
            for s in range(n_states):
                for a in range(n_actions):
                    for sn in range(n_states):
                        loss += w[s] * (-f[s] + gamma * pi[s, a] / mu[s, a] *
                                        f[sn]) * dmu[s] * mu[s, a] * P[s, a,
                                                                       sn]

            loss += 1 / horizon_normalization * np.sum(d0 * f)
            loss -= 1 / horizon_normalization * gamma**horizon * np.sum(
                dpi_H * f)
            return loss

        f = np.random.rand(n_states)
        loss = L(w_star, f)
        assert abs(loss) < 1e-8

        #! sanity check bellman and td error
        R_pi = np.sum(R * pi, axis=-1)
        bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        bellman_new = v_pi_s - np.dot(
            (np.identity(n_states) - np.linalg.matrix_power(
                gamma * P_pi, horizon)), R_pi) - gamma * np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    #* Generate multiple sets of behavior data from mu
    training_data = []
    training_data_processed = []
    for _ in range(cfg.n_experiments):
        print('Experiment:', _)
        print('------------------------')
        np.random.seed(seed_list[_])
        env.seed(seed_list[_])
        # behavior_data = rollout(env, mu, processor, absorbing_state, pi_e = pi, N=n_trajectories, T=horizon, frameskip=1, frameheight=1, path=None, filename='tmp',)
        behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
        behavior_data_processed = prepare_behavior_data(behavior_data)
        training_data.append(behavior_data)
        training_data_processed.append(behavior_data_processed)
        # pdb.set_trace()
    estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
        'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
    squared_error['IS'] = []
    squared_error['STEP IS'] = []
    squared_error['WIS'] = []
    squared_error['STEP WIS'] = []
    squared_error['Mu hat'] = []
    estimate['IH_SN'] = []
    squared_error['IH_SN'] = []
    estimate['IH_no_SN'] = []
    squared_error['IH_no_SN'] = []
    estimate['MB'] = []
    squared_error['MB'] = []

    ###* Loop over the baseline experiments
    for _ in range(cfg.n_experiments):
        behavior_data = training_data[_]
        behavior_data_processed = training_data_processed[_]

        IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
        step_IS = importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        WIS = weighted_importance_sampling_estimator(behavior_data, mu, pi,
                                                     gamma)
        step_WIS = weighted_importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        estimate['IS'].append(float(IS))
        squared_error['IS'].append(float((IS - v_pi)**2))
        estimate['STEP IS'].append(float(step_IS))
        squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
        estimate['WIS'].append(float(WIS))
        squared_error['WIS'].append(float((WIS - v_pi)**2))
        estimate['STEP WIS'].append(float(step_WIS))
        squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))
        MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
        estimate['MB'].append(float(MB))
        squared_error['MB'].append(float((MB - v_pi)**2))
        IH, IH_unnormalized = lihong_infinite_horizon(n_states, behavior_data,
                                                      mu, pi, gamma)
        estimate['IH_SN'].append(float(IH))
        squared_error['IH_SN'].append(float((IH - v_pi)**2))
        estimate['IH_no_SN'].append(float(IH_unnormalized))
        squared_error['IH_no_SN'].append(float((IH_unnormalized - v_pi)**2))

    display((estimate, squared_error))
    print('exp seed:', cfg.initial_seed)

    # pdb.set_trace()
    if RUN_SANITY_CHECK:
        #! Let's run some additional sanity checks
        #* check to see if bias formula checks out
        v_w = 0
        normalization = 0
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                v_w += w_star[s] * pi[s, a] / mu[s, a] * r * discounted_t
                normalization += discounted_t
                discounted_t *= gamma
        v_w = v_w / normalization

        on_policy_data, frequency, avg_reward = roll_out(
            env, pi, 4096, horizon)
        # pdb.set_trace()
        empirical_v_pi = np.zeros(n_states)
        empirical_d_pi = np.zeros(n_states)
        empirical_d0 = np.zeros(n_states)
        empirical_r_pi = np.zeros(n_states)
        empirical_frequency = np.zeros(n_states)
        empirical_P = np.zeros((n_states, n_actions, n_states))
        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        num_traj = len(on_policy_data)
        for trajectory in on_policy_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_v_pi[s] += r * discounted_t
                empirical_d_pi[s] += discounted_t
                # empirical_d0[s] += 1-discounted_t
                discounted_t *= gamma
                empirical_r_pi[s] += r
                empirical_frequency[s] += 1
                empirical_P[s, a, sn] += 1
        empirical_v_pi = empirical_v_pi / num_traj
        empirical_d_pi = empirical_d_pi / horizon_normalization / num_traj
        empirical_P = np.nan_to_num(empirical_P /
                                    np.sum(empirical_P, axis=-1)[:, :, None])
        # T = np.nan_to_num(T/np.sum(T, axis = -1)[:,:,None])
        empirical_r_pi = np.nan_to_num(empirical_r_pi / empirical_frequency)
        empirical_P_pi = np.einsum('san,sa->sn', empirical_P, pi)

        empirical_d_mu = np.zeros(n_states)
        num_traj = len(behavior_data)
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_d_mu[s] += discounted_t
                discounted_t *= gamma
        empirical_d_mu = empirical_d_mu / horizon_normalization / num_traj

        empirical_w = np.nan_to_num(empirical_d_pi / empirical_d_mu)
        empirical_loss = L(empirical_w, empirical_v_pi)

        empirical_bellman_original = 0
        empirical_bellman_new = 0
        empirical_td_error = 0
        num_traj = len(on_policy_data)
        empirical_r_pi_adjusted = np.dot(
            (np.identity(n_states) -
             np.linalg.matrix_power(gamma * empirical_P_pi, horizon)),
            empirical_r_pi)
        for trajectory in on_policy_data:
            discounted_t = 1.0
            for s, a, sn, r in trajectory:
                empirical_bellman_original += discounted_t * (
                    v_pi_s[s] - empirical_r_pi[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_bellman_new += discounted_t * (
                    v_pi_s[s] - empirical_r_pi_adjusted[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_td_error += discounted_t * (v_pi_s[s] - r -
                                                      gamma * v_pi_s[sn])**2
                discounted_t *= gamma
        empirical_td_error = empirical_td_error / horizon_normalization / num_traj
        empirical_bellman_original = empirical_bellman_original / horizon_normalization / num_traj
        empirical_bellman_new = empirical_bellman_new / horizon_normalization / num_traj
        # empirical_bellman_original = empirical_v_pi - empirical_r_pi - gamma*np.dot(empirical_P_pi, empirical_v_pi)

        # bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        # bellman_new = v_pi_s - np.dot((np.identity(n_states) - np.linalg.matrix_power(gamma*P_pi, horizon)),R_pi) - gamma*np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    for objective in cfg.objective:
        estimate[objective] = []
        squared_error[objective] = []
        objective_sn = objective + '-SN'
        estimate[objective_sn] = []
        squared_error[objective_sn] = []

    for i in range(cfg.n_experiments):
        training_set = training_data_processed[i]
        fixed_terminal_value = True
        logging = cfg.logging
        mvm = Tabular_State_MVM_Estimator(training_set,
                                          cfg,
                                          logging=logging,
                                          ground_truth=ground_truth_info)
        penalty = cfg.penalty_input

        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        # penalty_base = 1/mdp_calculator.horizon_normalization#/cfg.n_trajectories
        penalty_base = 1 / horizon_normalization
        mvm.set_random_seed(
            seed_list[i])  #different random seed per experiment
        mvm.solve_closed_form_bias()
        mvm.generate_random_v_class(cfg.v_class_cardinality)
        mvm.generate_random_w_class(cfg.v_class_cardinality)
        # mvm.bias_check()
        for objective in cfg.objective:
            mvm.set_random_seed(seed_list[i])
            # w_estimator = mvm.optimize_finite_class(objective = objective, td_penalty=penalty*penalty_base)
            # w_estimator = mvm.optimize_discrete(objective = objective, td_penalty=penalty*penalty_base)
            w_estimator = mvm.optimize(objective, td_penalty=0.1)
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic()
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic_adam(objective = objective, td_penalty=penalty*penalty_base)
            # w_estimator = mvm.optimize_closed_form()
            estimate[objective].append(float(w_estimator))
            # objective_sn = objective + '-SN'
            # estimate[objective_sn].append(float(w_estimator_sn))
            squared_error[objective].append(float(w_estimator - v_pi)**2)
            # squared_error[objective_sn].append(float(w_estimator_sn-v_pi)**2)
        display((estimate, squared_error))

    display((estimate, squared_error))
Example #7
# -*- coding: utf-8 -*-
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(n_width=12,
                   n_height=4,
                   u_size=60,
                   default_reward=-1,
                   default_type=0,
                   windy=False)
env.action_space = spaces.Discrete(4)  # set the number of actions
env.start = (0, 0)
env.ends = [(11, 0)]
# set the cliff
for i in range(10):
    env.rewards.append((i+1, 0, -100))   # special reward for cliff cells
    env.ends.append((i+1, 0))   # every cliff cell is a terminal state
# make the settings take effect
env.refresh_setting()
# initialize the environment
env.reset()
# show the UI
env.render()
# input("press any key to continue")
for _ in range(20000):
    env.render()
    a = env.action_space.sample()
    state, reward, isdone, info = env.step(a)
    print("{0}, {1}, {2}, {3}".format(a, reward, isdone, info))

print("env closed")
Example #8
import numpy as np
from gridworld import GridWorldEnv

env = GridWorldEnv()


def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI environment. env.P represents the transition probabilities of the environment.
        theta: Stopping threshold. If the value of all states changes less than theta
            in one iteration we are done.
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value of all actions in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value function to use as an estimator; vector of length env.nS.
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
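        # The listing truncates the helper here. A minimal completion in the usual
        # value-iteration style (a sketch, not the original code), assuming
        # env.P[state][action] is a list of (prob, next_state, reward, done) tuples:
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
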
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    # n_trajectories_list = cfg.n_trajectories
    # for n_trajectories in n_trajectories_list:
    # n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (
        1 - gamma) if gamma < 1 else horizon
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds
    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd
    q_star_original = env.value_iteration()
    pi = env.extract_policy(q_star_original, temperature=0.3)
    mu = env.extract_policy(q_star_original, temperature=0.1)
    # pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.3)
    # mu = env.extract_policy(q_star_original, temperature=0.15)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    #* 4 swapped cyclic
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,0].copy()
    #* D swapped with R, L swapped with U
    # mu = pi.copy(); mu[:,0] = pi[:,3].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy(); mu[:,3] = pi[:,0].copy()
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,4].copy();mu[:,4] = pi[:,5].copy();mu[:,5] = pi[:,0].copy()

    dpi, dpi_t, v_pi_s, q_pi_sa, P_pi = exact_calculation(
        env, pi, cfg.horizon, cfg.gamma)
    dmu, dmu_t, vmu_s, qmu_sa, P_mu = exact_calculation(
        env, mu, cfg.horizon, cfg.gamma)
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)

    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'q_pi': torch.tensor(q_pi_sa, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    results = {}
    results['trajectories'] = []
    results['IS'] = []
    results['IH'] = []
    results['MB'] = []
    results['WIS'] = []
    results['STEP WIS'] = []
    results['STEP IS'] = []
    results['True mu'] = []
    for objective in cfg.objective:
        results[objective] = []

    n_trajectories_list = cfg.n_trajectories
    for n_trajectories in n_trajectories_list:
        print('------------------------')
        #* Generate multiple sets of behavior data from mu
        training_data = []
        training_data_processed = []
        for _ in range(cfg.n_experiments):
            # print('Experiment:',_)
            # print('------------------------')
            np.random.seed(seed_list[_])
            env.seed(seed_list[_])
            behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
            behavior_data_processed = prepare_behavior_data(behavior_data)
            training_data.append(behavior_data)
            training_data_processed.append(behavior_data_processed)
        estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
            'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
        squared_error['IS'] = []
        squared_error['STEP IS'] = []
        squared_error['WIS'] = []
        squared_error['STEP WIS'] = []
        squared_error['Mu hat'] = []
        estimate['IH_SN'] = []
        squared_error['IH_SN'] = []
        estimate['IH_no_SN'] = []
        squared_error['IH_no_SN'] = []
        estimate['MB'] = []
        squared_error['MB'] = []
        ###* Loop over the baseline experiments
        for _ in range(cfg.n_experiments):
            behavior_data = training_data[_]
            behavior_data_processed = training_data_processed[_]

            IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
            step_IS = importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            WIS = weighted_importance_sampling_estimator(
                behavior_data, mu, pi, gamma)
            step_WIS = weighted_importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            estimate['IS'].append(float(IS))
            squared_error['IS'].append(float((IS - v_pi)**2))
            estimate['STEP IS'].append(float(step_IS))
            squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
            estimate['WIS'].append(float(WIS))
            squared_error['WIS'].append(float((WIS - v_pi)**2))
            estimate['STEP WIS'].append(float(step_WIS))
            squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))
            MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
            estimate['MB'].append(float(MB))
            squared_error['MB'].append(float((MB - v_pi)**2))
            IH, IH_unnormalized = lihong_infinite_horizon(
                n_states, behavior_data, mu, pi, gamma)
            estimate['IH_SN'].append(float(IH))
            squared_error['IH_SN'].append(float((IH - v_pi)**2))
            estimate['IH_no_SN'].append(float(IH_unnormalized))
            squared_error['IH_no_SN'].append(float(
                (IH_unnormalized - v_pi)**2))

        # display((estimate, squared_error))
        # print('exp seed:', cfg.initial_seed)
        # pdb.set_trace()
        results['trajectories'].append(np.log2(n_trajectories))
        results['IH'].append(
            np.log2(
                sum(squared_error['IH_SN']) / len(squared_error['IH_SN']) /
                v_pi**2))
        results['MB'].append(
            np.log2(
                sum(squared_error['MB']) / len(squared_error['MB']) /
                v_pi**2))
        results['IS'].append(
            np.log2(
                sum(squared_error['IS']) / len(squared_error['IS']) / v_pi**2))
        results['WIS'].append(
            np.log2(
                sum(squared_error['WIS']) / len(squared_error['WIS']) /
                v_pi**2))
        results['STEP WIS'].append(
            np.log2(
                sum(squared_error['STEP WIS']) /
                len(squared_error['STEP WIS']) / v_pi**2))
        results['STEP IS'].append(
            np.log2(
                sum(squared_error['STEP IS']) / len(squared_error['STEP IS']) /
                v_pi**2))
        results['True mu'].append(
            np.log2(
                sum(squared_error['True mu']) / len(squared_error['True mu']) /
                v_pi**2))

        for objective in cfg.objective:
            estimate[objective] = []
            squared_error[objective] = []

        # for i in range(cfg.n_experiments):
        #     training_set = training_data_processed[i]
        #     mvm = Tabular_State_MVM_Estimator(training_set, cfg, ground_truth = ground_truth_info)
        #     for objective in cfg.objective:
        #         mvm.set_random_seed(seed_list[i])
        #         w_estimator = mvm.optimize(objective)
        #         estimate[objective].append(float(w_estimator))
        #         squared_error[objective].append(float(w_estimator-v_pi)**2)
        #     display((estimate, squared_error))

        for i in range(cfg.n_experiments):
            training_set = training_data_processed[i]
            mvm = Tabular_State_MVM_Estimator(training_set,
                                              cfg,
                                              ground_truth=ground_truth_info)
            for objective in cfg.objective:
                mvm.set_random_seed(seed_list[i])
                w_estimator = mvm.optimize(objective)
                estimate[objective].append(float(w_estimator))
                squared_error[objective].append(float(w_estimator - v_pi)**2)
        # display((estimate, squared_error))
        for objective in cfg.objective:
            results[objective].append(
                np.log2(
                    sum(squared_error[objective]) /
                    len(squared_error[objective]) / v_pi**2))
        display((estimate, squared_error), n_trajectories)
        print('\n')
        print('End of one set of experiments')

    # pdb.set_trace()
    df = pd.DataFrame(results)
    # plt.plot(results['trajectories'], results['IH'],marker='o', markerfacecolor='blue', markersize=12, color='blue', linewidth=4)
    # plt.plot(results['trajectories'], results['MB'],marker='o', markerfacecolor='red', markersize=12, color='red', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP WIS'],marker='o', markerfacecolor='aqua', markersize=12, color='aqua', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP IS'],marker='o', markerfacecolor='orange', markersize=12, color='orange', linewidth=4)
    markersize = 8
    linewidth = 4
    plt.plot('trajectories',
             'STEP WIS',
             data=df,
             marker='o',
             markerfacecolor='slategrey',
             markersize=markersize,
             color='slategrey',
             linewidth=linewidth)
    plt.plot('trajectories',
             'STEP IS',
             data=df,
             marker='o',
             markerfacecolor='rosybrown',
             markersize=markersize,
             color='rosybrown',
             linewidth=linewidth)
    plt.plot('trajectories',
             'True mu',
             data=df,
             marker='o',
             markerfacecolor='black',
             markersize=markersize,
             color='black',
             linewidth=linewidth)
    # plt.plot('trajectories', 'MWL', data=df, marker='o', markerfacecolor='green', markersize=markersize, color='green', linewidth=linewidth)
    # plt.plot('trajectories', 'LSTDQ', data=df, marker='o', markerfacecolor='olive', markersize=markersize, color='olive', linewidth=linewidth)
    plt.plot('trajectories',
             'IH',
             data=df,
             marker='o',
             markerfacecolor='purple',
             markersize=markersize,
             color='purple',
             linewidth=linewidth)
    plt.plot('trajectories',
             'MB',
             data=df,
             marker='o',
             markerfacecolor='gold',
             markersize=markersize,
             color='gold',
             linewidth=linewidth)
    plt.plot('trajectories',
             'TD-ball center',
             data=df,
             marker='p',
             markerfacecolor='cadetblue',
             markersize=markersize,
             color='cadetblue',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias',
             data=df,
             marker='s',
             markerfacecolor='skyblue',
             markersize=markersize,
             color='skyblue',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias_td',
             data=df,
             marker='s',
             markerfacecolor='darkred',
             markersize=markersize,
             color='darkred',
             linewidth=linewidth)
    plt.plot('trajectories',
             'bias_td_var',
             data=df,
             marker='s',
             markerfacecolor='orange',
             markersize=markersize,
             color='orange',
             linewidth=linewidth)
    # plt.xticks(cfg.n_trajectories)
    plt.xticks(results['trajectories'])
    plt.xlabel('log number of trajectories')
    plt.ylabel('log MSE')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, 1.05),
               ncol=3,
               prop={'size': 8})
    plt.savefig('pi_03_mu_01_grid_misspecified_w.png')
    pdb.set_trace()