# plt.tick_params(labelleft='off')

# general title
plt.suptitle("Delta-Iterations comparison", fontsize=13, fontweight=0,
             color="black", style="italic")
plt.tight_layout()
plt.show()

# Axis title
# plt.text(0.5, 0.02, 'Time', ha='center', va='center')
# plt.text(0.06, 0.5, 'Note', ha='center', va='center', rotation='vertical')

env = GridWorldEnv()

print("State values computed with policy evaluation")
state_values, _ = dp.policy_evaluation(env=env, discount_factor=0.9)
print("State values ->\n", state_values)
# print_state_latex(state_values)

policy, state_values, deltas_value, t = dp.value_iteration(env, discount_factor=0.9)
print("Optimal policy found using the Value Iteration algorithm (discount factor 0.9) in:")
print("Elapsed time: %.5f [sec]" % t[0])
print("CPU elapsed time: %.5f [sec]" % t[1])
env.render_policy(policy=policy)
print(state_values)
# print_state_latex(state_values)
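# A minimal sketch (not part of the original script) of how the
# "Delta-Iterations comparison" figure above could be produced, assuming
# `dp.value_iteration` returns one max-delta value per sweep in
# `deltas_value`, as the call above suggests.
import matplotlib.pyplot as plt

def plot_deltas(deltas, discount_factor):
    """Plot the per-iteration max value change (delta) of value iteration."""
    plt.plot(range(1, len(deltas) + 1), deltas,
             label="discount factor = %.1f" % discount_factor)
    plt.xlabel("Iteration")
    plt.ylabel("Max value change (delta)")
    plt.legend()

# Example usage with the results computed above:
# plot_deltas(deltas_value, 0.9)
# plt.suptitle("Delta-Iterations comparison")
# plt.show()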
num_iterations = 10000  # @param
initial_collect_steps = 1000  # @param
collect_steps_per_iteration = 1  # @param
replay_buffer_capacity = 100000  # @param

fc_layer_params = (100,)

batch_size = 128  # @param
learning_rate = 1e-5  # @param
log_interval = 200  # @param

num_eval_episodes = 2  # @param
eval_interval = 1000  # @param

train_py_env = wrappers.TimeLimit(GridWorldEnv(), duration=100)
eval_py_env = wrappers.TimeLimit(GridWorldEnv(), duration=100)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

q_net = q_network.QNetwork(train_env.observation_spec(),
                           train_env.action_spec(),
                           fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter = tf.compat.v2.Variable(0)

tf_agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
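    # NOTE: the original excerpt is truncated mid-call here. The remaining
    # arguments below follow the standard TF-Agents DqnAgent pattern and are
    # an assumption, not recovered from the source file.
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    # assumes: from tf_agents.utils import common
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
tf_agent.initialize()

# A minimal, assumed continuation (a sketch, not the original code): a uniform
# replay buffer sized by the hyperparameters defined above.
from tf_agents.replay_buffers import tf_uniform_replay_buffer

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity)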
# https://blog.csdn.net/c602273091/article/details/79008755
import gym
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(
    n_width=12,         # number of grid cells in the horizontal direction
    n_height=4,         # number of grid cells in the vertical direction
    u_size=60,          # cell size in pixels; adjust to taste
    default_reward=-1,  # default immediate reward of a cell
    default_type=0)     # by default every cell can be entered

env.action_space = spaces.Discrete(4)  # set the size of the action space
# By default the grid-world environment class uses 0: left, 1: right, 2: up,
# 3: down, and 4-7 for diagonal moves; see the definition inside _step for
# details. The observation space does not need to be set explicitly; it is
# derived automatically from the number of grid cells passed in.
env.start = (0, 0)
env.ends = [(11, 0)]
for i in range(10):
    env.rewards.append((i + 1, 0, -100))
    env.ends.append((i + 1, 0))
env.types = [(5, 1, 1), (5, 2, 1)]
env.refresh_setting()
env._reset()
env._render()
input("press any key to continue...")
        return action

    def learn(self, state, action, reward, state_, done):
        # note: `done` is used as a continuation mask here, i.e. it is expected
        # to be 0 when `state_` is terminal and 1 otherwise
        q_target = reward + self.gamma * np.max(self.q_table[state_, :]) * done
        q_delta = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.alpha * q_delta
        self.epsilon = (self.epsilon * self.epsilon_dec
                        if self.epsilon > self.epsilon_min else self.epsilon_min)


from gridworld import GridWorldEnv

if __name__ == "__main__":
    env = GridWorldEnv()
    n_games = 500
    agent = Agent(alpha=0.6, gamma=1.0, epsilon=0.5,
                  n_states=env.stateCount, n_actions=env.actionCount)
    stateDict = env.stateDict
    scores = []
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(stateDict.get(observation))
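            # The excerpt is truncated here. A minimal sketch of how the inner
            # loop might continue, assuming GridWorldEnv follows the classic
            # gym step API and that stateDict maps observations to table
            # indices (both assumptions, not shown in the original):
            observation_, reward, done, info = env.step(action)
            agent.learn(stateDict.get(observation), action, reward,
                        stateDict.get(observation_), 1 - int(done))
            score += reward
            observation = observation_
        scores.append(score)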
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(n_width=12,
                   n_height=4,
                   u_size=60,
                   default_reward=-1,
                   default_type=0,
                   windy=False)
env.action_space = spaces.Discrete(4)
env.start = (0, 0)
env.ends = [(11, 0)]
for i in range(10):
    env.rewards.append((i + 1, 0, -100))
    env.ends.append((i + 1, 0))
env.types = [(5, 1, 1), (5, 2, 1)]
env.refresh_setting()
env.reset()
env.render()
input("press any key to continue...")
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
    processor = lambda x: x
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds

    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd

    q_star_original = env.value_iteration()
    # pi_prob = gymEnv.extract_policy(q_star_original, temperature=0.05)
    # mu_prob = gymEnv.extract_policy(q_star_original, temperature=1)
    pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.15)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    mu = pi.copy()
    mu[:, 0] = pi[:, 1].copy()
    mu[:, 1] = pi[:, 2].copy()
    mu[:, 2] = pi[:, 3].copy()
    mu[:, 3] = pi[:, 0].copy()

    dpi, dpi_t, v_pi_s, P_pi = exact_calculation(env, pi, cfg.horizon, cfg.gamma)
    dmu, dmu_t, vmu_s, P_mu = exact_calculation(env, mu, cfg.horizon, cfg.gamma)

    #! sanity check the loss objective
    #* verify the claim that L(w*, f) = 0 for all f, where
    #* L(w, f) = E_{(s,a,s') ~ d_mu}[ w(s) * (gamma * rho(s,a) * f(s') - f(s)) ]
    #*           + 1/h * E_{s ~ d0}[f(s)] - 1/h * gamma^horizon * E_{s ~ d_pi,H}[f(s)]
    # determine w_star
    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)
    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])

    if RUN_SANITY_CHECK:

        def L(w, f):
            loss = 0
            for s in range(n_states):
                for a in range(n_actions):
                    for sn in range(n_states):
                        loss += (w[s] * (-f[s] + gamma * pi[s, a] / mu[s, a] * f[sn])
                                 * dmu[s] * mu[s, a] * P[s, a, sn])
            loss += 1 / horizon_normalization * np.sum(d0 * f)
            loss -= 1 / horizon_normalization * gamma**horizon * np.sum(dpi_H * f)
            return loss

        f = np.random.rand(n_states)
        loss = L(w_star, f)
        assert abs(loss) < 1e-8
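        # A vectorized equivalent of the triple loop above (a sketch, not part
        # of the original file); it reuses the dmu, mu, P, pi, gamma, d0,
        # dpi_H and horizon_normalization already in scope.
        def L_vectorized(w, f):
            rho = np.nan_to_num(pi / mu)                      # importance ratios rho(s, a)
            joint = dmu[:, None, None] * mu[:, :, None] * P   # d_mu(s) * mu(a|s) * P(s'|s,a)
            out = np.einsum('san,s,sa,n->', joint, w, rho, gamma * f)
            out -= np.einsum('san,s,s->', joint, w, f)
            out += np.sum(d0 * f) / horizon_normalization
            out -= gamma**horizon * np.sum(dpi_H * f) / horizon_normalization
            return out

        # should agree with the loop-based L up to floating-point error:
        # assert abs(L_vectorized(w_star, f) - loss) < 1e-8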
        #! sanity check bellman and td error
        R_pi = np.sum(R * pi, axis=-1)
        bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        bellman_new = v_pi_s - np.dot(
            (np.identity(n_states) -
             np.linalg.matrix_power(gamma * P_pi, horizon)), R_pi) \
            - gamma * np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    #* Generate multiple sets of behavior data from mu
    training_data = []
    training_data_processed = []
    for _ in range(cfg.n_experiments):
        print('Experiment:', _)
        print('------------------------')
        np.random.seed(seed_list[_])
        env.seed(seed_list[_])
        # behavior_data = rollout(env, mu, processor, absorbing_state, pi_e=pi,
        #                         N=n_trajectories, T=horizon, frameskip=1,
        #                         frameheight=1, path=None, filename='tmp',)
        behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
        behavior_data_processed = prepare_behavior_data(behavior_data)
        training_data.append(behavior_data)
        training_data_processed.append(behavior_data_processed)

    # pdb.set_trace()
    estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
        'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
    squared_error['IS'] = []
    squared_error['STEP IS'] = []
    squared_error['WIS'] = []
    squared_error['STEP WIS'] = []
    squared_error['Mu hat'] = []
    estimate['IH_SN'] = []
    squared_error['IH_SN'] = []
    estimate['IH_no_SN'] = []
    squared_error['IH_no_SN'] = []
    estimate['MB'] = []
    squared_error['MB'] = []

    ###* Looping over the number of baseline experiments
    for _ in range(cfg.n_experiments):
        behavior_data = training_data[_]
        behavior_data_processed = training_data_processed[_]

        IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
        step_IS = importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)
        WIS = weighted_importance_sampling_estimator(behavior_data, mu, pi, gamma)
        step_WIS = weighted_importance_sampling_estimator_stepwise(
            behavior_data, mu, pi, gamma)

        estimate['IS'].append(float(IS))
        squared_error['IS'].append(float((IS - v_pi)**2))
        estimate['STEP IS'].append(float(step_IS))
        squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
        estimate['WIS'].append(float(WIS))
        squared_error['WIS'].append(float((WIS - v_pi)**2))
        estimate['STEP WIS'].append(float(step_WIS))
        squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))

        MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
        estimate['MB'].append(float(MB))
        squared_error['MB'].append(float((MB - v_pi)**2))

        IH, IH_unnormalized = lihong_infinite_horizon(n_states, behavior_data,
                                                      mu, pi, gamma)
        estimate['IH_SN'].append(float(IH))
        squared_error['IH_SN'].append(float((IH - v_pi)**2))
        estimate['IH_no_SN'].append(float(IH_unnormalized))
        squared_error['IH_no_SN'].append(float((IH_unnormalized - v_pi)**2))

    display((estimate, squared_error))
    print('exp seed:', cfg.initial_seed)
    # pdb.set_trace()

    if RUN_SANITY_CHECK:
        #! Let's run some additional sanity checks
        #* check to see if the bias formula checks out
        v_w = 0
        normalization = 0
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                v_w += w_star[s] * pi[s, a] / mu[s, a] * r * discounted_t
                normalization += discounted_t
                discounted_t *= gamma
        v_w = v_w / normalization

        on_policy_data, frequency, avg_reward = roll_out(env, pi, 4096, horizon)
        # pdb.set_trace()
        empirical_v_pi = np.zeros(n_states)
        empirical_d_pi = np.zeros(n_states)
        empirical_d0 = np.zeros(n_states)
        empirical_r_pi = np.zeros(n_states)
        empirical_frequency = np.zeros(n_states)
        empirical_P = np.zeros((n_states, n_actions, n_states))
        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)

        num_traj = len(on_policy_data)
        for trajectory in on_policy_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_v_pi[s] += r * discounted_t
                empirical_d_pi[s] += discounted_t
                # empirical_d0[s] += 1 - discounted_t
                discounted_t *= gamma
                empirical_r_pi[s] += r
                empirical_frequency[s] += 1
                empirical_P[s, a, sn] += 1
        empirical_v_pi = empirical_v_pi / num_traj
        empirical_d_pi = empirical_d_pi / horizon_normalization / num_traj
        empirical_P = np.nan_to_num(empirical_P /
                                    np.sum(empirical_P, axis=-1)[:, :, None])
        # T = np.nan_to_num(T / np.sum(T, axis=-1)[:, :, None])
        empirical_r_pi = np.nan_to_num(empirical_r_pi / empirical_frequency)
        empirical_P_pi = np.einsum('san,sa->sn', empirical_P, pi)

        empirical_d_mu = np.zeros(n_states)
        num_traj = len(behavior_data)
        for trajectory in behavior_data:
            discounted_t = 1
            for s, a, sn, r in trajectory:
                empirical_d_mu[s] += discounted_t
                discounted_t *= gamma
        empirical_d_mu = empirical_d_mu / horizon_normalization / num_traj

        empirical_w = np.nan_to_num(empirical_d_pi / empirical_d_mu)
        empirical_loss = L(empirical_w, empirical_v_pi)

        empirical_bellman_original = 0
        empirical_bellman_new = 0
        empirical_td_error = 0
        num_traj = len(on_policy_data)
        empirical_r_pi_adjusted = np.dot(
            (np.identity(n_states) -
             np.linalg.matrix_power(gamma * empirical_P_pi, horizon)),
            empirical_r_pi)
        for trajectory in on_policy_data:
            discounted_t = 1.0
            for s, a, sn, r in trajectory:
                empirical_bellman_original += discounted_t * (
                    v_pi_s[s] - empirical_r_pi[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_bellman_new += discounted_t * (
                    v_pi_s[s] - empirical_r_pi_adjusted[s] -
                    gamma * np.dot(empirical_P_pi[s, :], v_pi_s))**2
                empirical_td_error += discounted_t * (
                    v_pi_s[s] - r - gamma * v_pi_s[sn])**2
                discounted_t *= gamma
        empirical_td_error = empirical_td_error / horizon_normalization / num_traj
        empirical_bellman_original = (empirical_bellman_original /
                                      horizon_normalization / num_traj)
        empirical_bellman_new = (empirical_bellman_new /
                                 horizon_normalization / num_traj)
        # empirical_bellman_original = empirical_v_pi - empirical_r_pi - gamma * np.dot(empirical_P_pi, empirical_v_pi)
        # bellman_original = v_pi_s - R_pi - gamma * np.dot(P_pi, v_pi_s)
        # bellman_new = v_pi_s - np.dot((np.identity(n_states) - np.linalg.matrix_power(gamma * P_pi, horizon)), R_pi) - gamma * np.dot(P_pi, v_pi_s)
        pdb.set_trace()

    for objective in cfg.objective:
        estimate[objective] = []
        squared_error[objective] = []
        objective_sn = objective + '-SN'
        estimate[objective_sn] = []
        squared_error[objective_sn] = []

    for i in range(cfg.n_experiments):
        training_set = training_data_processed[i]
        fixed_terminal_value = True
        logging = cfg.logging
        mvm = Tabular_State_MVM_Estimator(training_set, cfg, logging=logging,
                                          ground_truth=ground_truth_info)
        penalty = cfg.penalty_input
        horizon_normalization = (1 - gamma**horizon) / (1 - gamma)
        # penalty_base = 1 / mdp_calculator.horizon_normalization  # / cfg.n_trajectories
        penalty_base = 1 / horizon_normalization
        mvm.set_random_seed(seed_list[i])  # different random seed per experiment
        mvm.solve_closed_form_bias()
        mvm.generate_random_v_class(cfg.v_class_cardinality)
        mvm.generate_random_w_class(cfg.v_class_cardinality)
        # mvm.bias_check()
        for objective in cfg.objective:
            mvm.set_random_seed(seed_list[i])
            # w_estimator = mvm.optimize_finite_class(objective=objective, td_penalty=penalty * penalty_base)
            # w_estimator = mvm.optimize_discrete(objective=objective, td_penalty=penalty * penalty_base)
            w_estimator = mvm.optimize(objective, td_penalty=0.1)
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic()
            # w_estimator, w_estimator_sn = mvm.optimize_optimistic_adam(objective=objective, td_penalty=penalty * penalty_base)
            # w_estimator = mvm.optimize_closed_form()
            estimate[objective].append(float(w_estimator))
            # objective_sn = objective + '-SN'
            # estimate[objective_sn].append(float(w_estimator_sn))
            squared_error[objective].append(float(w_estimator - v_pi)**2)
            # squared_error[objective_sn].append(float(w_estimator_sn - v_pi)**2)
        display((estimate, squared_error))

    display((estimate, squared_error))
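    # A small assumed helper (not in the original file) for summarizing the
    # dictionaries above: mean estimate and mean squared error per method.
    def summarize(estimate, squared_error, v_pi):
        for name in estimate:
            if not estimate[name]:
                continue
            mean_est = sum(estimate[name]) / len(estimate[name])
            mse = sum(squared_error[name]) / len(squared_error[name])
            print('%-12s mean=%.4f  MSE=%.6f  (true v_pi=%.4f)'
                  % (name, mean_est, mse, v_pi))

    # summarize(estimate, squared_error, v_pi)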
# -*- coding: utf-8 -*-
from gridworld import GridWorldEnv
from gym import spaces

env = GridWorldEnv(n_width=12,
                   n_height=4,
                   u_size=60,
                   default_reward=-1,
                   default_type=0,
                   windy=False)
env.action_space = spaces.Discrete(4)  # set the size of the action space
env.start = (0, 0)
env.ends = [(11, 0)]
# set the cliff
for i in range(10):
    env.rewards.append((i + 1, 0, -100))  # special reward for cliff cells
    env.ends.append((i + 1, 0))           # every cliff cell is a terminal state
# make the settings take effect
env.refresh_setting()
# initialize the environment
env.reset()
# show the UI
env.render()
# input("press any key to continue")

for _ in range(20000):
    env.render()
    a = env.action_space.sample()
    state, reward, isdone, info = env.step(a)
    print("{0}, {1}, {2}, {3}".format(a, reward, isdone, info))

env.close()
print("env closed")
import numpy as np
from gridworld import GridWorldEnv

env = GridWorldEnv()


def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.

    Args:
        env: OpenAI environment. env.P represents the transition probabilities
            of the environment.
        theta: Stopping threshold. If the value of every state changes by less
            than theta in one iteration, we are done.
        discount_factor: Gamma, the time discount factor.

    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """

    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value of all actions in a given state.

        Args:
            state: The state to consider (int).
            V: The value to use as an estimator, a vector of length env.nS.

        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
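        # The excerpt ends here. A minimal completion sketch, following the
        # standard value-iteration recipe and assuming env.P[s][a] yields
        # (prob, next_state, reward, done) tuples (an assumption; this part is
        # not in the original excerpt):
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            # greedy backup: V(s) <- max_a sum_s' P(s'|s,a)[r + gamma V(s')]
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        if delta < theta:
            break

    # derive a deterministic policy from the converged value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        best_action = np.argmax(one_step_lookahead(s, V))
        policy[s, best_action] = 1.0
    return policy, V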
def main(cfg):
    initial_seed = cfg.initial_seed
    random.seed(initial_seed)
    np.random.seed(initial_seed)
    gamma = cfg.gamma
    # n_trajectories_list = cfg.n_trajectories
    # for n_trajectories in n_trajectories_list:
    # n_trajectories = cfg.n_trajectories
    horizon = cfg.horizon
    horizon_normalization = ((1 - gamma**horizon) / (1 - gamma)
                             if gamma < 1 else horizon)
    seed_list = [
        initial_seed + np.random.randint(0, 10000) * i
        for i in range(cfg.n_experiments)
    ]  # generate a list of random seeds

    if cfg.env == 'grid_world':
        from gridworld import GridWorldEnv
        env = GridWorldEnv()
    elif cfg.env == 'taxi':
        from taxi import TaxiEnv
        env = TaxiEnv()

    n_states = env.nS
    n_actions = env.nA
    P = env.P_matrix
    R = env.R_matrix.copy()
    d0 = env.isd

    q_star_original = env.value_iteration()
    pi = env.extract_policy(q_star_original, temperature=0.3)
    mu = env.extract_policy(q_star_original, temperature=0.1)
    # pi = env.extract_policy(q_star_original, temperature=0.1)
    # mu = env.extract_policy(q_star_original, temperature=0.3)
    # pi = env.extract_policy(q_star_original, temperature=0.3)
    # mu = env.extract_policy(q_star_original, temperature=0.15)
    # mu = pi.copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy()
    #* 4 swapped cyclic
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,0].copy()
    #* D swapped with R, L swapped with U
    # mu = pi.copy(); mu[:,0] = pi[:,3].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,1].copy(); mu[:,3] = pi[:,0].copy()
    # mu = pi.copy(); mu[:,0] = pi[:,1].copy(); mu[:,1] = pi[:,2].copy(); mu[:,2] = pi[:,3].copy(); mu[:,3] = pi[:,4].copy(); mu[:,4] = pi[:,5].copy(); mu[:,5] = pi[:,0].copy()

    dpi, dpi_t, v_pi_s, q_pi_sa, P_pi = exact_calculation(
        env, pi, cfg.horizon, cfg.gamma)
    dmu, dmu_t, vmu_s, qmu_sa, P_mu = exact_calculation(
        env, mu, cfg.horizon, cfg.gamma)

    w_star = np.nan_to_num(dpi / dmu)
    v_pi = np.sum(d0 * v_pi_s)
    v_mu = np.sum(d0 * vmu_s)
    dpi_H = np.dot(P_pi.T, dpi_t[:, horizon - 1])
    dmu_H = np.dot(P_mu.T, dmu_t[:, horizon - 1])

    ground_truth_info = AttrDict({})
    ground_truth_info.update({
        'd_pi': torch.tensor(dpi, dtype=dtype),
        'd_mu': torch.tensor(dmu, dtype=dtype),
        'v_pi': torch.tensor(v_pi_s, dtype=dtype),
        'q_pi': torch.tensor(q_pi_sa, dtype=dtype),
        'v_star': v_pi
    })
    ground_truth_info.update({'w_pi': w_star})
    ground_truth_info.update({'P': torch.tensor(env.P_matrix, dtype=dtype)})
    ground_truth_info.update({
        'pi': torch.tensor(pi, dtype=dtype),
        'mu': torch.tensor(mu, dtype=dtype)
    })
    true_rho = torch.tensor(pi / mu, dtype=dtype)
    true_rho[true_rho != true_rho] = 0
    true_rho[torch.isinf(true_rho)] = 0
    ground_truth_info.update({'rho': true_rho})
    ground_truth_info.update({'d0': torch.tensor(env.isd, dtype=dtype)})
    ground_truth_info.update({'R': torch.tensor(env.R_matrix, dtype=dtype)})
    ground_truth_info.update({'d_pi_H': torch.tensor(dpi_H, dtype=dtype)})
    ground_truth_info.update({'d_mu_H': torch.tensor(dmu_H, dtype=dtype)})
    ground_truth_info.update({
        'd_pi_t': torch.tensor(dpi_t, dtype=dtype),
        'd_mu_t': torch.tensor(dmu_t, dtype=dtype)
    })

    estimate = {}
    squared_error = {}
    estimate.update({'True pi': [float(v_pi)]})
    squared_error.update({'True pi': [0]})
    estimate.update({'True mu': [float(v_mu)]})
    squared_error.update({'True mu': [float(v_mu - v_pi)**2]})

    results = {}
    results['trajectories'] = []
    results['IS'] = []
    results['IH'] = []
    results['MB'] = []
    results['WIS'] = []
    results['STEP WIS'] = []
    results['STEP IS'] = []
    results['True mu'] = []
    for objective in cfg.objective:
        results[objective] = []

    n_trajectories_list = cfg.n_trajectories
    for n_trajectories in n_trajectories_list:
        print('------------------------')
        #* Generate multiple sets of behavior data from mu
        training_data = []
        training_data_processed = []
        for _ in range(cfg.n_experiments):
            # print('Experiment:', _)
            # print('------------------------')
            np.random.seed(seed_list[_])
            env.seed(seed_list[_])
            behavior_data, _, _ = roll_out(env, mu, n_trajectories, horizon)
            behavior_data_processed = prepare_behavior_data(behavior_data)
            training_data.append(behavior_data)
            training_data_processed.append(behavior_data_processed)

        estimate['IS'], estimate['STEP IS'], estimate['WIS'], estimate[
            'STEP WIS'], estimate['Mu hat'] = [], [], [], [], []
        squared_error['IS'] = []
        squared_error['STEP IS'] = []
        squared_error['WIS'] = []
        squared_error['STEP WIS'] = []
        squared_error['Mu hat'] = []
        estimate['IH_SN'] = []
        squared_error['IH_SN'] = []
        estimate['IH_no_SN'] = []
        squared_error['IH_no_SN'] = []
        estimate['MB'] = []
        squared_error['MB'] = []

        ###* Looping over the number of baseline experiments
        for _ in range(cfg.n_experiments):
            behavior_data = training_data[_]
            behavior_data_processed = training_data_processed[_]

            IS = importance_sampling_estimator(behavior_data, mu, pi, gamma)
            step_IS = importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)
            WIS = weighted_importance_sampling_estimator(
                behavior_data, mu, pi, gamma)
            step_WIS = weighted_importance_sampling_estimator_stepwise(
                behavior_data, mu, pi, gamma)

            estimate['IS'].append(float(IS))
            squared_error['IS'].append(float((IS - v_pi)**2))
            estimate['STEP IS'].append(float(step_IS))
            squared_error['STEP IS'].append(float((step_IS - v_pi)**2))
            estimate['WIS'].append(float(WIS))
            squared_error['WIS'].append(float((WIS - v_pi)**2))
            estimate['STEP WIS'].append(float(step_WIS))
            squared_error['STEP WIS'].append(float((step_WIS - v_pi)**2))

            MB = model_based(n_states, n_actions, behavior_data, pi, gamma)
            estimate['MB'].append(float(MB))
            squared_error['MB'].append(float((MB - v_pi)**2))

            IH, IH_unnormalized = lihong_infinite_horizon(
                n_states, behavior_data, mu, pi, gamma)
            estimate['IH_SN'].append(float(IH))
            squared_error['IH_SN'].append(float((IH - v_pi)**2))
            estimate['IH_no_SN'].append(float(IH_unnormalized))
            squared_error['IH_no_SN'].append(float((IH_unnormalized - v_pi)**2))
            # display((estimate, squared_error))
            # print('exp seed:', cfg.initial_seed)
            # pdb.set_trace()

        results['trajectories'].append(np.log2(n_trajectories))
        results['IH'].append(
            np.log2(sum(squared_error['IH_SN']) / len(squared_error['IH_SN'])
                    / v_pi**2))
        results['MB'].append(
            np.log2(sum(squared_error['MB']) / len(squared_error['IH_SN'])
                    / v_pi**2))
        results['IS'].append(
            np.log2(sum(squared_error['IS']) / len(squared_error['IS'])
                    / v_pi**2))
        results['WIS'].append(
            np.log2(sum(squared_error['WIS']) / len(squared_error['WIS'])
                    / v_pi**2))
        results['STEP WIS'].append(
            np.log2(sum(squared_error['STEP WIS'])
                    / len(squared_error['STEP WIS']) / v_pi**2))
        results['STEP IS'].append(
            np.log2(sum(squared_error['STEP IS'])
                    / len(squared_error['STEP IS']) / v_pi**2))
        results['True mu'].append(
            np.log2(sum(squared_error['True mu'])
                    / len(squared_error['True mu']) / v_pi**2))

        for objective in cfg.objective:
            estimate[objective] = []
            squared_error[objective] = []

        # for i in range(cfg.n_experiments):
        #     training_set = training_data_processed[i]
        #     mvm = Tabular_State_MVM_Estimator(training_set, cfg, ground_truth=ground_truth_info)
        #     for objective in cfg.objective:
        #         mvm.set_random_seed(seed_list[i])
        #         w_estimator = mvm.optimize(objective)
        #         estimate[objective].append(float(w_estimator))
        #         squared_error[objective].append(float(w_estimator - v_pi)**2)
        #     display((estimate, squared_error))
        for i in range(cfg.n_experiments):
            training_set = training_data_processed[i]
            mvm = Tabular_State_MVM_Estimator(training_set, cfg,
                                              ground_truth=ground_truth_info)
            for objective in cfg.objective:
                mvm.set_random_seed(seed_list[i])
                w_estimator = mvm.optimize(objective)
                estimate[objective].append(float(w_estimator))
                squared_error[objective].append(float(w_estimator - v_pi)**2)
            # display((estimate, squared_error))

        for objective in cfg.objective:
            results[objective].append(
                np.log2(sum(squared_error[objective])
                        / len(squared_error[objective]) / v_pi**2))

        display((estimate, squared_error), n_trajectories)
        print('\n')
        print('End of one set of experiments')
        # pdb.set_trace()

    df = pd.DataFrame(results)

    # plt.plot(results['trajectories'], results['IH'], marker='o', markerfacecolor='blue', markersize=12, color='blue', linewidth=4)
    # plt.plot(results['trajectories'], results['MB'], marker='o', markerfacecolor='red', markersize=12, color='red', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP WIS'], marker='o', markerfacecolor='aqua', markersize=12, color='aqua', linewidth=4)
    # plt.plot(results['trajectories'], results['STEP IS'], marker='o', markerfacecolor='orange', markersize=12, color='orange', linewidth=4)
    markersize = 8
    linewidth = 4
    plt.plot('trajectories', 'STEP WIS', data=df, marker='o',
             markerfacecolor='slategrey', markersize=markersize,
             color='slategrey', linewidth=linewidth)
    plt.plot('trajectories', 'STEP IS', data=df, marker='o',
             markerfacecolor='rosybrown', markersize=markersize,
             color='rosybrown', linewidth=linewidth)
    plt.plot('trajectories', 'True mu', data=df, marker='o',
             markerfacecolor='black', markersize=markersize,
             color='black', linewidth=linewidth)
    # plt.plot('trajectories', 'MWL', data=df, marker='o', markerfacecolor='green', markersize=markersize, color='green', linewidth=linewidth)
    # plt.plot('trajectories', 'LSTDQ', data=df, marker='o', markerfacecolor='olive', markersize=markersize, color='olive', linewidth=linewidth)
    plt.plot('trajectories', 'IH', data=df, marker='o',
             markerfacecolor='purple', markersize=markersize,
             color='purple', linewidth=linewidth)
    plt.plot('trajectories', 'MB', data=df, marker='o',
             markerfacecolor='gold', markersize=markersize,
             color='gold', linewidth=linewidth)
    plt.plot('trajectories', 'TD-ball center', data=df, marker='p',
             markerfacecolor='cadetblue', markersize=markersize,
             color='cadetblue', linewidth=linewidth)
    plt.plot('trajectories', 'bias', data=df, marker='s',
             markerfacecolor='skyblue', markersize=markersize,
             color='skyblue', linewidth=linewidth)
    plt.plot('trajectories', 'bias_td', data=df, marker='s',
             markerfacecolor='darkred', markersize=markersize,
             color='darkred', linewidth=linewidth)
    plt.plot('trajectories', 'bias_td_var', data=df, marker='s',
             markerfacecolor='orange', markersize=markersize,
             color='orange', linewidth=linewidth)

    # plt.xticks(cfg.n_trajectories)
    plt.xticks(results['trajectories'])
    plt.xlabel('log number of trajectories')
    plt.ylabel('log MSE')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3,
               prop={'size': 8})
    plt.savefig('pi_03_mu_01_grid_misspecified_w.png')
    pdb.set_trace()
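    # Optional addition (not in the original file): persist the raw results
    # next to the figure so the curves can be re-plotted without re-running
    # the sweep; the CSV filename mirrors the PNG name and is an assumption.
    df.to_csv('pi_03_mu_01_grid_misspecified_w.csv', index=False)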