"""Simulation drivers comparing GreedyGQ against its variance-reduced variant
(VR-GreedyGQ), plus PolicyGradient/ActorCritic baselines, on tabular MDPs and
OpenAI Gym's FrozenLake-v0."""

import time

import gym
import numpy as np

# GreedyGQ_Base, VRGreedyGQ, PolicyGradient, ActorCritic, SoftmaxPolicy,
# evaluate_J, evaluate_J_obj, compute_stationary_dist, and get_total_reward are
# assumed to be provided elsewhere in this project; their imports are omitted
# here. Several drivers below share the name `_simulation`; each is
# self-contained and presumably lives in its own experiment script.


def _simulation(tmp_env, ini_theta, alpha, beta, batch_size,
                trajectory_length=50000, gamma=0.95, target=None):
    """Run plain GreedyGQ along a random-behavior FrozenLake trajectory and
    return the learned theta. `batch_size` is unused here; it is kept for a
    uniform signature across the experiment drivers."""
    env = gym.make("FrozenLake-v0")
    env.reset()
    current_state = 0
    gq = GreedyGQ_Base(tmp_env, target_policy=target, eta_theta=alpha,
                       eta_omega=beta, gamma=gamma)
    gq.set_theta(ini_theta)
    for _ in range(trajectory_length):
        # Behavior policy: uniformly random actions.
        random_action = env.action_space.sample()
        new_state, reward, done, info = env.step(random_action)
        next_state = new_state
        action = random_action
        gq.update(current_state, reward, next_state, action)
        if done:
            env.reset()
            current_state = 0
        else:
            # Advance to the next state before the following transition.
            current_state = next_state
    return gq.theta[0]
def _simulation(env, ini_theta, alpha, beta, batch_size,
                trajectory_length=50000, gamma=0.95, target=None):
    """Track per-iteration gradient-variance estimates of GreedyGQ versus
    VR-GreedyGQ along a single behavior trajectory."""
    train_start = time.time()
    env.reset()
    current_state = env.current_state
    gq = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha,
                       eta_omega=beta, gamma=gamma)
    gq.set_theta(ini_theta)
    vrgq = VRGreedyGQ(env, batch_size=batch_size, target_policy=target,
                      eta_theta=alpha, eta_omega=beta, gamma=gamma)
    vrgq.set_theta(ini_theta)
    gq_var = []
    vrgq_var = []
    num_sample_mc = 500  # Monte Carlo samples per variance estimate
    for i in range(trajectory_length):
        next_state, reward, action = env.step()
        gq.update(current_state, reward, next_state, action)
        vrgq.update(current_state, reward, next_state, action)
        # Variance estimation starts once VR-GreedyGQ has seen a full batch.
        if i >= batch_size:
            # VR-GreedyGQ: mean squared deviation of sampled gradients from
            # the reference gradient evaluate_J gives at the current iterate.
            estimated_var = 0.0
            tmp_theta = np.copy(vrgq.theta)
            true_grad = evaluate_J(env, tmp_theta)
            for _ in range(num_sample_mc):
                ss, aa, next_ss, rr = env.sample()
                grad_theta, grad_omega = vrgq.get_grad(ss, rr, next_ss, aa)
                estimated_var += np.sum((grad_theta - true_grad) ** 2) / num_sample_mc
            vrgq_var.append(estimated_var)
            # Same estimate for plain GreedyGQ.
            estimated_var = 0.0
            tmp_theta = np.copy(gq.theta[0])
            true_grad = evaluate_J(env, tmp_theta)
            for _ in range(num_sample_mc):
                ss, aa, next_ss, rr = env.sample()
                grad_theta, grad_omega = gq._extract_grad_info(ss, rr, next_ss, aa)
                estimated_var += np.sum((grad_theta - true_grad) ** 2) / num_sample_mc
            gq_var.append(estimated_var)
        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:", time.time() - train_start)
            train_start = time.time()
        current_state = np.copy(next_state)
    return gq_var, vrgq_var
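# A small visualization sketch for the variance traces returned above. It is
# illustrative only: it assumes matplotlib is available, and `plot_variance`
# is not part of the original project.
def plot_variance(gq_var, vrgq_var):
    import matplotlib.pyplot as plt
    # A log scale makes the variance-reduction gap easier to see.
    plt.plot(gq_var, label="GreedyGQ")
    plt.plot(vrgq_var, label="VR-GreedyGQ")
    plt.yscale("log")
    plt.xlabel("iteration (after the first batch)")
    plt.ylabel("estimated gradient variance")
    plt.legend()
    plt.show()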
def _easy_simulation(env, alpha, beta, batch_size, trajectory_length=50000,
                     num_simulation=100, gamma=0.95, target=None):
    """Repeat the GreedyGQ vs. VR-GreedyGQ comparison `num_simulation` times
    from a shared random initialization; return the per-iteration objective
    histories of both algorithms."""
    ini_start = time.time()
    print("Initialization...")
    ini_theta = np.random.normal(scale=3.0, size=env.num_features)
    print("Initialization Completed. Time Spent:", time.time() - ini_start)
    stationary = compute_stationary_dist(env.trans_kernel)  # currently unused
    all_gq_hist_min = []
    all_vrgq_hist_min = []
    all_gq_hist_last = []
    all_vrgq_hist_last = []
    for sim in range(num_simulation):
        env.reset()
        current_state = env.current_state
        # estimate1/estimate2 are scratch copies used only by the unfinished
        # variance-estimation stub inside the training loop.
        estimate1 = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha,
                                  eta_omega=beta, gamma=gamma)
        estimate1.set_theta(ini_theta)
        estimate2 = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha,
                                  eta_omega=beta, gamma=gamma)
        estimate2.set_theta(ini_theta)
        gq = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha,
                           eta_omega=beta, gamma=gamma)
        gq.set_theta(ini_theta)
        vrgq = VRGreedyGQ(env, batch_size=batch_size, target_policy=target,
                          eta_theta=alpha, eta_omega=beta, gamma=gamma)
        vrgq.set_theta(ini_theta)
        print("Start Training. Simulation:", sim + 1)
        train_start = time.time()
        # Both methods start from the same theta, so gq.theta seeds all lists.
        gq_hist_last = [evaluate_J(env, gq.theta)]
        vrgq_hist_last = [evaluate_J(env, gq.theta)]
        gq_hist_min = [evaluate_J(env, gq.theta)]
        vrgq_hist_min = [evaluate_J(env, gq.theta)]
        for i in range(trajectory_length):
            next_state, reward, action = env.step()
            grad_theta_gq, grad_omega_gq = gq.update(current_state, reward, next_state, action)
            # Running minimum and most recent value of the objective.
            gq_hist_min.append(np.min(gq_hist_min + [evaluate_J(env, gq.theta[0])]))
            gq_hist_last.append(evaluate_J(env, gq.theta[0]))
            grad_theta_vrgq, grad_omega_vrgq = vrgq.update(current_state, reward, next_state, action)
            vrgq_hist_min.append(np.min(vrgq_hist_min + [evaluate_J(env, vrgq.theta)]))
            vrgq_hist_last.append(evaluate_J(env, vrgq.theta))
            # Variance estimation: left as an unfinished stub in this version.
            grad_theta = np.zeros_like(grad_theta_gq)
            grad_omega = np.zeros_like(grad_omega_gq)
            estimate1.set_theta(gq.theta)
            estimate1.set_omega(gq.omega)
            estimate2.set_theta(vrgq.theta)
            estimate2.set_omega(vrgq.omega)
            for sss in env.state_space:
                pass
            current_state = np.copy(next_state)
            if (i + 1) % 10000 == 0:
                print("Current iteration:", i + 1, ". Time Spent:", time.time() - train_start)
                train_start = time.time()
        all_gq_hist_min.append(gq_hist_min)
        all_vrgq_hist_min.append(vrgq_hist_min)
        all_gq_hist_last.append(gq_hist_last)
        all_vrgq_hist_last.append(vrgq_hist_last)
    return all_gq_hist_last, all_vrgq_hist_last
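# Shared helper for the FrozenLake drivers below. The name
# `_rollout_avg_reward` is introduced here for readability; it performs the
# 1000-step policy-evaluation rollout that those drivers otherwise repeat
# inline for each algorithm.
def _rollout_avg_reward(theta, feature_env, n_steps=1000):
    """Estimate the average per-step reward of the softmax policy induced by
    `theta` by rolling it out on a fresh deterministic FrozenLake-v0 instance.
    `feature_env` supplies the feature map that SoftmaxPolicy expects."""
    eval_env = gym.make("FrozenLake-v0", is_slippery=False)
    eval_env.reset()
    state = 0
    policy = SoftmaxPolicy(theta, feature_env, 1.0)
    total = 0.0
    for _ in range(n_steps):
        action = policy.get_action(state)
        new_state, reward, done, info = eval_env.step(action)
        if done:
            eval_env.reset()
            state = 0
        else:
            state = new_state
        total += reward
    return total / n_steps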
def _simulation(env, ini_theta, alpha, beta, batch_size,
                trajectory_length=50000, gamma=0.95, target=None):
    """Compare off-policy PolicyGradient, GreedyGQ, and VR-GreedyGQ by the
    best rollout reward (on deterministic FrozenLake) achieved so far."""
    train_start = time.time()
    env.reset()
    current_state = env.current_state
    pg = PolicyGradient(env, eta_theta=alpha, gamma=gamma, is_on_policy=False)
    pg.set_theta(ini_theta)
    gq = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha,
                       eta_omega=beta, gamma=gamma)
    gq.set_theta(ini_theta)
    vrgq = VRGreedyGQ(env, batch_size=batch_size, target_policy=target,
                      eta_theta=alpha, eta_omega=beta, gamma=gamma)
    vrgq.set_theta(ini_theta)
    # All three methods start from the same theta, so a single initial
    # evaluation seeds all three histories.
    r = _rollout_avg_reward(gq.theta, env)
    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]
    r_pg = [np.copy(r)]
    for i in range(trajectory_length):
        next_state, reward, action = env.step()
        pg.update(current_state, action)
        # Each history records the best rollout reward seen so far.
        if i % 1000 == 0:
            r_pg.append(np.max(r_pg + [_rollout_avg_reward(pg.theta, env)]))
        gq.update(current_state, reward, next_state, action)
        if i % 1000 == 0:
            r_gq.append(np.max(r_gq + [_rollout_avg_reward(gq.theta, env)]))
        vrgq.update(current_state, reward, next_state, action)
        # VR-GreedyGQ is only evaluated after its first reference batch.
        if i > batch_size and i % 1000 == 0:
            r_vrgq.append(np.max(r_vrgq + [_rollout_avg_reward(vrgq.theta, env)]))
        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:", time.time() - train_start)
            train_start = time.time()
        current_state = np.copy(next_state)
    print("Done")
    return r_pg, r_gq, r_vrgq
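# Exact counterpart of `_rollout_avg_reward` for the tabular drivers below
# (the helper name is likewise introduced here): instead of sampling, it
# installs the softmax policy on a copy of the environment and averages the
# reward under the stationary distribution of the induced chain.
def _stationary_avg_reward(env, theta):
    tmp_env = env.get_copy()
    policy = SoftmaxPolicy(theta, tmp_env, 1.0)
    probs = np.zeros((tmp_env.num_state, tmp_env.num_action))
    for s in tmp_env.state_space:
        for a in tmp_env.action_space:
            # policy() returns a scalar or a length-1 array depending on the
            # shape of theta; reduce to a scalar either way.
            probs[s, a] = np.asarray(policy.policy(a, s)).ravel()[0]
    tmp_env.set_behavior_policy(probs)
    stat = compute_stationary_dist(tmp_env.trans_kernel)
    return get_total_reward(reward=tmp_env.reward, stationary_dist=stat)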
def _simulation(env, ini_theta, alpha, beta, batch_size,
                trajectory_length=50000, gamma=0.95, target=None):
    """Compare off-policy ActorCritic, GreedyGQ, and VR-GreedyGQ by the best
    exact (stationary-distribution) average reward achieved so far."""
    train_start = time.time()
    env.reset()
    current_state = env.current_state
    pg = ActorCritic(env, eta_theta=alpha, eta_omega=beta, gamma=gamma,
                     is_on_policy=False)
    pg.set_theta(ini_theta)
    gq = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha,
                       eta_omega=beta, gamma=gamma)
    gq.set_theta(ini_theta)
    vrgq = VRGreedyGQ(env, batch_size=batch_size, target_policy=target,
                      eta_theta=alpha, eta_omega=beta, gamma=gamma)
    vrgq.set_theta(ini_theta)
    # All three methods share the initial theta, so one evaluation seeds all
    # three histories.
    r = _stationary_avg_reward(env, pg.theta)
    r_pg = [np.copy(r)]
    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]
    for i in range(trajectory_length):
        next_state, reward, action = env.step()
        pg.update(current_state, action)
        # Each history records the best exact average reward seen so far.
        if i % 10 == 0:
            r_pg.append(np.max(r_pg + [_stationary_avg_reward(env, pg.theta)]))
        gq.update(current_state, reward, next_state, action)
        if i % 10 == 0:
            r_gq.append(np.max(r_gq + [_stationary_avg_reward(env, gq.theta)]))
        vrgq.update(current_state, reward, next_state, action)
        # VR-GreedyGQ is only scored once its first reference batch is done.
        if i % 10 == 0 and i >= vrgq.batch_size:
            r_vrgq.append(np.max(r_vrgq + [_stationary_avg_reward(env, vrgq.theta)]))
        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:", time.time() - train_start)
            train_start = time.time()
        current_state = np.copy(next_state)
    return r_pg, r_gq, r_vrgq
def _simulation(tmp_env, ini_theta, alpha, beta, batch_size,
                trajectory_length=50000, gamma=0.95, target=None):
    """Train GreedyGQ and VR-GreedyGQ from a random-behavior FrozenLake
    trajectory; track both the best objective value and the best rollout
    reward seen so far."""
    env = gym.make("FrozenLake-v0", is_slippery=False)
    env.reset()
    current_state = 0
    gq = GreedyGQ_Base(tmp_env, target_policy=target, eta_theta=alpha,
                       eta_omega=beta, gamma=gamma)
    gq.set_theta(ini_theta)
    vrgq = VRGreedyGQ(tmp_env, batch_size=batch_size, target_policy=target,
                      eta_theta=alpha, eta_omega=beta, gamma=gamma)
    vrgq.set_theta(ini_theta)
    # Both methods start from the same theta, so gq.theta seeds both lists.
    gq_hist_avg = [evaluate_J_obj(tmp_env, gq.theta)]
    vrgq_hist_avg = [evaluate_J_obj(tmp_env, gq.theta)]
    r = _rollout_avg_reward(gq.theta, tmp_env)
    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]
    for i in range(trajectory_length):
        # Behavior policy: uniformly random actions.
        random_action = env.action_space.sample()
        new_state, reward, done, info = env.step(random_action)
        next_state = new_state
        action = random_action
        gq.update(current_state, reward, next_state, action)
        if i % 1000 == 0:
            # Running minimum of the objective, running maximum of the reward.
            gq_hist_avg.append(np.min(gq_hist_avg + [evaluate_J_obj(tmp_env, gq.theta[0])]))
            r_gq.append(np.max(r_gq + [_rollout_avg_reward(gq.theta, tmp_env)]))
        vrgq.update(current_state, reward, next_state, action)
        # VR-GreedyGQ is only evaluated after its first reference batch.
        if i > batch_size and i % 1000 == 0:
            vrgq_hist_avg.append(np.min(vrgq_hist_avg + [evaluate_J_obj(tmp_env, vrgq.theta)]))
            r_vrgq.append(np.max(r_vrgq + [_rollout_avg_reward(vrgq.theta, tmp_env)]))
        if done:
            env.reset()
            current_state = 0
        else:
            # Advance to the next state before the following transition.
            current_state = next_state
    return gq_hist_avg, vrgq_hist_avg, r_gq, r_vrgq
def _simulation(env, ini_theta, alpha, beta, batch_size,
                trajectory_length=50000, gamma=0.95, target=None):
    """Full comparison of GreedyGQ and VR-GreedyGQ: best objective value,
    best exact average reward, and best evaluate_J_obj value seen so far."""
    train_start = time.time()
    env.reset()
    current_state = env.current_state
    gq = GreedyGQ_Base(env, target_policy=target, eta_theta=alpha,
                       eta_omega=beta, gamma=gamma)
    gq.set_theta(ini_theta)
    vrgq = VRGreedyGQ(env, batch_size=batch_size, target_policy=target,
                      eta_theta=alpha, eta_omega=beta, gamma=gamma)
    vrgq.set_theta(ini_theta)
    # Both methods start from the same theta, so gq.theta seeds every history.
    gq_hist_avg = [evaluate_J(env, gq.theta)]
    vrgq_hist_avg = [evaluate_J(env, gq.theta)]
    r = _stationary_avg_reward(env, gq.theta)
    r_gq = [np.copy(r)]
    r_vrgq = [np.copy(r)]
    ini_obj = evaluate_J_obj(env, gq.theta)
    obj_gq = [np.copy(ini_obj)]
    obj_vrgq = [np.copy(ini_obj)]
    # Track the best objective value (and the theta achieving it) per method.
    best_gq_obj = np.copy(ini_obj)
    gq_best_theta = np.copy(gq.theta)
    best_vrgq_obj = np.copy(ini_obj)
    vrgq_best_theta = np.copy(gq.theta)
    for i in range(trajectory_length):
        next_state, reward, action = env.step()
        gq.update(current_state, reward, next_state, action)
        if i % 10 == 0:
            r_gq.append(np.max(r_gq + [_stationary_avg_reward(env, gq.theta)]))
            gq_hist_avg.append(np.min(gq_hist_avg + [evaluate_J(env, gq.theta[0])]))
            obj_tmp = evaluate_J_obj(env, gq.theta.transpose())[0]
            if obj_tmp < best_gq_obj:
                best_gq_obj = obj_tmp
                gq_best_theta = gq.theta[0]
            if i > 100:
                obj_gq.append(np.min(obj_gq + [np.copy(obj_tmp)]))
        vrgq.update(current_state, reward, next_state, action)
        # VR-GreedyGQ is only evaluated after its first reference batch.
        if i > batch_size and i % 10 == 0:
            r_vrgq.append(np.max(r_vrgq + [_stationary_avg_reward(env, vrgq.theta)]))
            vrgq_hist_avg.append(np.min(vrgq_hist_avg + [evaluate_J(env, vrgq.theta)]))
            obj_tmp = evaluate_J_obj(env, vrgq.theta)
            if obj_tmp < best_vrgq_obj:
                best_vrgq_obj = obj_tmp
                vrgq_best_theta = vrgq.theta
            if i > 100:
                obj_vrgq.append(np.min(obj_vrgq + [np.copy(obj_tmp)]))
        if (i + 1) % 10000 == 0:
            print("Current iteration:", i + 1, ". Time Spent:", time.time() - train_start)
            train_start = time.time()
        current_state = np.copy(next_state)
    return gq_hist_avg, vrgq_hist_avg, r_gq, r_vrgq, obj_gq, obj_vrgq
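# Minimal usage sketch for the driver above. `build_env()` is a hypothetical
# stand-in for however this project constructs its tabular MDP (it must expose
# reset/step/sample, current_state, num_features, trans_kernel, get_copy,
# etc.); the step sizes and batch size below are illustrative, not tuned.
if __name__ == "__main__":
    env = build_env()  # hypothetical constructor, not defined in this file
    ini_theta = np.random.normal(scale=3.0, size=env.num_features)
    results = _simulation(env, ini_theta, alpha=0.05, beta=0.05,
                          batch_size=1000, trajectory_length=20000)
    gq_hist, vrgq_hist, r_gq, r_vrgq, obj_gq, obj_vrgq = results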