def cartpole_sampling(theta, cm, K, Ke, N, epsilon):
    """Run one sample / evaluate / refit step of a cross-entropy-style search.

    ``K`` parameter vectors are drawn from a Gaussian with mean ``theta`` and
    covariance ``cm``. Each one is scored by averaging the result of ``N``
    CartPole episodes, the ``Ke`` highest-scoring vectors are kept, and a new
    mean and covariance are fitted to them.

    Args:
        theta: Current mean parameter vector. Samples are reshaped to 4x2 and
            the accumulators are 8-dimensional, so it must have 8 entries.
        cm: Covariance matrix handed to the Gaussian sampler.
        K: Number of candidate vectors to draw.
        Ke: Number of top-scoring candidates kept as elites.
        N: Number of episodes averaged for each candidate.
        epsilon: Regulariser; ``epsilon * I`` seeds the scatter sum and
            ``epsilon`` is added to its divisor.

    Returns:
        A tuple ``(theta_final, cm_final, J_final)``: the summed elite
        parameters divided by ``Ke``, the regularised scatter matrix divided
        by ``epsilon + Ke``, and the summed elite scores divided by ``Ke``.
    """
    candidates = np.random.multivariate_normal(theta, cm, K)

    def mean_score(params):
        # Average of N episode results for one candidate parameter vector.
        total = 0
        for _ in range(N):
            env = CartPole()
            env.pi_params = params.reshape(4, 2)
            total += CartPoleEpisode(env).run_all_steps()
        return total / N

    scored = [(row, mean_score(row)) for row in candidates]

    # Best score first; the slice yields fewer than Ke entries when K < Ke,
    # but the divisions below still use Ke.
    scored.sort(key=lambda pair: pair[-1], reverse=True)
    elites = scored[:Ke]

    theta_final = np.zeros(8)
    cm_final = epsilon * np.identity(8)
    J_final = 0
    for params, score in elites:
        # Deviations are taken about the incoming mean `theta`, not about
        # the new elite mean.
        offset = (params - theta)[:, np.newaxis]
        theta_final += params
        cm_final += offset.dot(offset.T)
        J_final += score

    theta_final /= Ke
    cm_final /= epsilon + Ke
    J_final /= Ke
    return theta_final, cm_final, J_final
def cartpole_evaluate(table, N):
    """Return the average result of ``N`` CartPole episodes run with ``table``.

    ``table`` is assigned to ``pi_params`` unchanged (no reshaping) on a fresh
    ``CartPole`` for every episode.

    NOTE(review): a later ``cartpole_evaluate(t, N)`` in this file rebinds the
    same name, so this version appears to be shadowed at module level --
    confirm which definition is intended.

    Args:
        table: Policy parameters stored on each environment as ``pi_params``.
        N: Number of episodes to run; ``N == 0`` raises ``ZeroDivisionError``.

    Returns:
        Sum of the per-episode ``run_all_steps()`` results divided by ``N``.
    """
    def play_once():
        # One fresh environment and episode per evaluation.
        env = CartPole()
        env.pi_params = table
        return CartPoleEpisode(env).run_all_steps()

    total = 0
    for _ in range(N):
        total += play_once()
    return total / N
def multi_cartpole_episode(table, l):
    """Run one CartPole episode per element of ``l`` and queue each result.

    Only the number of elements in ``l`` matters; their values are ignored.
    Each episode uses a fresh ``CartPole`` whose ``pi_params`` is ``table``,
    and its ``run_all_steps()`` result is pushed onto the module-level
    ``cp_q`` via ``put``.

    Args:
        table: Policy parameters stored on each environment as ``pi_params``.
        l: Iterable whose length sets how many episodes are run.

    Returns:
        Always ``0``; the episode results are delivered through ``cp_q``.
    """
    for _ in l:
        env = CartPole()
        env.pi_params = table
        episode = CartPoleEpisode(env)
        cp_q.put(episode.run_all_steps())
    return 0
def cartpole_evaluate(t, N):
    """Return the average result of ``N`` CartPole episodes for parameters ``t``.

    For every episode a fresh ``CartPole`` is created and given
    ``t.reshape(4, 2)`` as its ``pi_params``.

    Args:
        t: Parameter array exposing ``reshape``; it must reshape to 4x2.
        N: Number of episodes to run; ``N == 0`` raises ``ZeroDivisionError``.

    Returns:
        ``sum`` of the per-episode ``run_all_steps()`` results divided by ``N``.
    """
    def rollout():
        # Reshape per episode so every environment gets its own 4x2 view.
        env = CartPole()
        env.pi_params = t.reshape(4, 2)
        return CartPoleEpisode(env).run_all_steps()

    episode_rewards = [rollout() for _ in range(N)]
    return sum(episode_rewards) / N