示例#1
0
def cartpole_sampling(theta, cm, K, Ke, N, epsilon):
    """Run one sample / evaluate / refit iteration over CartPole policy parameters.

    Draws ``K`` parameter vectors from a multivariate normal with mean
    ``theta`` and covariance ``cm``, scores each one by averaging the value
    returned by ``CartPoleEpisode.run_all_steps()`` over ``N`` fresh episodes
    (presumably the episode return -- confirm against ``CartPoleEpisode``),
    keeps the ``Ke`` highest-scoring vectors and refits a mean and covariance
    from them.

    Args:
        theta: Mean of the sampling distribution. Every sample is reshaped to
            ``(4, 2)`` and the accumulators are sized 8, so it must hold
            8 values.
        cm: Covariance handed to ``np.random.multivariate_normal``.
        K: Number of parameter vectors to sample.
        Ke: Number of top-scoring vectors kept as the elite set.
        N: Number of episodes averaged per sampled vector.
        epsilon: Weight of the identity matrix added to the covariance sum;
            it is also added to ``Ke`` in the covariance divisor.

    Returns:
        A tuple ``(theta_final, cm_final, J_final)``: the mean of the elite
        vectors, the regularised covariance estimate, and the mean score of
        the elite vectors.

    Note:
        Deviations in the covariance estimate are measured from the incoming
        ``theta``, not from the newly computed elite mean. All sums are
        divided by ``Ke`` even when fewer than ``Ke`` samples exist
        (``Ke > K``).
    """
    candidates = np.random.multivariate_normal(theta, cm, K)

    # Score every candidate; episodes run in sampling order.
    scored = []
    for row in range(K):
        params = candidates[row]
        reward_total = 0
        for _ in range(N):
            env = CartPole()
            env.pi_params = params.reshape(4, 2)
            episode = CartPoleEpisode(env)
            reward_total += episode.run_all_steps()
        scored.append((params, reward_total / N))

    # Best score first; the sort is stable, so ties keep sampling order.
    scored.sort(key=lambda pair: pair[-1], reverse=True)
    elites = scored[:Ke]

    mean_acc = np.zeros(8)
    cov_acc = epsilon * np.identity(8)
    score_acc = 0
    for params, score in elites:
        mean_acc += params
        # Row vector of shape (1, 8); row.T.dot(row) is its outer product.
        offset = np.array([params - theta])
        cov_acc += offset.T.dot(offset)
        score_acc += score

    mean_acc /= Ke
    cov_acc /= (epsilon + Ke)
    score_acc /= Ke
    return mean_acc, cov_acc, score_acc
# NOTE(review): a later ``def cartpole_evaluate`` in this module rebinds the
# name, so this version is not reachable by name once the module has loaded.
def cartpole_evaluate(table, N):
    """Average ``run_all_steps()`` over ``N`` fresh CartPole episodes.

    Args:
        table: Policy parameters assigned unchanged to ``CartPole.pi_params``.
        N: Number of episodes to run.

    Returns:
        The summed episode results divided by ``N``.
    """
    total = 0
    for _ in range(N):
        env = CartPole()
        env.pi_params = table
        episode = CartPoleEpisode(env)
        total += episode.run_all_steps()
    return total / N
def multi_cartpole_episode(table, l):
    """Run one CartPole episode per element of ``l`` and publish each result.

    Every result of ``run_all_steps()`` is handed to ``cp_q.put``. ``cp_q`` is
    a module-level object defined outside this function (presumably a queue
    shared with worker processes or threads -- confirm at its definition).

    Args:
        table: Policy parameters assigned unchanged to ``CartPole.pi_params``.
        l: Iterable whose length fixes the number of episodes; its elements
            are not used.

    Returns:
        Always ``0``.
    """
    for _ in l:
        env = CartPole()
        env.pi_params = table
        outcome = CartPoleEpisode(env).run_all_steps()
        cp_q.put(outcome)
    return 0
def cartpole_evaluate(t, N):
    """Average ``run_all_steps()`` over ``N`` fresh CartPole episodes.

    Args:
        t: Policy parameters; reshaped to ``(4, 2)`` for every episode before
            being assigned to ``CartPole.pi_params``.
        N: Number of episodes to run.

    Returns:
        The summed episode results divided by ``N``.
    """
    total = 0
    for _ in range(N):
        env = CartPole()
        # Reshape per episode (not hoisted) so each environment gets its own
        # reshaped object and nothing is touched when N == 0.
        env.pi_params = t.reshape(4, 2)
        runner = CartPoleEpisode(env)
        total = total + runner.run_all_steps()
    return total / N