示例#1
0
def problem4():
    """
    Run the cross-entropy method (CEM) experiment on the cart-pole domain.

    The state is continuous, so a tabular softmax policy cannot be used
    directly; the policy parameter vector has
    ``numActions * (k + 1) ** m`` entries, with ``k`` also handed to
    ``CartPoleEvaluation``.
    NOTE(review): this sizing looks like an order-k basis over an
    m-dimensional state with one weight block per action -- confirm against
    CartPoleEvaluation.

    For each trial the agent is trained for a fixed number of iterations,
    the evaluator is told the trial ended, the agent is reset, and a
    learning-curve plot is written to a file named after the trial index.
    """
    print("Problem 4")

    # Policy-representation sizes.
    state_dim = 4
    action_count = 2
    basis_order = 3

    # Experiment schedule.
    trial_count = 50
    iterations_per_trial = 30

    # Values handed positionally to CEM below.
    population = 10
    elite_count = 5
    episodes_per_eval = 10
    cem_sigma = 0.1
    cem_epsilon = 1.25

    evaluator = CartPoleEvaluation(k=basis_order)
    theta_size = action_count * (basis_order + 1) ** state_dim
    initial_theta = np.zeros(theta_size)
    learner = CEM(initial_theta, cem_sigma, population, elite_count,
                  episodes_per_eval, evaluator, cem_epsilon)

    plot_title = "Learning Curve - Cartpole with CEM Agent"
    plot_file_template = 'learningCurve_cartpole_CEM_{}.png'

    for trial_idx in range(trial_count):
        print("Trial ", trial_idx)

        for iter_idx in range(iterations_per_trial):
            print("Iteration ", iter_idx)
            learner.train()

        # Close out this trial on the evaluator before resetting the agent.
        evaluator.endTrial()
        learner.reset()

        # One plot file per trial; the trial index is part of the file name.
        evaluator.plot(plot_file_template.format(trial_idx), plot_title)
示例#2
0
def problem1():
    """
    Run the cross-entropy method (CEM) experiment on the gridworld domain.

    The policy parameter vector has ``numStates * numActions`` entries
    (25 * 4), i.e. one entry per state-action pair, matching the tabular
    softmax policy the assignment asks for.

    For each trial the agent is trained for a fixed number of iterations,
    the evaluator is told the trial ended, the agent is reset, and a
    learning-curve plot is written to a file named after the trial index.
    """
    print("Problem 1")

    # Tabular policy sizes.
    state_count = 25
    action_count = 4

    # Experiment schedule.
    trial_count = 50
    iterations_per_trial = 150

    # Values handed positionally to CEM below.
    population = 20
    elite_count = 10
    episodes_per_eval = 25
    cem_sigma = 0.25
    cem_epsilon = 1.5

    evaluator = GridworldEvaluation()
    initial_theta = np.zeros(state_count * action_count)
    learner = CEM(initial_theta, cem_sigma, population, elite_count,
                  episodes_per_eval, evaluator, cem_epsilon)

    plot_title = "Learning Curve - Gridworld with CEM Agent"
    plot_file_template = 'learningCurve_gridworld_CEM_{}.png'

    for trial_idx in range(trial_count):
        print("Trial ", trial_idx)

        for iter_idx in range(iterations_per_trial):
            print("Iteration ", iter_idx)
            learner.train()

        # Close out this trial on the evaluator before resetting the agent.
        evaluator.endTrial()
        learner.reset()

        # One plot file per trial; the trial index is part of the file name.
        evaluator.plot(plot_file_template.format(trial_idx), plot_title)
示例#3
0
def problem4():
    """
    Train CEM on the cart-pole domain and plot/save the learning curve.

    Runs ``numTrials`` independent trials of ``numIterations`` CEM updates
    each. After every ``cem.train()`` call, ``evaluate.batchReturn`` is
    copied into that iteration's slice of a
    ``(numTrials, numIterations * numEpisodes * popSize)`` results array.
    The per-episode mean across trials is plotted with standard-deviation
    error bars (``np.std`` over the trial axis), and a horizontal line marks
    the maximum of the mean curve.

    Side effects: prints progress, creates ``data/`` and ``images/`` if they
    do not exist, writes three ``.npy`` files under ``data/`` and one
    ``.png`` under ``images/``, then calls ``plt.show()``.
    """
    # Local import so this block stays self-contained.
    import os

    print("cem-cartpole-softmax_theta_phi")

    # NOTE(review): the result of this single environment step is discarded;
    # it looks like a leftover smoke test -- confirm it can be removed.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    fourier_param = 4

    # presumably 2 actions x fourier_param**4 features over the 4 state
    # dimensions -- verify against EvaluateCartpole.
    theta = np.zeros(2 * fourier_param**4)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    epsilon = 0.005

    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    numTrials = 10
    numIterations = 100

    # Every train() call contributes numEpisodes * popSize episode returns.
    episodes_per_iteration = numEpisodes * popSize
    total_episodes = numIterations * episodes_per_iteration

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            # Progress report every fifth iteration.
            if i % 5 == 0:
                print("cart cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()

            batch_start = i * episodes_per_iteration
            batch_end = (i + 1) * episodes_per_iteration

            # assumes evaluate.batchReturn holds exactly
            # numEpisodes * popSize returns after train() -- TODO confirm;
            # otherwise it will not fit this slice.
            results[trial,
                    batch_start:batch_end] = np.array(evaluate.batchReturn)

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    # Build the shared base name for all output files.
    plt_name = "cem_cartpole"
    param_string = "_numTrials_" + str(numTrials) + "_numIter_" \
        + str(numIterations) + "_popSize_" + str(popSize)
    dt_string = datetime.now().strftime("_t_%H_%M")
    plt_name += param_string
    plt_name += dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    # Persist the numbers before any plotting: plt.show() blocks, and a
    # plotting failure must not cost the training results.
    os.makedirs("data", exist_ok=True)
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

    plt.errorbar(np.arange(maximumEpisodes),
                 average_results,
                 std_results,
                 marker='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    # savefig does not create missing directories.
    os.makedirs("images", exist_ok=True)
    plt.savefig(plt_path, dpi=200)
    plt.show()
示例#4
0
def problem1():
    """
    Train CEM on the gridworld domain and plot/save the learning curve.

    Uses a 100-entry parameter vector for the tabular softmax policy and runs
    ``numTrials`` independent trials of ``numIterations`` CEM updates each.
    After every ``cem.train()`` call, ``evaluate.batchReturn`` is copied into
    that iteration's slice of a
    ``(numTrials, numIterations * numEpisodes * popSize)`` results array.
    The per-episode mean across trials is plotted with standard-deviation
    error bars (``np.std`` over the trial axis), and a horizontal line marks
    the maximum of the mean curve.

    Side effects: prints progress, creates ``data/`` and ``images/`` if they
    do not exist, writes three ``.npy`` files under ``data/`` and one
    ``.png`` under ``images/``, then calls ``plt.show()``.
    """
    # Local import so this block stays self-contained.
    import os

    print("cem-gridworld-tabular_softmax")

    theta = np.zeros(100)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 10
    evaluate = Evaluate()
    epsilon = 5

    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    numTrials = 50
    numIterations = 250

    # Every train() call contributes numEpisodes * popSize episode returns.
    episodes_per_iteration = numEpisodes * popSize
    total_episodes = numIterations * episodes_per_iteration

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            # Progress report every fifth iteration.
            if i % 5 == 0:
                print("cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()

            batch_start = i * episodes_per_iteration
            batch_end = (i + 1) * episodes_per_iteration

            # assumes evaluate.batchReturn holds exactly
            # numEpisodes * popSize returns after train() -- TODO confirm;
            # otherwise it will not fit this slice.
            results[trial,
                    batch_start:batch_end] = np.array(evaluate.batchReturn)

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    # Build the shared base name for all output files.
    plt_name = "cem_gridworld"
    param_string = "_numTrials_" + str(numTrials) + "_numIter_" \
        + str(numIterations) + "_popSize_" + str(popSize)
    dt_string = datetime.now().strftime("_t_%H_%M")
    plt_name += param_string
    plt_name += dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    # Persist the numbers before any plotting: plt.show() blocks, and a
    # plotting failure must not cost the training results.
    os.makedirs("data", exist_ok=True)
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

    # The original passed both fmt='o' and marker='.', which defines the
    # marker twice (matplotlib warns and the keyword wins). fmt='.' gives the
    # same result: '.' markers with no connecting line.
    plt.errorbar(np.arange(maximumEpisodes),
                 average_results,
                 std_results,
                 fmt='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    # savefig does not create missing directories.
    os.makedirs("images", exist_ok=True)
    plt.savefig(plt_path, dpi=200)
    plt.show()