Example #1
def problem4(config, iterations: int = 25):
    """
    Repeat the previous question, but using the cross-entropy method on the 
    cart-pole domain. Notice that the state is not discrete, and so you cannot 
    directly apply a tabular softmax policy. It is up to you to create a 
    representation for the policy for this problem. Consider using softmax
    action selection with linear function approximation, as described in the notes.
    Report the same quantities, as well as how you parameterized the policy. 
    
    """
    # config = (basis, sigma, popSize, numElite, numEpisodes, epsilon)
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_cartpole_episode(p, config[0])
            returns.append(r)
            all_returns.append(r)

        return np.mean(returns)

    agent_policy = LinearApproximation(state_dim=4,
                                       num_actions=2,
                                       basis=config[0])
    agent = CEM(agent_policy.parameters,
                sigma=config[1],
                popSize=config[2],
                numElite=config[3],
                numEpisodes=config[4],
                evaluationFunction=evaluate,
                epsilon=config[5])
    for i in range(iterations):
        agent_policy.parameters = agent.train()

    return np.array(all_returns)
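# The `LinearApproximation` policy used above is defined elsewhere. Below is a
# minimal sketch of the interface this snippet assumes, treating `basis` as the
# order of a Fourier cosine basis over a state normalized to [0, 1]^4; the
# class internals here are illustrative assumptions, not the original code.
import itertools
import numpy as np

class LinearApproximation:
    def __init__(self, state_dim: int, num_actions: int, basis: int):
        # All coefficient vectors c in {0, ..., basis}^state_dim.
        self.C = np.array(list(itertools.product(range(basis + 1),
                                                 repeat=state_dim)))
        self.num_actions = num_actions
        self.parameters = np.zeros(num_actions * len(self.C))

    def features(self, state):
        # Fourier cosine features; `state` is assumed normalized to [0, 1].
        return np.cos(np.pi * self.C @ state)

    def action_probabilities(self, state):
        theta = self.parameters.reshape(self.num_actions, -1)
        logits = theta @ self.features(state)
        logits -= logits.max()  # numerical stability
        exp = np.exp(logits)
        return exp / exp.sum()

    def sample_action(self, state):
        return np.random.choice(self.num_actions,
                                p=self.action_probabilities(state))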
Example #2
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the 
    cart-pole domain. Notice that the state is not discrete, and so you cannot 
    directly apply a tabular softmax policy. It is up to you to create a 
    representation for the policy for this problem. Consider using softmax
    action selection with linear function approximation, as described in the notes.
    Report the same quantities, as well as how you parameterized the policy. 
    
    """

    print("Problem 4")

    m = 4
    numActions = 2

    numTrials = 50
    numEps = 10
    numIters = 30
    popSize = 10
    numElite = 5

    epsilon = 1.25
    sigma = 0.1
    k = 3
    policyEval = CartPoleEvaluation(k=k)
    # print ("Size of theta = ", numActions*np.power(k+1, m))
    agent = CEM(np.zeros(numActions * np.power(k + 1, m)), sigma, popSize,
                numElite, numEps, policyEval, epsilon)

    for trial in range(numTrials):
        print("Trial ", trial)
        for it in range(numIters):
            print("Iteration ", it)
            agent.train()
        policyEval.endTrial()
        agent.reset()
        policyEval.plot('learningCurve_cartpole_CEM_{}.png'.format(trial),
                        "Learning Curve - Cartpole with CEM Agent")
Example #3
def problem1():
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular 
    softmax policy. Search the space of hyperparameters for hyperparameters 
    that work well. Report how you searched the hyperparameters, 
    what hyperparameters you found worked best, and present a learning curve
    plot using these hyperparameters, as described in class. This plot may be 
    over any number of episodes, but should show convergence to a nearly 
    optimal policy. The plot should average over at least 500 trials and 
    should include standard error or standard deviation error bars. Say which 
    error bar variant you used. 
    """

    print("Problem 1")

    numStates = 25
    numActions = 4

    numTrials = 50
    numIters = 150
    numEps = 25
    popSize = 20

    numElite = 10
    epsilon = 1.5
    sigma = 0.25
    policyEval = GridworldEvaluation()
    agent = CEM(np.zeros(numStates * numActions), sigma, popSize, numElite,
                numEps, policyEval, epsilon)

    for trial in range(numTrials):
        print("Trial ", trial)
        for it in range(numIters):
            print("Iteration ", it)
            agent.train()
        policyEval.endTrial()
        agent.reset()
        policyEval.plot('learningCurve_gridworld_CEM_{}.png'.format(trial),
                        "Learning Curve - Gridworld with CEM Agent")
Example #4
def problem1(para: dict, iterations: int = 50):
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters,
    what hyperparameters you found worked best, and present a learning curve
    plot using these hyperparameters, as described in class. This plot may be
    over any number of episodes, but should show convergence to a nearly
    optimal policy. The plot should average over at least 500 trials and
    should include standard error or standard deviation error bars. Say which
    error bar variant you used.
    """
    sigma = para['sigma']
    popSize = para['popSize']
    numElite = para['numElite']
    numEpisodes = para['numEpisodes']
    epsilon = para['epsilon']
    mean_return_log = []

    print(
        'sigma:{}\tpopSize:{}\tnumElite:{}\tnumEpisodes:{}\tepsilon:{}'.format(
            sigma, popSize, numElite, numEpisodes, epsilon))

    def evaluate(theta, numEpisodes):
        eva_policy = TabularSoftmax(25, 4)
        eva_policy.parameters = theta
        returns = runEnvironment_gridworld(eva_policy, numEpisodes)
        mean_return = np.mean(returns)
        mean_return_log.append(mean_return)
        # print(mean_return)
        return mean_return

    policy = TabularSoftmax(25, 4)
    agent = CEM(theta=policy.parameters,
                sigma=sigma,
                popSize=popSize,
                numElite=numElite,
                numEpisodes=numEpisodes,
                evaluationFunction=evaluate,
                epsilon=epsilon)
    for i in range(iterations):
        policy.parameters = agent.train()
        print('Iteration {} finished'.format(i))

    return mean_return_log
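# `TabularSoftmax` is defined elsewhere; a minimal sketch of the policy these
# gridworld examples assume (one logit per state-action pair, so 25 * 4 = 100
# parameters), with illustrative method bodies:
import numpy as np

class TabularSoftmax:
    def __init__(self, numStates: int, numActions: int):
        self.numStates = numStates
        self.numActions = numActions
        self.parameters = np.zeros(numStates * numActions)

    def getActionProbabilities(self, state: int):
        logits = self.parameters.reshape(self.numStates, self.numActions)[state]
        exp = np.exp(logits - logits.max())  # numerical stability
        return exp / exp.sum()

    def sampleAction(self, state: int) -> int:
        return np.random.choice(self.numActions,
                                p=self.getActionProbabilities(state))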
Example #5
def problem4(para: dict, iterations: int = 50):
    """
    Repeat the previous question, but using the cross-entropy method on the
    cart-pole domain. Notice that the state is not discrete, and so you cannot
    directly apply a tabular softmax policy. It is up to you to create a
    representation for the policy for this problem. Consider using softmax
    action selection with linear function approximation, as described in the notes.
    Report the same quantities, as well as how you parameterized the policy.

    """
    sigma = para['sigma']
    popSize = para['popSize']
    numElite = para['numElite']
    numEpisodes = para['numEpisodes']
    epsilon = para['epsilon']
    mean_return_log = []

    print(
        'sigma:{}\tpopSize:{}\tnumElite:{}\tnumEpisodes:{}\tepsilon:{}'.format(
            sigma, popSize, numElite, numEpisodes, epsilon))

    def evaluate(theta, numEpisodes):
        eva_policy = LinearSoftmax(4, 2, 2)
        eva_policy.parameters = theta
        returns = runEnvironment_carpole(eva_policy, numEpisodes)
        mean_return = np.mean(returns)
        mean_return_log.append(mean_return)
        # print(mean_return)
        return mean_return

    policy = LinearSoftmax(4, 2, 2)
    agent = CEM(theta=policy.parameters,
                sigma=sigma,
                popSize=popSize,
                numElite=numElite,
                numEpisodes=numEpisodes,
                evaluationFunction=evaluate,
                epsilon=epsilon)
    for i in range(iterations):
        policy.parameters = agent.train()
        print('Iteration {} finished'.format(i))

    return mean_return_log
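# `LinearSoftmax(4, 2, 2)` presumably takes (state dimension, number of
# actions, Fourier order). Since Fourier cosine features expect inputs in
# [0, 1], the unbounded cart-pole state must be normalized first; a sketch of
# one plausible normalization (the bounds below are assumptions, not taken
# from the original code):
import numpy as np

# Assumed ranges for (x, v, theta, omega).
STATE_LOW = np.array([-3.0, -10.0, -np.pi / 12.0, -np.pi])
STATE_HIGH = np.array([3.0, 10.0, np.pi / 12.0, np.pi])

def normalize_state(state):
    # Map a raw cart-pole state into [0, 1]^4 for the Fourier basis.
    return np.clip((state - STATE_LOW) / (STATE_HIGH - STATE_LOW), 0.0, 1.0)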
Example #6
def problem1(config, iterations: int = 200):
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular 
    softmax policy. Search the space of hyperparameters for hyperparameters 
    that work well. Report how you searched the hyperparameters, 
    what hyperparameters you found worked best, and present a learning curve
    plot using these hyperparameters, as described in class. This plot may be 
    over any number of episodes, but should show convergence to a nearly 
    optimal policy. The plot should average over at least 500 trials and 
    should include standard error or standard deviation error bars. Say which 
    error bar variant you used. 
    """
    # config = (sigma, popSize, numElite, numEpisodes, epsilon)
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_gridworld_episode(p)
            returns.append(r)
            all_returns.append(r)

        return np.mean(returns)

    agent_policy = TabularSoftmax(25, 4)
    agent = CEM(agent_policy.parameters,
                sigma=config[0],
                popSize=config[1],
                numElite=config[2],
                numEpisodes=config[3],
                evaluationFunction=evaluate,
                epsilon=config[4])
    for i in range(iterations):
        agent_policy.parameters = agent.train()
    return np.array(all_returns)
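# From the indexing above, `config` is (sigma, popSize, numElite, numEpisodes,
# epsilon). A hypothetical driver that averages this function over many trials
# (values below are illustrative, borrowed from Example #3):
import numpy as np

config = (0.25, 20, 10, 25, 1.5)
trials = [problem1(config, iterations=200) for _ in range(50)]
mean_curve = np.mean(trials, axis=0)
std_err = np.std(trials, axis=0) / np.sqrt(len(trials))  # standard error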
Example #7
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the 
    cart-pole domain. Notice that the state is not discrete, and so you cannot 
    directly apply a tabular softmax policy. It is up to you to create a 
    representation for the policy for this problem. Consider using softmax
    action selection with linear function approximation, as described in the notes.
    Report the same quantities, as well as how you parameterized the policy. 
    
    """

    popSize = 10
    numElite = 5
    epsilon = 4.0
    sigma = 1.0
    numEpisodes = 20
    numTrials = 5
    numIterations = 40
    k = 2

    returns = np.zeros((numTrials, numEpisodes * numIterations * popSize))
    
    for trial in range(numTrials):

        np.random.seed(np.random.randint(10000))  # re-seed each trial

        cartpole = Cartpole()

        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])

        count = 0

        def evaluateFunction(theta, numEpisodes):
            nonlocal count

            expected_reward = 0

            numTimeSteps = 1000
            tabular_softmax.parameters = theta

            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                cartpole.reset()
                count += 1

            return expected_reward / numEpisodes

        agent = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluateFunction, epsilon)

        for iteration in range(numIterations):
            print("Trial: %d" % trial)
            print("Iteration: %d" % iteration)
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * popSize : count])
        print(p)
            
    plot(returns, 'Cartpole domain Cross Entropy Method (standard deviation error bars) - 5 trials', 1000)
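# The `plot` helper above is defined elsewhere. A sketch of what it might do,
# assuming the third positional argument is the spacing (in episodes) between
# error bars; both the signature and that interpretation are assumptions:
import numpy as np
import matplotlib.pyplot as plt

def plot(returns, title, spacing=1000):
    mean = returns.mean(axis=0)  # average over trials
    std = returns.std(axis=0)    # standard deviation across trials
    x = np.arange(mean.shape[0])
    plt.plot(x, mean)
    plt.errorbar(x[::spacing], mean[::spacing], yerr=std[::spacing],
                 fmt='.', ecolor='aqua')
    plt.title(title)
    plt.xlabel('Episode')
    plt.ylabel('Return')
    plt.show()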
Example #8
                returns[trial][count] = G
                gridworld.reset()
                count += 1

            return expected_reward / numEpisodes


        agent = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluateFunction, epsilon)

        

        for iteration in range(numIterations):
        
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            l = [[0 for i in range(5)] for j in range(5)]
            for i in range(25):
                k = tabular_softmax.getActionProbabilities(i)
                print(k)
                r = np.argmax(k)
                if r == 0:
                    l[i // 5][i % 5] = '↑'
                elif r == 1:
                    l[i // 5][i % 5] = '↓'
                elif r == 2:
                    l[i // 5][i % 5] = '←'
                elif r == 3:
                    l[i // 5][i % 5] = '→'

            for i in range(5):
                print(l[i])
Example #9
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the 
    cart-pole domain. Notice that the state is not discrete, and so you cannot 
    directly apply a tabular softmax policy. It is up to you to create a 
    representation for the policy for this problem. Consider using softmax
    action selection with linear function approximation, as described in the notes.
    Report the same quantities, as well as how you parameterized the policy. 
    
    """

    print("cem-cartpole-softmax_theta_phi")

    # Leftover sanity check that the environment dynamics run.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    fourier_param = 4

    # 2 actions x 4^4 Fourier features = 512 parameters.
    theta = np.zeros(2 * fourier_param**4)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    #    numEpisodes = 20
    evaluate = EvaluateCartpole()
    epsilon = 0.005

    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    #    numTrials = 50
    numTrials = 10
    #    numIterations = 250
    numIterations = 100

    total_episodes = numIterations * numEpisodes * popSize  # 100 * 5 * 10 = 5000

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            # Progress logging every 5 iterations.
            if i % 5 == 0:
                print("cart cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()

            batch_start = (i * numEpisodes) * popSize
            batch_end = ((i + 1) * numEpisodes) * popSize

            results[trial,
                    batch_start:batch_end] = np.array(evaluate.batchReturn)

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.arange(maximumEpisodes),
                 average_results,
                 std_results,
                 marker='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    plt_name = "cem_cartpole"

    now = datetime.now()
    param_string = "_numTrials_"+str(numTrials)+"_numIter_" \
        + str(numIterations) + "_popSize_" +str(popSize)
    dt_string = now.strftime("_t_%H_%M")

    plt_name += param_string
    plt_name += dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    #    plot_min = -100
    #    plot_max = 10
    #    plt.ylim(average_results.min(), average_results.max())
    #    plt.ylim(plot_min, plot_max)

    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)
Example #10
def problem1():
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular 
    softmax policy. Search the space of hyperparameters for hyperparameters 
    that work well. Report how you searched the hyperparameters, 
    what hyperparameters you found worked best, and present a learning curve
    plot using these hyperparameters, as described in class. This plot may be 
    over any number of episodes, but should show convergence to a nearly 
    optimal policy. The plot should average over at least 500 trials and 
    should include standard error or standard deviation error bars. Say which 
    error bar variant you used. 
    """
    print("cem-gridworld-tabular_softmax")

    theta = np.zeros(100)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 10
    evaluate = Evaluate()
    epsilon = 5

    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    numTrials = 50
    numIterations = 250
    #    numIterations = 50
    #    numIterations = 20
    total_episodes = numIterations * numEpisodes * popSize  # 250 * 10 * 10 = 25000

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        cem.reset()
        for i in range(numIterations):
            # Progress logging every 5 iterations.
            if i % 5 == 0:
                print("cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()

            batch_start = (i * numEpisodes) * popSize
            batch_end = ((i + 1) * numEpisodes) * popSize

            results[trial,
                    batch_start:batch_end] = np.array(evaluate.batchReturn)

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.arange(maximumEpisodes),
                 average_results,
                 std_results,
                 fmt='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    plt_name = "cem_gridworld"

    now = datetime.now()
    param_string = "_numTrials_"+str(numTrials)+"_numIter_" \
        + str(numIterations) + "_popSize_" +str(popSize)
    dt_string = now.strftime("_t_%H_%M")

    plt_name += param_string
    plt_name += dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    #    plot_min = -100
    #    plot_max = 10
    #    plt.ylim(average_results.min(), average_results.max())
    #    plt.ylim(plot_min, plot_max)

    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)