def __call__(self, parameters: np.ndarray, numEpisodes: int):
    G = []
    policy = SoftmaxWithLFA(4, 2, self.k)
    policy.parameters = parameters
    env = Cartpole()
    for ep in range(numEpisodes):
        env.reset()
        Gi = 0
        while not env.isEnd:
            state = env.state
            action = policy.samplAction(state)
            next_state, reward, _ = env.step(action)
            Gi += reward
        G.append(Gi)
        # Append to the shared list under the lock so that concurrent
        # evaluations do not interleave their writes.
        with self.lock:
            self.curTrialReturns.append(Gi)
    print("Mean Return ", np.mean(G))
    return np.mean(G)
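# SoftmaxWithLFA is used above but not defined in this section. Below is a
# minimal sketch of what such a policy might look like, assuming a coupled
# Fourier basis of order k over a 4-dimensional state normalized to [0, 1];
# the constructor signature and the parameters/samplAction interface come
# from the snippet above, everything else is an assumption.
import itertools
import numpy as np

class SoftmaxWithLFASketch:
    def __init__(self, numStates: int, numActions: int, k: int):
        self.numActions = numActions
        # All integer coefficient vectors c in {0, ..., k}^numStates.
        self.C = np.array(list(itertools.product(range(k + 1), repeat=numStates)))
        self.parameters = np.zeros(numActions * len(self.C))

    def phi(self, state: np.ndarray) -> np.ndarray:
        # Fourier features cos(pi * c . s); assumes s is normalized to [0, 1].
        return np.cos(np.pi * self.C @ state)

    def samplAction(self, state: np.ndarray) -> int:
        theta = self.parameters.reshape(self.numActions, -1)
        prefs = theta @ self.phi(state)
        prefs -= prefs.max()  # subtract max for numerical stability
        probs = np.exp(prefs) / np.exp(prefs).sum()
        return np.random.choice(self.numActions, p=probs)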
def runEpisode(self, policy):
    # Roll out one episode in a fresh environment and return its
    # (undiscounted) return.
    env = Cartpole()
    Gi = 0
    while not env.isEnd:
        state = env.state
        action = policy.samplAction(state)
        next_state, reward, _ = env.step(action)
        Gi += reward
    return Gi
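# Episodes are independent, so this evaluation parallelizes naturally. A
# minimal sketch with multiprocessing.Pool follows; it assumes the policy
# object is picklable and that runEpisode can be resolved at module level
# (bound methods of objects holding locks generally cannot be pickled).
from multiprocessing import Pool
import numpy as np

def evaluateParallel(runEpisode, policy, numEpisodes: int) -> float:
    with Pool() as pool:
        # One rollout per episode, distributed across worker processes.
        G = pool.map(runEpisode, [policy] * numEpisodes)
    return float(np.mean(G))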
class GenEpCartpole:
    def __init__(self, numStates: int, numActions: int, k: int):
        self._numStates = numStates
        self._numActions = numActions
        self._k = k
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(numStates, numActions, k)

    def __call__(self, theta: np.ndarray, numEpisodes: int):
        # Generate a batch of episodes D; each history H stores the feature
        # vectors, actions, and rewards of one episode.
        self.policy.parameters = theta
        D = []
        for episode in range(numEpisodes):
            self.environment.reset()
            H = {}
            S, A, R = [], [], []
            while not self.environment.isEnd:
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                S.append(self.policy.phiS(state))
                A.append(action)
                R.append(reward)
            H['S'] = np.array(S)
            H['A'] = np.array(A)
            H['R'] = np.array(R)
            D.append(H)
        return D

    def reset(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(self._numStates, self._numActions, self._k)
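# Example use of the histories returned by GenEpCartpole: the per-episode
# undiscounted return is just the sum of each H['R']. The generator instance
# and the zero parameter vector below are hypothetical placeholders.
import numpy as np

gen = GenEpCartpole(4, 2, k=2)
D = gen(np.zeros(gen.policy.parameters.shape), numEpisodes=10)
episodeReturns = [H['R'].sum() for H in D]
print("mean return:", np.mean(episodeReturns))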
def run_cartpole_episode(p, basis):
    # Roll out one episode with a linear softmax policy built on the given
    # basis and return the discounted episode return.
    environment = Cartpole()
    policy = LinearApproximation(state_dim=4, num_actions=2, basis=basis)
    policy.parameters = p
    is_end = False
    discounted_return = 0
    t = 0
    while not is_end:
        action = policy.samplAction(environment.state)
        new_state, reward, is_end = environment.step(action)
        discounted_return += (environment.gamma ** t) * reward
        t += 1
    environment.reset()
    return discounted_return
def runHistory(getAction, numeps=10000):
    # Collect raw (state, action, reward, state, ...) histories for numeps
    # episodes under the policy induced by getAction.
    histories = []
    cartPole = Cartpole()
    for ep in range(numeps):
        history = []
        cartPole.reset()
        history.append(cartPole.state)
        while not cartPole.isEnd:
            action = getAction(cartPole.state)
            s, r, e = cartPole.step(action)
            history.append(action)
            history.append(r)
            history.append(s)
        histories.append(history)
    return histories
def runEnvironment_carpole(policy, numeps=10000):
    # Estimate per-episode discounted returns of a policy over numeps
    # episodes. Episodes are cut off after 200 steps and assigned a
    # penalty return of -50.
    returns = np.zeros(numeps)
    env = Cartpole()
    for ep in range(numeps):
        env.reset()
        step = 0
        g = 0
        while not env.isEnd:
            action = policy.samplAction(env.state)
            s, r, e = env.step(action)
            g += (env.gamma ** step) * r
            step += 1
            if step > 200:
                g = -50
                break
        returns[ep] = g
    return returns
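# Sketch: aggregate the Monte Carlo returns from runEnvironment_carpole into
# a point estimate with a standard-error bar. The policy object is assumed
# to expose samplAction as in the snippets above; summarizeReturns itself is
# hypothetical.
import numpy as np

def summarizeReturns(policy, numeps=1000):
    returns = runEnvironment_carpole(policy, numeps)
    mean = returns.mean()
    stderr = returns.std(ddof=1) / np.sqrt(numeps)
    print(f"J(pi) ~= {mean:.2f} +/- {stderr:.2f} (1 s.e.)")
    return mean, stderr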
class EvaluateCartpole:
    def __init__(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(4, 2)
        self._G = []

    @property
    def batchReturn(self) -> list:
        # All episode returns accumulated since the last reset.
        return self._G

    def __call__(self, theta: np.ndarray, numEpisodes: int):
        self.policy.parameters = theta
        for episode in range(numEpisodes):
            self.environment.reset()
            G_episode = 0
            while not self.environment.isEnd:
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                G_episode += reward
            self._G.append(G_episode)
        # Average only over this call's episodes; self._G keeps the full
        # running history for batchReturn.
        return np.mean(self._G[-numEpisodes:])

    def reset(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(4, 2)
        self._G = []
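# Example use of EvaluateCartpole as a black-box objective: evaluate one
# parameter vector and read back the per-episode returns. The zero vector is
# only a placeholder; theta's size must match the SoftmaxThetaPhi(4, 2)
# parameterization, which is defined elsewhere.
import numpy as np

evaluate = EvaluateCartpole()
meanReturn = evaluate(np.zeros(evaluate.policy.parameters.shape), numEpisodes=5)
print(meanReturn, evaluate.batchReturn)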
def problem6():
    """
    Repeat the previous question, but using the GA (as described earlier in
    this homework) on the cart-pole domain. Report the same quantities and
    how the policy was parameterized.
    """
    populationSize = 20
    numElite = 5
    numEpisodes = 5
    numTrials = 50
    numIterations = 20
    Kp = 10
    alpha = 2.5
    k = 2

    returns = np.zeros((numTrials, numEpisodes * numIterations * populationSize))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        count = 0

        def evaluateFunction(theta, numEpisodes):
            # Estimate J(theta) as the average discounted return over
            # numEpisodes episodes; each episode return is also logged
            # into the returns table for plotting.
            nonlocal count
            expected_reward = 0
            numTimeSteps = 1000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                cartpole.reset()
                count += 1
            return expected_reward / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            return np.random.randn(populationSize, tabular_softmax.parameters.shape[0])

        agent = GA(populationSize, evaluateFunction, initPopulation,
                   numElite, numEpisodes, Kp, alpha)
        for iteration in range(numIterations):
            print("Trial: %d" % (trial,))
            print("Iteration: %d" % (iteration,))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * populationSize : count])
        print(p)
    plot(returns, 'Cartpole domain Genetic Algorithm (standard deviation error bars) - 50 trials', 1000)
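# GA is defined elsewhere in the homework; only its call sites appear in this
# section. The following is a minimal sketch consistent with the constructor
# used above (populationSize, evaluateFunction, initPopulation, numElite,
# numEpisodes, Kp, alpha). The truncation selection plus Gaussian mutation
# scheme is an assumption, not the homework's definitive implementation.
import numpy as np

class GASketch:
    def __init__(self, populationSize, evaluate, initPopulation,
                 numElite, numEpisodes, Kp=10, alpha=2.5):
        self.evaluate = evaluate
        self.numElite = numElite
        self.numEpisodes = numEpisodes
        self.Kp = Kp        # number of parents used to produce children
        self.alpha = alpha  # mutation scale
        self.population = initPopulation(populationSize)

    def train(self):
        # Rank the population by estimated return.
        fitness = np.array([self.evaluate(p, self.numEpisodes) for p in self.population])
        order = np.argsort(fitness)[::-1]
        elites = self.population[order[:self.numElite]]
        parents = self.population[order[:self.Kp]]
        # Children are Gaussian mutations of randomly chosen parents;
        # elites survive unchanged.
        numChildren = len(self.population) - self.numElite
        idx = np.random.randint(self.Kp, size=numChildren)
        children = parents[idx] + self.alpha * np.random.randn(numChildren, self.population.shape[1])
        self.population = np.vstack([elites, children])
        return elites[0]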
def problem5():
    """
    Repeat the previous question, but using first-choice hill-climbing (as
    described in class) on the cart-pole domain. Report the same quantities
    and how the policy was parameterized.
    """
    sigma = 1.0
    numEpisodes = 150
    numTrials = 50
    numIterations = 75
    k = 2

    returns = np.zeros((numTrials, numEpisodes * numIterations))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        # count == -1 is a sentinel: episodes run during agent construction
        # (its initial evaluation of theta) are not logged.
        count = -1

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
            expected_reward = 0
            numTimeSteps = 1000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                if count != -1:
                    returns[trial][count] = G
                    count += 1
                cartpole.reset()
            return expected_reward / numEpisodes

        agent = FCHC(theta, sigma, evaluateFunction, numEpisodes)
        count = 0
        for iteration in range(numIterations):
            print("Trial: %d" % (trial,))
            print("Iteration: %d" % (iteration,))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes : count])
            print(np.mean(returns[trial][iteration * numEpisodes : count]))
        print(p)
    plot(returns, 'Cartpole domain First Choice Hill Climbing (standard deviation error bars) - 50 trials', 1000)
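# FCHC is defined elsewhere; a minimal sketch matching the constructor used
# above follows. First-choice hill climbing samples a candidate from an
# isotropic Gaussian around the current best and accepts it only if its
# estimated return improves. The evaluation in __init__ mirrors the
# count == -1 sentinel above (the constructor's episodes are unlogged).
import numpy as np

class FCHCSketch:
    def __init__(self, theta, sigma, evaluate, numEpisodes):
        self.theta = theta
        self.sigma = sigma
        self.evaluate = evaluate
        self.numEpisodes = numEpisodes
        self.bestJ = evaluate(theta, numEpisodes)  # initial evaluation

    def train(self):
        candidate = self.theta + self.sigma * np.random.randn(*self.theta.shape)
        J = self.evaluate(candidate, self.numEpisodes)
        if J > self.bestJ:
            self.theta, self.bestJ = candidate, J
        return self.theta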
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the
    cart-pole domain. Notice that the state is not discrete, and so you
    cannot directly apply a tabular softmax policy. It is up to you to
    create a representation for the policy for this problem. Consider using
    the softmax action selection with linear function approximation as
    described in the notes. Report the same quantities, as well as how you
    parameterized the policy.
    """
    popSize = 10
    numElite = 5
    epsilon = 4.0
    sigma = 1.0
    numEpisodes = 20
    numTrials = 5
    numIterations = 40
    k = 2

    returns = np.zeros((numTrials, numEpisodes * numIterations * popSize))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = 0

        def evaluateFunction(theta, numEpisodes):
            # Average discounted return of theta; each episode return is
            # also logged into the returns table for plotting.
            nonlocal count
            expected_reward = 0
            numTimeSteps = 1000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                cartpole.reset()
                count += 1
            return expected_reward / numEpisodes

        agent = CEM(theta, sigma, popSize, numElite, numEpisodes,
                    evaluateFunction, epsilon)
        for iteration in range(numIterations):
            print("Trial: %d" % (trial,))
            print("Iteration: %d" % (iteration,))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * popSize : count])
        print(p)
    plot(returns, 'Cartpole domain Cross Entropy Method (standard deviation error bars) - 5 trials', 1000)
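# CEM is defined elsewhere; a minimal sketch matching the constructor used
# above follows. The diagonal initialization and the epsilon-regularized
# covariance refit are assumptions consistent with the course notes, not the
# homework's definitive implementation.
import numpy as np

class CEMSketch:
    def __init__(self, theta, sigma, popSize, numElite, numEpisodes,
                 evaluate, epsilon):
        self.theta = theta
        self.Sigma = sigma * np.eye(theta.size)
        self.popSize = popSize
        self.numElite = numElite
        self.numEpisodes = numEpisodes
        self.evaluate = evaluate
        self.epsilon = epsilon

    def train(self):
        # Sample candidates around the current mean and rank them by return.
        candidates = np.random.multivariate_normal(self.theta, self.Sigma, self.popSize)
        J = np.array([self.evaluate(c, self.numEpisodes) for c in candidates])
        elite = candidates[np.argsort(J)[::-1][:self.numElite]]
        # Refit the mean and the (regularized) covariance to the elite set.
        self.theta = elite.mean(axis=0)
        diff = elite - self.theta
        self.Sigma = (self.epsilon * np.eye(self.theta.size) + diff.T @ diff) \
            / (self.epsilon + self.numElite)
        return self.theta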
def problem6():
    """
    Repeat the previous question, but using the GA (as described earlier in
    this homework) on the cart-pole domain. Report the same quantities and
    how the policy was parameterized.
    """
    print("ga-cartpole-softmax_theta_phi")

    # Warm-up call on the dynamics.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    fourier_param = 4

    def initPopFn(pop_size):
        # One parameter row per population member: 2 actions times
        # fourier_param**4 Fourier features of the 4-dimensional state.
        return np.zeros((pop_size, 2 * fourier_param**4))

    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    ga = GA(popSize, evaluate, initPopFn, numElite, numEpisodes)

    numTrials = 10
    numIterations = 100
    total_episodes = numIterations * numEpisodes * popSize
    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        ga.reset()
        evaluate.reset()  # clear the accumulated returns for the new trial
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart ga: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            ga.train()
            batch_start = i * numEpisodes * popSize
            batch_end = (i + 1) * numEpisodes * popSize
            # batchReturn accumulates across iterations, so keep only the
            # slice produced by this iteration's batch.
            results[trial, batch_start:batch_end] = \
                np.array(evaluate.batchReturn[batch_start:batch_end])

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)
    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 marker='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)),
             fontsize=15, backgroundcolor='w')

    plt_name = "ga_cartpole"
    now = datetime.now()
    param_string = "_numTrials_" + str(numTrials) + "_numIter_" \
        + str(numIterations) + "_popSize_" + str(popSize)
    dt_string = now.strftime("_t_%H_%M")
    plt_name += param_string + dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the
    cart-pole domain. Notice that the state is not discrete, and so you
    cannot directly apply a tabular softmax policy. It is up to you to
    create a representation for the policy for this problem. Consider using
    the softmax action selection with linear function approximation as
    described in the notes. Report the same quantities, as well as how you
    parameterized the policy.
    """
    print("cem-cartpole-softmax_theta_phi")

    # Warm-up call on the dynamics.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    fourier_param = 4
    theta = np.zeros(2 * fourier_param**4)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    epsilon = 0.005
    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    numTrials = 10
    numIterations = 100
    total_episodes = numIterations * numEpisodes * popSize
    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        cem.reset()
        evaluate.reset()  # clear the accumulated returns for the new trial
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()
            batch_start = i * numEpisodes * popSize
            batch_end = (i + 1) * numEpisodes * popSize
            # batchReturn accumulates across iterations, so keep only the
            # slice produced by this iteration's batch.
            results[trial, batch_start:batch_end] = \
                np.array(evaluate.batchReturn[batch_start:batch_end])

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)
    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 marker='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)),
             fontsize=15, backgroundcolor='w')

    plt_name = "cem_cartpole"
    now = datetime.now()
    param_string = "_numTrials_" + str(numTrials) + "_numIter_" \
        + str(numIterations) + "_popSize_" + str(popSize)
    dt_string = now.strftime("_t_%H_%M")
    plt_name += param_string + dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)
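# Why theta above has 2 * fourier_param**4 entries: the policy is a softmax
# over 2 actions, each scored by a linear function of fourier_param**4
# coupled Fourier features of the 4-dimensional cart-pole state. A sketch of
# that feature count, assuming one cosine feature per coefficient vector
# with entries in {0, ..., fourier_param - 1}; the helper below is
# hypothetical.
import itertools

def numFourierFeatures(order: int, stateDims: int = 4) -> int:
    # One feature cos(pi * c . s) per coefficient vector c.
    return len(list(itertools.product(range(order), repeat=stateDims)))

assert numFourierFeatures(4) == 4**4  # 256 features per action, 512 parameters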