Python GA.train 예제들, rl687.agents.ga.GA.train Python 예제들

예제 #1

0

파일 보기

파일: homework2.py 프로젝트: subendhu19/rl-hw2

def problem3(config, iterations: int = 200):
    """
    Train the GA agent on the More-Watery 687-Gridworld and log every return.

    ``config`` is indexed positionally and forwarded to ``GA``:
    ``config[0]`` -> populationSize, ``config[1]`` -> numElite,
    ``config[2]`` -> numEpisodes, ``config[3]`` -> alpha,
    ``config[4]`` -> parent_frac.

    ``agent.train()`` is called ``iterations`` times.  The function returns a
    NumPy array holding the return of every episode run by the evaluation
    callback, in the order the episodes were executed.
    """
    episode_log = []

    def evaluate(p, episodes):
        # Roll out `episodes` gridworld episodes for `p`, remember each
        # individual return, and hand the batch mean back to the GA.
        batch = [run_gridworld_episode(p) for _ in range(episodes)]
        episode_log.extend(batch)
        return np.mean(batch)

    agent_policy = TabularSoftmax(25, 4)
    agent = GA(
        populationSize=config[0],
        evaluationFunction=evaluate,
        initPopulationFunction=init_gridworld_population,
        numElite=config[1],
        numEpisodes=config[2],
        alpha=config[3],
        parent_frac=config[4],
    )

    for _ in range(iterations):
        # Whatever train() returns becomes the current policy parameters.
        agent_policy.parameters = agent.train()

    return np.array(episode_log)

예제 #2

0

파일 보기

def problem6():
    """
    Run the GA agent on the cart-pole domain and plot one curve per trial.

    A single ``GA`` agent is trained for 20 generations in each of 50 trials.
    At the end of every trial the evaluator's ``endTrial()`` is called, the
    agent is reset, and the evaluator's ``plot`` is asked to write
    ``learningCurve_cartpole_GA_<trial>.png``.

    The initialiser is sized as ``numActions * (k + 1) ** m`` with
    ``numActions = 2``, ``k = 3`` and ``m = 4``.
    """
    print("Problem 6")

    # Environment dimensions.
    state_dim = 4
    action_count = 2

    # Search schedule.
    trial_count, generation_count = 50, 20
    population_size = 20

    # Keyword settings forwarded unchanged to GA.
    ga_settings = dict(numElite=10, numTruncate=5, alpha=0.1, numEpisodes=10)

    # `basis_k` is handed to the evaluator and also determines the
    # parameter-vector length requested from GAInit.
    basis_k = 3
    evaluator = CartPoleEvaluation(k=basis_k)
    initializer = GAInit(action_count * np.power(basis_k + 1, state_dim))

    agent = GA(population_size, evaluator, initializer, **ga_settings)

    for trial in range(trial_count):
        print("Trial ", trial)
        for generation in range(generation_count):
            print("Generation ", generation)
            agent.train()

        # Close out the trial before resetting the agent for the next one.
        evaluator.endTrial()
        agent.reset()
        evaluator.plot(
            'learningCurve_cartpole_GA_{}.png'.format(trial),
            "Learning Curve - Cartpole with GA Agent",
        )

예제 #3

0

파일 보기

def problem3():
    """
    Run the GA agent on the More-Watery 687-Gridworld, one plot per trial.

    A single ``GA`` agent is trained for 100 generations in each of 50
    trials.  At the end of every trial the evaluator's ``endTrial()`` is
    called, the agent is reset, and the evaluator's ``plot`` is asked to
    write ``learningCurve_gridworld_GA_<trial>.png``.

    The initialiser is sized as ``num_states * num_actions`` (25 * 4).
    """
    print("Problem 3")

    # Environment dimensions.
    state_count, action_count = 25, 4

    # Search schedule.
    trial_count, generation_count = 50, 100
    population_size = 30

    # Keyword settings forwarded unchanged to GA.
    ga_settings = dict(numElite=20, numTruncate=5, alpha=1.25, numEpisodes=20)

    evaluator = GridworldEvaluation()
    initializer = GAInit(state_count * action_count)

    agent = GA(population_size, evaluator, initializer, **ga_settings)

    for trial in range(trial_count):
        print("Trial ", trial)
        for generation in range(generation_count):
            print("Generation ", generation)
            agent.train()

        # Close out the trial before resetting the agent for the next one.
        evaluator.endTrial()
        agent.reset()
        evaluator.plot(
            'learningCurve_gridworld_GA_{}.png'.format(trial),
            "Learning Curve - Gridworld with GA Agent",
        )

예제 #4

0

파일 보기

def problem3(para: dict, trails: int = 50):
    """
    Train the GA agent on the gridworld and log the mean return per evaluation.

    ``para`` must provide the keys ``'popSize'``, ``'numElite'``,
    ``'numEpisodes'`` and ``'alpha'``; they are printed once and forwarded
    to ``GA``.  ``agent.train()`` is called ``trails`` times.

    Returns the list of mean returns, one entry appended each time the
    evaluation callback is invoked.
    """
    pop_size, elite_count, episode_count, step_size = (
        para[key] for key in ('popSize', 'numElite', 'numEpisodes', 'alpha')
    )
    print('popSize:{}\tnumElite:{}\tnumEpisodes:{}\talpha:{}'.format(
        pop_size, elite_count, episode_count, step_size))

    mean_return_log = []

    def evaluate(theta, numEpisodes):
        # Build a fresh 25-state / 4-action tabular softmax policy for this
        # candidate, run it, and record the mean of the returned values.
        candidate = TabularSoftmax(25, 4)
        candidate.parameters = theta
        avg = np.mean(runEnvironment_gridworld(candidate, numEpisodes), axis=0)
        mean_return_log.append(avg)
        return avg

    def initPopulation(popSize: int):
        # Standard-normal initial population, one row of 25 * 4 values each.
        return np.random.normal(loc=0, scale=1, size=(popSize, 25 * 4))

    agent = GA(
        populationSize=pop_size,
        numElite=elite_count,
        numEpisodes=episode_count,
        evaluationFunction=evaluate,
        alpha=step_size,
        initPopulationFunction=initPopulation,
    )

    for round_idx in range(trails):
        agent.train()
        print('Episode {} finished'.format(round_idx))

    return mean_return_log

예제 #5

0

파일 보기

def problem6(para: dict, trails: int = 50):
    """
    Train the GA agent on cart-pole and log the mean return per evaluation.

    ``para`` must provide the keys ``'popSize'``, ``'numElite'``,
    ``'numEpisodes'`` and ``'alpha'``; they are printed once and forwarded
    to ``GA``.  ``agent.train()`` is called ``trails`` times.

    Each candidate is evaluated with ``LinearSoftmax(4, 2, 2)`` and the
    initial population has ``2 * 81`` values per individual.

    Returns the list of mean returns, one entry appended each time the
    evaluation callback is invoked.
    """
    pop_size, elite_count, episode_count, step_size = (
        para[key] for key in ('popSize', 'numElite', 'numEpisodes', 'alpha')
    )
    print('popSize:{}\tnumElite:{}\tnumEpisodes:{}\talpha:{}'.format(
        pop_size, elite_count, episode_count, step_size))

    mean_return_log = []

    def evaluate(theta, numEpisodes):
        # Load the candidate parameters into a fresh policy, run it, and
        # record the mean of the returned values.
        candidate = LinearSoftmax(4, 2, 2)
        candidate.parameters = theta
        avg = np.mean(runEnvironment_carpole(candidate, numEpisodes), axis=0)
        mean_return_log.append(avg)
        return avg

    def initPopulation(popSize: int):
        # Standard-normal initial population, one row of 2 * 81 values each.
        return np.random.normal(loc=0, scale=1, size=(popSize, 2 * 81))

    agent = GA(
        populationSize=pop_size,
        numElite=elite_count,
        numEpisodes=episode_count,
        evaluationFunction=evaluate,
        alpha=step_size,
        initPopulationFunction=initPopulation,
    )

    for round_idx in range(trails):
        agent.train()
        print('Episode {} finished'.format(round_idx))

    return mean_return_log

예제 #6

0

파일 보기

파일: homework2.py 프로젝트: subendhu19/rl-hw2

def problem6(config, iterations: int = 25):
    """
    Train the GA agent on the cart-pole domain and log every episode return.

    ``config`` is indexed positionally: ``config[0]`` is the basis setting
    (passed to ``run_cartpole_episode`` and ``LinearApproximation``, and
    used to pick the population initialiser), ``config[1]`` ->
    populationSize, ``config[2]`` -> numElite, ``config[3]`` -> numEpisodes,
    ``config[4]`` -> alpha, ``config[5]`` -> parent_frac.

    ``agent.train()`` is called ``iterations`` times.  Returns a NumPy array
    holding the return of every episode run by the evaluation callback, in
    execution order.
    """
    episode_log = []

    def evaluate(p, episodes):
        # Roll out `episodes` cart-pole episodes for `p`, remember each
        # individual return, and hand the batch mean back to the GA.
        batch = [run_cartpole_episode(p, config[0]) for _ in range(episodes)]
        episode_log.extend(batch)
        return np.mean(batch)

    agent_policy = LinearApproximation(state_dim=4,
                                       num_actions=2,
                                       basis=config[0])

    # The two supported setups differ only in the initialiser: basis 2 uses
    # the "_2" initialiser, anything else uses the "_3" one.
    agent = GA(
        populationSize=config[1],
        evaluationFunction=evaluate,
        initPopulationFunction=(init_cartpole_population_2
                                if config[0] == 2
                                else init_cartpole_population_3),
        numElite=config[2],
        numEpisodes=config[3],
        alpha=config[4],
        parent_frac=config[5],
    )

    for _ in range(iterations):
        # Whatever train() returns becomes the current policy parameters.
        agent_policy.parameters = agent.train()

    return np.array(episode_log)

예제 #7

0

파일 보기

def problem6():
    """
    Run 50 independent GA trials on the cart-pole domain and plot the returns.

    Every trial reseeds NumPy's global RNG, builds a fresh ``Cartpole``
    environment, a ``TabularSoftmaxContinuous(k, 2)`` policy and a new ``GA``
    agent, then calls ``agent.train()`` 20 times.  The discounted return of
    each evaluated episode is written into row ``trial`` of a
    ``(numTrials, numEpisodes * numIterations * populationSize)`` array,
    which is finally passed to ``plot``.
    """
    pop_size = 20
    elite_count = 5
    episodes_per_eval = 5
    trial_count = 50
    iteration_count = 20
    # Kp and alpha are forwarded positionally as the 6th and 7th GA
    # arguments; their meaning is defined by GA, which is not visible here.
    Kp = 10
    alpha = 2.5
    k = 2

    returns = np.zeros((trial_count,
                        episodes_per_eval * iteration_count * pop_size))

    for trial in range(trial_count):
        np.random.seed(np.random.randint(10000))

        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(k, 2)

        # The sample itself is unused, but the draw is kept so the random
        # stream seen by the rest of the trial is unchanged.
        _unused_theta = np.random.randn(tabular_softmax.parameters.shape[0])

        # Next free column in returns[trial]; advanced by the evaluator.
        cursor = 0

        def evaluateFunction(theta, numEpisodes):
            # Evaluate `theta` for `numEpisodes` episodes (each capped at
            # 1000 steps), log every discounted return, return their mean.
            nonlocal cursor

            tabular_softmax.parameters = theta
            step_cap = 1000
            total = 0

            for _ in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for _step in range(step_cap):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end == True:  # comparison kept exactly as written
                        break
                    state = nextstate
                total += G
                returns[trial][cursor] = G
                cartpole.reset()
                cursor += 1

            return total / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            # One standard-normal row per individual, sized to the policy's
            # current parameter vector.
            width = tabular_softmax.parameters.shape[0]
            return np.random.randn(populationSize, width)

        agent = GA(pop_size, evaluateFunction, initPopulation,
                   elite_count, episodes_per_eval, Kp, alpha)

        for iteration in range(iteration_count):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()

            # Nominal first column of this iteration's block of returns.
            block_start = iteration * episodes_per_eval * pop_size
            print(returns[trial][block_start:cursor])
            print(block_start)
            print(cursor)

        print(p)

    plot(returns, 'Cartpole domain Genetic Algorithm (standard deviation error bars) - 50 trials', 1000)

예제 #8

0

파일 보기

def problem3():
    """
    Run 50 independent GA trials on the More-Watery 687-Gridworld and plot.

    Every trial reseeds NumPy's global RNG, builds a fresh ``Gridworld``
    environment, a ``TabularSoftmax(25, 4)`` policy and a new ``GA`` agent,
    then calls ``agent.train()`` 100 times.  The return of each evaluated
    episode is written into row ``trial`` of a
    ``(numTrials, numEpisodes * numIterations * populationSize)`` array,
    which is finally passed to ``plot``.  After each iteration the returns
    logged so far for that iteration, their mean, and a 5x5 table of the
    highest-probability action per state are printed.
    """
    pop_size = 40
    elite_count = 20
    episodes_per_eval = 20
    trial_count = 50
    iteration_count = 100
    # Kp and alpha are forwarded positionally as the 6th and 7th GA
    # arguments; their meaning is defined by GA, which is not visible here.
    Kp = 30
    alpha = 3.0

    # Symbol printed for each argmax action index.
    arrows = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    returns = np.zeros((trial_count,
                        episodes_per_eval * iteration_count * pop_size))

    for trial in range(trial_count):
        np.random.seed(np.random.randint(10000))

        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)

        # The sample itself is unused, but the draw is kept so the random
        # stream seen by the rest of the trial is unchanged.
        _unused_theta = np.random.randn(tabular_softmax.parameters.shape[0])

        # Next free column in returns[trial]; advanced by the evaluator.
        cursor = 0

        def evaluateFunction(theta, numEpisodes):
            # Evaluate `theta` for `numEpisodes` episodes, log every return,
            # and return their mean.
            nonlocal cursor

            tabular_softmax.parameters = theta
            step_cap = 10000
            total = 0

            for _ in range(numEpisodes):
                state = gridworld.state
                G = 0
                discount = 1
                for t in range(step_cap):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = gridworld.step(action)
                    G += discount * reward
                    discount *= gridworld.gamma
                    if end == True:  # comparison kept exactly as written
                        break
                    if t == 200:
                        # Episode still running at step index 200: discard
                        # the accumulated return and record -50 instead.
                        G = -50
                        break
                    state = nextstate
                total += G
                returns[trial][cursor] = G
                gridworld.reset()
                cursor += 1

            return total / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            # One standard-normal row per individual, sized to the policy's
            # current parameter vector.
            width = tabular_softmax.parameters.shape[0]
            return np.random.randn(populationSize, width)

        agent = GA(pop_size, evaluateFunction, initPopulation,
                   elite_count, episodes_per_eval, Kp, alpha)

        for iteration in range(iteration_count):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()

            # Nominal first column of this iteration's block of returns.
            block_start = iteration * episodes_per_eval * pop_size
            print(returns[trial][block_start:cursor])
            print(np.mean(returns[trial][block_start:cursor]))

            # NOTE(review): the table reflects whatever parameters
            # evaluateFunction last assigned to tabular_softmax; `p` is not
            # loaded into the policy here -- confirm that is intended.
            policy_grid = [[0] * 5 for _ in range(5)]
            for cell in range(25):
                probs = tabular_softmax.getActionProbabilities(cell)
                row, col = divmod(cell, 5)
                policy_grid[row][col] = arrows.get(np.argmax(probs), 0)

            for grid_row in policy_grid:
                print(grid_row)

        print(p)

    plot(returns, 'More-Watery 687-Gridworld domain Genetic Algorithm (standard deviation error bars) - 50 trials', 3)

예제 #9

0

파일 보기

파일: homework2.py 프로젝트: kshitimehta/RL-Project

def problem6():
    """
    Run the GA on the cart-pole domain, then plot and save the results.

    One ``GA`` instance (population 10, 3 elites, 5 episodes per evaluation)
    is reset at the start of each of 10 trials and trained for 100
    iterations.  After every ``train()`` call the evaluator's
    ``batchReturn`` is copied into that iteration's slice of a
    ``(numTrials, numIterations * numEpisodes * popSize)`` result array.

    The per-episode mean and standard deviation across trials are drawn as
    an error-bar plot saved under ``images/``, and the raw, mean and std
    arrays are saved under ``data/``.  Both directories are created if they
    do not exist, so a long run cannot fail at the very end merely because
    an output folder is missing.
    """
    # Local import keeps this block self-contained; only needed for makedirs.
    import os

    print("ga-cartpole-softmax_theta_phi")

    # Create output folders up front so a path problem surfaces before
    # training starts rather than after it.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    fourier_param = 4

    def initPopFn(pop_size):
        # All-zero initial population with 2 * fourier_param**4 values per
        # individual.
        return np.zeros((pop_size, 2 * fourier_param**4))

    # NOTE(review): the result of nextState is discarded; the call is kept
    # in case it has side effects on `env` -- confirm whether it is needed.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()

    ga = GA(popSize, evaluate, initPopFn, numElite, numEpisodes)

    numTrials = 10
    numIterations = 100
    episodes_per_iteration = numEpisodes * popSize
    total_episodes = numIterations * episodes_per_iteration

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        ga.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart ga: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            ga.train()

            # Store this iteration's batch of returns in its own slice.
            batch_start = i * episodes_per_iteration
            batch_end = batch_start + episodes_per_iteration
            results[trial,
                    batch_start:batch_end] = np.array(evaluate.batchReturn)

    # `results` is already an ndarray, so no extra copy is needed.
    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.arange(maximumEpisodes),
                 average_results,
                 std_results,
                 marker='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    # File stem encodes the run settings plus the current hour and minute.
    now = datetime.now()
    plt_name = "ga_cartpole"
    plt_name += "_numTrials_" + str(numTrials) + "_numIter_" \
        + str(numIterations) + "_popSize_" + str(popSize)
    plt_name += now.strftime("_t_%H_%M")
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

예제 #10

0

파일 보기

파일: homework2.py 프로젝트: kshitimehta/RL-Project

def problem3():
    """
    Run the GA on the More-Watery 687-Gridworld, then plot and save results.

    One ``GA`` instance (population 10, 4 elites, 10 episodes per
    evaluation) is reset at the start of each of 20 trials and trained for
    25 iterations.  After every ``train()`` call the evaluator's
    ``batchReturn`` is copied into that iteration's slice of a
    ``(numTrials, numIterations * numEpisodes * popSize)`` result array.

    The per-episode mean and standard deviation across trials are drawn as
    an error-bar plot saved under ``images/``, and the raw, mean and std
    arrays are saved under ``data/``.  Both directories are created if they
    do not exist, so a long run cannot fail at the very end merely because
    an output folder is missing.
    """
    # Local import keeps this block self-contained; only needed for makedirs.
    import os

    def initPopFn(pop_size):
        # Draw `pop_size` individuals of 100 values each from a zero-mean
        # multivariate normal with identity covariance.
        theta_zeros = np.zeros(100)
        return np.random.multivariate_normal(theta_zeros,
                                             np.identity(len(theta_zeros)),
                                             size=pop_size)

    print("ga-gridworld-tabular_softmax")

    # Create output folders up front so a path problem surfaces before
    # training starts rather than after it.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    popSize = 10
    evaluate = Evaluate()
    numElite = 4
    numEpisodes = 10

    ga = GA(popSize, evaluate, initPopFn, numElite, numEpisodes)

    numTrials = 20
    numIterations = 25
    episodes_per_iteration = numEpisodes * popSize
    total_episodes = numIterations * episodes_per_iteration

    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        ga.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("ga: ", "trial: ", trial, "/", numTrials, " iteration: ",
                      i, "/", numIterations)
            ga.train()

            # Store this iteration's batch of returns in its own slice.
            batch_start = i * episodes_per_iteration
            batch_end = batch_start + episodes_per_iteration
            results[trial, batch_start:batch_end] = \
                np.array(evaluate.batchReturn)

    # `results` is already an ndarray, so no extra copy is needed.
    average_results = np.mean(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.arange(maximumEpisodes),
                 average_results,
                 std_results,
                 marker='.',
                 ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0,
             max_avg,
             "max: " + str(round(max_avg, 2)),
             fontsize=15,
             backgroundcolor='w')

    # File stem encodes the run settings plus the current hour and minute.
    now = datetime.now()
    plt_name = "ga_gridworld"
    plt_name += "_numTrials_" + str(numTrials) + "_numIter_" + str(
        numIterations)
    plt_name += now.strftime("_%H_%M")
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    plt.savefig(plt_path, dpi=200)
    plt.show()

    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)