def problem2(para: dict, trails: int = 50):
    """
    Repeat the previous question, but using first-choice hill-climbing on the
    More-Watery 687-Gridworld domain. Report the same quantities.
    """
    sigma = para['sigma']
    numEpisodes = para['numEpisodes']
    mean_return_log = []
    print('sigma:{}\tnumEpisodes:{}\t'.format(sigma, numEpisodes))

    def evaluate(theta, numEpisodes):
        eva_policy = TabularSoftmax(25, 4)
        eva_policy.parameters = theta
        returns = runEnvironment_gridworld(eva_policy, numEpisodes)
        mean_return = np.mean(returns)
        mean_return_log.append(mean_return)
        # print(mean_return)
        return mean_return

    policy = TabularSoftmax(25, 4)
    agent = FCHC(theta=policy.parameters, sigma=sigma, numEpisodes=numEpisodes,
                 evaluationFunction=evaluate)
    for i in range(trails):
        policy.parameters = agent.train()
        print('Iteration {} finished'.format(i))
    return mean_return_log
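# Example invocation of the FCHC variant above. The hyperparameter values are
# illustrative assumptions (they mirror the fixed values used in the later
# problem2() variant in this file), not tuned values reported for the assignment.
fchc_para = {'sigma': 1.0, 'numEpisodes': 200}
fchc_log = problem2(fchc_para, trails=50)  # one mean return logged per evaluate() call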
def problem2(config, iterations: int = 1000):
    """
    Repeat the previous question, but using first-choice hill-climbing on the
    More-Watery 687-Gridworld domain. Report the same quantities.
    """
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_gridworld_episode(p)
            returns.append(r)
            all_returns.append(r)
        return np.mean(returns)

    agent_policy = TabularSoftmax(25, 4)
    agent = FCHC(agent_policy.parameters, sigma=config[0],
                 evaluationFunction=evaluate, numEpisodes=config[1])
    bar = range(iterations)
    for i in bar:
        agent_policy.parameters = agent.train()
        # bar.set_description("Average return: {}".format(evaluate(agent_policy.parameters, 5)))
    return np.array(all_returns)
def problem3(config, iterations: int = 200):
    """
    Repeat the previous question, but using the GA (as described earlier in
    this assignment) on the More-Watery 687-Gridworld domain. Report the same
    quantities.
    """
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_gridworld_episode(p)
            returns.append(r)
            all_returns.append(r)
        return np.mean(returns)

    agent_policy = TabularSoftmax(25, 4)
    agent = GA(populationSize=config[0], evaluationFunction=evaluate,
               initPopulationFunction=init_gridworld_population,
               numElite=config[1], numEpisodes=config[2], alpha=config[3],
               parent_frac=config[4])
    bar = range(iterations)
    for i in bar:
        agent_policy.parameters = agent.train()
        # bar.set_description("Average return: {}".format(evaluate(agent_policy.parameters, 5)))
    return np.array(all_returns)
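# init_gridworld_population is referenced above but not defined in this file.
# A minimal sketch, assuming (as in the initPopulation helper of the other GA
# variant below) that each individual is a flat parameter vector for the
# 25-state, 4-action tabular softmax policy:
def init_gridworld_population(populationSize: int) -> np.ndarray:
    # One row per individual, one column per policy parameter (25 * 4 = 100).
    return np.random.randn(populationSize, 25 * 4)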
def run_gridworld_episode(p):
    environment = Gridworld()
    policy = TabularSoftmax(25, 4)
    policy.parameters = p
    is_end = False
    discounted_return = 0
    t = 0
    while not is_end:
        action = policy.samplAction(environment.state)
        new_state, reward, is_end = environment.step(action)
        discounted_return += (environment.gamma**t) * reward
        t += 1
        if t > 200:
            discounted_return = -50
            break
    environment.reset()
    return discounted_return
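# Quick usage sketch for run_gridworld_episode: one episode under a randomly
# initialised policy. The 100-dimensional parameter vector matches the
# 25-state, 4-action TabularSoftmax used throughout this file.
random_theta = np.random.randn(25 * 4)
print(run_gridworld_episode(random_theta))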
def problem1(para: dict, trails: int = 50):
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.
    """
    sigma = para['sigma']
    popSize = para['popSize']
    numElite = para['numElite']
    numEpisodes = para['numEpisodes']
    epsilon = para['epsilon']
    mean_return_log = []
    print('sigma:{}\tpopSize:{}\tnumElite:{}\tnumEpisodes:{}\tepsilon:{}'.format(
        sigma, popSize, numElite, numEpisodes, epsilon))

    def evaluate(theta, numEpisodes):
        eva_policy = TabularSoftmax(25, 4)
        eva_policy.parameters = theta
        returns = runEnvironment_gridworld(eva_policy, numEpisodes)
        mean_return = np.mean(returns)
        mean_return_log.append(mean_return)
        # print(mean_return)
        return mean_return

    policy = TabularSoftmax(25, 4)
    agent = CEM(theta=policy.parameters, sigma=sigma, popSize=popSize,
                numElite=numElite, numEpisodes=numEpisodes,
                evaluationFunction=evaluate, epsilon=epsilon)
    for i in range(trails):
        policy.parameters = agent.train()
        print('Iteration {} finished'.format(i))
    return mean_return_log
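# The docstring above asks for a learning curve averaged over at least 500
# trials with standard error or standard deviation error bars. A minimal
# sketch of how the mean_return_log values returned by problem1 could be
# aggregated and plotted with standard-error bars; the hyperparameter values
# and trial count are illustrative assumptions, and the log length is assumed
# constant across trials (one entry per evaluate() call).
import matplotlib.pyplot as plt

def plot_cem_learning_curve(numTrials: int = 500, trails: int = 50):
    para = {'sigma': 1.0, 'popSize': 10, 'numElite': 5,
            'numEpisodes': 20, 'epsilon': 4.0}
    logs = np.array([problem1(para, trails) for _ in range(numTrials)])
    mean = logs.mean(axis=0)
    std_err = logs.std(axis=0) / np.sqrt(numTrials)  # standard error of the mean
    x = np.arange(len(mean))
    plt.errorbar(x, mean, yerr=std_err)
    plt.xlabel('Evaluation (each point averages numEpisodes episodes)')
    plt.ylabel('Mean discounted return')
    plt.title('CEM on More-Watery 687-Gridworld (standard error bars)')
    plt.show()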
class Evaluate:
    def __init__(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []

    @property
    def batchReturn(self) -> list:
        return self._G

    def __call__(self, theta: np.array, numEpisodes: int):
        # print("Evaluating Gridworld")
        # self._G = []  # reset G at every call
        # environment = Gridworld()
        # policy = TabularSoftmax(25,4)
        self.policy.parameters = theta
        # print("numEpisodes", numEpisodes)
        Count = 200
        for episode in range(numEpisodes):
            self.environment.reset()
            G_episode = 0
            counter = 0
            ctr = 0
            while not self.environment.isEnd:
                if counter >= Count:
                    G_episode = -50
                    break
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                G_episode += (self.environment.gamma**ctr) * reward
                # G_episode += reward
                counter += 1
                ctr += 1
            # self.returns.append(Gi)
            self._G.append(G_episode)
            # if (episode % 50 == 0):
            #     print(G_episode)
        # print("Mean Return ", np.mean(G))
        return np.mean(self._G)

    def reset(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []
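# Usage sketch for the Evaluate callable above, assuming the CEM agent used
# elsewhere in this file accepts any callable of the form f(theta, numEpisodes)
# as its evaluationFunction. The hyperparameter values are illustrative
# assumptions borrowed from the fixed-value problem1() variant below.
evaluator = Evaluate()
policy = TabularSoftmax(25, 4)
cem_agent = CEM(theta=policy.parameters, sigma=1.0, popSize=10, numElite=5,
                numEpisodes=20, evaluationFunction=evaluator, epsilon=4.0)
for _ in range(50):
    policy.parameters = cem_agent.train()
evaluator.reset()  # clear stored returns between trials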
def __call__(self, parameters: np.array, numEpisodes: int):
    # print("Evaluating Gridworld")
    G = []
    policy = TabularSoftmax(25, 4)
    policy.parameters = parameters
    env = Gridworld()
    for ep in range(numEpisodes):
        # print("Episode ", ep)
        env.reset()
        Gi = 0
        timeStep = 0
        while not env.isEnd:
            state = env.state
            action = policy.samplAction(state)
            _, reward, _ = env.step(action)  # step returns (nextState, reward, isEnd)
            Gi += reward
            timeStep += 1
            if timeStep == 200:
                Gi += -50
                break
        G.append(Gi)
        self.curTrialReturns.append(Gi)
    print("Mean Return ", np.mean(G))
    return np.mean(G)
def problem1(config, iterations: int = 200):
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.
    """
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_gridworld_episode(p)
            returns.append(r)
            all_returns.append(r)
        return np.mean(returns)

    agent_policy = TabularSoftmax(25, 4)
    agent = CEM(agent_policy.parameters, sigma=config[0], popSize=config[1],
                numElite=config[2], numEpisodes=config[3],
                evaluationFunction=evaluate, epsilon=config[4])
    bar = range(iterations)
    for i in bar:
        agent_policy.parameters = agent.train()
        # bar.set_description("Average return: {}".format(evaluate(agent_policy.parameters, 5)))
    return np.array(all_returns)
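# Example config for the CEM variant above: the positional indexing in the
# constructor call is [sigma, popSize, numElite, numEpisodes, epsilon]. The
# values are illustrative assumptions taken from the fixed-value variant below.
cem_config = [1.0, 10, 5, 20, 4.0]
cem_returns = problem1(cem_config, iterations=200)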
def problem1():
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.
    """
    # TODO
    popSize = 10        # 10
    numElite = 5        # 5
    epsilon = 4.0       # 4.0
    sigma = 1.0         # 1.0
    numEpisodes = 20    # 50
    numTrials = 5       # 5
    numIterations = 50  # 200
    returns = np.zeros((numTrials, numEpisodes * numIterations))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = 0

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
def problem3():
    """
    Repeat the previous question, but using the GA (as described earlier in
    this assignment) on the More-Watery 687-Gridworld domain. Report the same
    quantities.
    """
    # TODO
    populationSize = 40  # 40
    numElite = 20        # 20
    numEpisodes = 20     # 20
    numTrials = 50       # 50
    numIterations = 100  # 100
    Kp = 30              # 30
    alpha = 3.0          # 3.0
    returns = np.zeros((numTrials, numEpisodes * numIterations * populationSize))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = 0

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
            expected_reward = 0
            numTimeSteps = 10000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = gridworld.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = gridworld.step(action)
                    G += (discount) * reward
                    discount *= gridworld.gamma
                    if end == True:
                        break
                    elif t == 200:
                        G = -50
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                gridworld.reset()
                count += 1
            return expected_reward / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            return np.random.randn(populationSize, tabular_softmax.parameters.shape[0])

        agent = GA(populationSize, evaluateFunction, initPopulation, numElite,
                   numEpisodes, Kp, alpha)
        for iteration in range(numIterations):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * populationSize : count])
            print(np.mean(returns[trial][iteration * numEpisodes * populationSize : count]))
        # Print the greedy action in each grid cell as an arrow.
        l = [[0 for i in range(5)] for j in range(5)]
        for i in range(25):
            k = tabular_softmax.getActionProbabilities(i)
            # print(k)
            r = np.argmax(k)
            if (r == 0):
                l[i//5][i % 5] = '↑'
            elif (r == 1):
                l[i//5][i % 5] = '↓'
            elif (r == 2):
                l[i//5][i % 5] = '←'
            elif (r == 3):
                l[i//5][i % 5] = '→'
        for i in range(5):
            print(l[i])
        print(p)
    plot(returns, 'More-Watery 687-Gridworld domain Genetic Algorithm (standard deviation error bars) - 50 trials', 3)
def problem2():
    """
    Repeat the previous question, but using first-choice hill-climbing on the
    More-Watery 687-Gridworld domain. Report the same quantities.
    """
    # TODO
    sigma = 1.0          # 1.0
    numEpisodes = 200    # 200
    numTrials = 50       # 50
    numIterations = 200  # 200
    returns = np.zeros((numTrials, numEpisodes * numIterations))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = -1

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
            expected_reward = 0
            numTimeSteps = 10000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = gridworld.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = gridworld.step(action)
                    G += (discount) * reward
                    discount *= gridworld.gamma
                    if end == True:
                        break
                    elif t == 100:
                        break
                    state = nextstate
                expected_reward += G
                if (count != -1):
                    returns[trial][count] = G
                    count += 1
                gridworld.reset()
            return expected_reward / numEpisodes

        agent = FCHC(theta, sigma, evaluateFunction, numEpisodes)
        count = 0
        for iteration in range(numIterations):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes : count])
            print(np.mean(returns[trial][iteration * numEpisodes : count]))
        # Print the greedy action in each grid cell as an arrow.
        l = [[0 for i in range(5)] for j in range(5)]
        for i in range(25):
            k = tabular_softmax.getActionProbabilities(i)
            print(k)
            r = np.argmax(k)
            if (r == 0):
                l[i//5][i % 5] = '↑'
            elif (r == 1):
                l[i//5][i % 5] = '↓'
            elif (r == 2):
                l[i//5][i % 5] = '←'
            elif (r == 3):
                l[i//5][i % 5] = '→'
        for i in range(5):
            print(l[i])
        print(p)
    plot(returns, 'More-Watery 687-Gridworld domain First Choice Hill Climbing (standard deviation error bars) - 50 trials', 3)
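# The plot helper called in problem2() and problem3() above is not defined in
# this file. A minimal sketch, assuming it takes the
# (numTrials x numEpisodes*numIterations) returns array, a title, and a figure
# index (the meaning of the third argument is an assumption), and draws the
# mean return per episode with standard-deviation error bars, as the titles
# above indicate:
import matplotlib.pyplot as plt

def plot(returns, title, fig_num):
    plt.figure(fig_num)
    mean = returns.mean(axis=0)  # average over trials
    std = returns.std(axis=0)    # standard-deviation error bars
    x = np.arange(returns.shape[1])
    plt.plot(x, mean)
    plt.fill_between(x, mean - std, mean + std, alpha=0.3)
    plt.xlabel('Episode')
    plt.ylabel('Return')
    plt.title(title)
    plt.show()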