def problemA():
    """
    Run the uniform random policy for 10,000 episodes and report statistics.

    In every episode an action is drawn uniformly from {0, 1, 2, 3} until
    the environment reports the episode as finished; the rewards handed
    back by ``env.step`` are summed into that episode's return. The mean,
    standard deviation, maximum and minimum over all episodes are printed.

    Returns:
        list: one accumulated return per episode, in episode order.
    """
    print("Problem A")

    env = Gridworld()

    discounted_returns = []
    for _ in range(10000):
        discounted_return = 0.0
        while not env.isEnd:
            action = np.random.choice([0, 1, 2, 3])
            # step() yields (actual_action, new_state, reward); only the
            # reward contributes to the return.
            _, _, reward = env.step(action)
            # NOTE(review): rewards are summed without an explicit gamma
            # factor -- presumably the environment already discounts the
            # reward it returns; confirm against Gridworld.step.
            discounted_return += reward
        env.reset()
        discounted_returns.append(discounted_return)

    print("Mean ", np.mean(discounted_returns))
    print("Std Dev ", np.std(discounted_returns))
    print("Max ", np.max(discounted_returns))
    print("Min ", np.min(discounted_returns))

    return discounted_returns
    """
示例#2
0
def problemC():
    """
    Run a fixed hand-written policy for 10,000 episodes and report statistics.

    ``policy`` maps each of the 25 state indices to an action code, which is
    passed through ``G.stoch_action`` before being applied. After each
    episode the value of ``G.reward`` is recorded (presumably the
    environment accumulates the discounted return there -- confirm against
    Gridworld).

    Returns:
        numpy.ndarray: the recorded per-episode value of ``G.reward``.
    """
    print("PROBLEM C...")
    policy = np.array([
        3, 3, 3, 3, 1, 0, 3, 3, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 3, 3,
        4
    ])
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for e in range(episodes):
        # The attribute is spelled ``timeStep`` everywhere else the counter
        # is used; the lower-case spelling only created an unrelated
        # attribute and left the real counter untouched.
        G.timeStep = 0
        while not G.isEnd:
            G.step(G.stoch_action(policy[G.state]))
        arr[e] = G.reward
        G.reset()

    opt_disc_returns = np.amax(arr)
    opt_episode = np.argmax(arr) + 1  # report episodes 1-based
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    min_disc_returns = np.amin(arr)  # renamed: do not shadow builtin min()
    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, min_disc_returns))
    return arr
示例#3
0
class Evaluate:
    """
    Callable that scores a tabular-softmax parameter vector on Gridworld.

    Every call runs ``numEpisodes`` episodes with the given parameters and
    appends each episode's discounted return to an internal list. That list
    is NOT cleared between calls; use ``reset()`` to start over.
    """

    def __init__(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []  # discounted return of every episode run so far

    @property
    def batchReturn(self) -> list:
        """Return the list of per-episode returns collected since the last reset."""
        return self._G

    def __call__(self, theta: np.array, numEpisodes: int):
        """
        Run ``numEpisodes`` episodes under parameters ``theta``.

        An episode that is still running after 200 steps is cut off and its
        return is replaced by -50.

        Returns:
            The mean of ALL returns stored so far (including those from
            earlier calls since the last ``reset()``), not only this call's.
        """
        self.policy.parameters = theta

        max_steps = 200

        for _ in range(numEpisodes):
            self.environment.reset()
            G_episode = 0

            # A single counter serves as both the step limit and the
            # discount exponent.
            step = 0
            while not self.environment.isEnd:
                if step >= max_steps:
                    G_episode = -50
                    break
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)

                G_episode += (self.environment.gamma ** step) * reward
                step += 1
            self._G.append(G_episode)

        return np.mean(self._G)

    def reset(self):
        """Replace the environment and policy with fresh ones and drop all stored returns."""
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []
示例#4
0
def problemA():
    """
    Simulate 10,000 episodes using the environment's own action source.

    Each step applies whatever ``G.action`` yields; once the episode is
    over, ``G.reward`` is stored as that episode's result. Summary
    statistics are printed and the per-episode results are returned as a
    NumPy array.
    """
    print("PROBLEM A...")
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for idx in range(episodes):
        G.timeStep = 0
        while True:
            if G.isEnd:
                break
            G.step(G.action)
        arr[idx] = G.reward
        G.reset()

    best_return = np.amax(arr)
    best_episode = np.argmax(arr) + 1  # 1-based episode number
    avg = np.mean(arr)
    var = np.var(arr)
    sd = np.std(arr)
    worst_return = np.amin(arr)
    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (best_return, best_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (avg, var, sd))
    print("Max is %f and min is %f" % (best_return, worst_return))
    return arr
示例#5
0
def problemE():
    """
    Estimate by simulation how often the agent is in state 21 at the end of
    a trial that starts in state 18 with the time-step counter set to 8.

    Each of the 100,000 trials steps the environment with ``env.action``
    while ``env.timeStep`` is below 19 (presumably ``step`` advances that
    counter -- confirm against Gridworld), stopping early if the episode
    ends. The fraction of trials finishing in state 21 is printed.
    """
    env = Gridworld(startState=18)
    env.gamma = 0.9
    hit = 0
    total_try = 100000
    for _ in range(total_try):
        # Without a reset every trial after the first would continue from
        # wherever the previous one stopped instead of from the start state.
        # NOTE(review): assumes reset() returns to the startState given to
        # the constructor -- confirm.
        env.reset()
        env.timeStep = 8
        while env.timeStep < 19:
            _, _, isEnd = env.step(env.action)
            if isEnd:
                break
        if env.currentState == 21:
            hit += 1
    print('P is {}'.format(hit / total_try))
示例#6
0
def problemE(num_iters):
    """
    Estimate P(S_19 = 22 | S_8 = 19) by simulation.

    Each of the ``num_iters`` trials places the agent in state 19 and
    applies up to 11 actions from ``agent.act()`` (time steps 8 through 18,
    so the last one is a_18). The fraction of trials ending in state 22 is
    printed.

    Args:
        num_iters: number of simulated trials; must be positive.
    """
    agent = Agent()
    gridworld = Gridworld()
    count = 0

    for _ in range(num_iters):
        # Clear any terminal status left over from the previous trial
        # before forcing the conditioning state.
        # NOTE(review): assumes reset() clears the end-of-episode flag --
        # confirm against Gridworld.
        gridworld.reset()
        gridworld.state = 19  # the state directly above the end state
        for _time in range(8, 19):
            # Once the episode is over the state can no longer change, so
            # further steps must not be taken.
            if gridworld.isEnd:
                break
            gridworld.step(agent.act())

        if gridworld.state == 22:
            count += 1

    print('P(S_19 = 22| S_8=19) = ', count / num_iters)
示例#7
0
def runEnvironment(getAction, numeps=10000):
    """
    Simulate ``numeps`` Gridworld episodes driven by ``getAction``.

    ``getAction`` is called with the current state and must return the
    action to apply. Each episode's return is the sum of rewards weighted
    by ``grid.gamma`` raised to the step index. Summary statistics are
    printed, and the array of per-episode returns is returned.
    """
    returns = np.zeros(numeps)

    grid = Gridworld()
    for ep in range(numeps):
        grid.reset()
        t, total = 0, 0
        while True:
            if grid.isEnd:
                break
            _, reward, _ = grid.step(getAction(grid.state))
            total += (grid.gamma ** t) * reward
            t += 1
        returns[ep] = total

    summary = "Average: {}\nStandard Deviation: {}\nMin: {}\nMax: {}"
    print(summary.format(
        np.mean(returns), np.std(returns), np.min(returns), np.max(returns)))
    return returns
示例#8
0
def run_gridworld_episode(p):
    """
    Play one Gridworld episode with a tabular softmax policy set to ``p``.

    Returns the discounted return of the episode, or -50 if more than 200
    steps were taken.
    """
    environment = Gridworld()
    policy = TabularSoftmax(25, 4)
    policy.parameters = p

    total = 0
    steps = 0
    while True:
        chosen = policy.samplAction(environment.state)
        _, reward, finished = environment.step(chosen)
        total += (environment.gamma ** steps) * reward
        steps += 1
        # The step cap is checked before the termination flag.
        if steps > 200:
            total = -50
            break
        if finished:
            break
    environment.reset()
    return total
示例#9
0
def problemE():
    """
    Empirically estimate Pr(S_19 = 21 | S_8 = 18) under the uniform random
    policy.

    Every trial forces the environment into the conditioning state and then
    takes at most 11 uniformly random actions (fewer if the episode ends
    first); a trial succeeds when it finishes in the target state. A 95%
    confidence interval from Hoeffding's inequality is printed alongside
    the estimate.
    NOTE: State 18 is state 19 in this gridworld implementation and state 21 is 22.
    """
    print("\nProblem E")
    env = Gridworld()
    N = 100000
    success = 0
    for _ in range(N):
        env.reset()
        env._state = 19
        for _step in range(11):
            if env.isEnd:
                break
            env.step(np.random.choice(range(4)))
        if env._state == 22:
            success += 1
    p = success / N
    eps = np.sqrt((1 / (2 * N)) * np.log(2 / 0.05))  # Hoeffding half-width
    report = "Pr(S_19=s_22 | S_8=s_18)={0:.5f} empirically and is in ({1:.5f},{2:.5f}) with 95% confidence using Hoeffding's inequality"
    print(report.format(p, p - eps, p + eps))
示例#10
0
def runEnvironment_gridworld(policy, numeps=10000):
    """
    Evaluate ``policy`` on Gridworld for ``numeps`` episodes.

    Actions come from ``policy.samplAction(state)``. An episode's return is
    the gamma-discounted sum of its rewards; an episode that goes past 200
    steps is cut off and scored -50. Returns the array of episode returns.
    """
    returns = np.zeros(numeps)

    grid = Gridworld()
    for ep in range(numeps):
        grid.reset()
        t = 0
        total = 0
        while True:
            if grid.isEnd:
                break
            _, reward, _ = grid.step(policy.samplAction(grid.state))
            total += (grid.gamma ** t) * reward
            t += 1
            if t > 200:
                total = -50
                break
        returns[ep] = total
    return returns
示例#11
0
def problemA():
    """
    Have the agent uniformly randomly select actions. Run 10,000 episodes.
    Report the mean, standard deviation, maximum, and minimum of the observed
    returns.

    Each episode sums the rewards returned by ``grid_world.step`` until the
    step reports the episode as finished. One progress line is printed per
    episode, followed by the four summary statistics.
    """
    grid_world = Gridworld()
    rewards = []
    for episod in range(10000):
        is_end = False
        grid_world.reset()
        r = 0

        # Logical ``not`` is required here: ``~`` is bitwise inversion, and
        # on Python bools it yields -1 / -2, which are both truthy, so the
        # loop could never terminate.
        while not is_end:
            action = np.random.randint(4)
            r_, is_end = grid_world.step(action)
            r += r_
        rewards.append(r)
        print(episod, r, is_end)
    rewards = np.array(rewards)
    print(rewards.mean(), rewards.std(), rewards.max(), rewards.min())
def problemE():
    """
    Empirically estimate Pr(S_19 = 22 | S_8 = 19) under the uniform random
    policy.

    The environment is created with start state 19, and each of the
    1,000,000 episodes is run for at most 12 uniformly random actions. An
    episode is counted when it is still running after 11 steps and the
    agent is then in state 22. The raw count, the estimated probability and
    the wall-clock execution time are printed.
    """
    print("Problem E")
    start_time = time.time()

    env = Gridworld(startState=19)
    num_episodes = 1000000
    count_s19_22_given_s8_19 = 0
    for _ in range(num_episodes):
        time_step = 0
        while (not env.isEnd) and time_step < 12:
            # Eleven actions after the start corresponds to time step 19
            # when the start is labelled time step 8.
            if time_step == 11 and env.state == 22:
                count_s19_22_given_s8_19 += 1
            action = np.random.choice([0, 1, 2, 3])
            env.step(action)
            time_step += 1
        env.reset()
    print(count_s19_22_given_s8_19)
    Pr_s19_22_given_s8_19 = (count_s19_22_given_s8_19 * 1.0) / num_episodes

    end_time = time.time()
    # The label previously had the event and the condition swapped.
    print("Estimate of Pr(S_19 = 22 | S_8 = 19) = ", Pr_s19_22_given_s8_19)
    print("Execution time = ", end_time - start_time)
    """
示例#13
0
def problemA(num_iters):
    """
    Simulate ``num_iters`` episodes with actions from ``agent.act()``.

    After every step the environment's ``reward`` attribute is weighted by
    ``gamma`` raised to the step index and added to the episode's return.
    The mean, sample standard deviation, maximum and minimum of the returns
    are printed, and the list of returns is returned.
    """
    agent = Agent()
    discounted_returns = []
    gridworld = Gridworld()

    for _ in range(num_iters):
        episode_return = 0
        t = 0
        finished = False
        while not finished:
            gridworld.step(agent.act())
            episode_return += gridworld.reward * (gridworld.gamma ** t)
            t += 1
            finished = gridworld.isEnd
        discounted_returns.append(episode_return)
        gridworld.reset()

    print('Mean = ', st.mean(discounted_returns))
    print('Standard deviation = ', st.stdev(discounted_returns))
    print('Max = ', max(discounted_returns))
    print('Min = ', min(discounted_returns))

    return discounted_returns
示例#14
0
def problemC(num_iters):
    """
    Simulate ``num_iters`` episodes with actions chosen by
    ``agent.actOptimally(state)``.

    The episode return is accumulated exactly as for the random policy: the
    environment's ``reward`` attribute after each step, weighted by
    ``gamma`` raised to the step index. Mean, sample standard deviation,
    maximum and minimum are printed; the list of returns is returned.
    """
    agent = Agent()
    discounted_returns = []
    gridworld = Gridworld()
    print("acting optimally")

    for _ in range(num_iters):
        total, power = 0, 0
        while True:
            chosen = agent.actOptimally(gridworld.state)
            gridworld.step(chosen)
            total += gridworld.reward * (gridworld.gamma ** power)
            if gridworld.isEnd:
                break
            power += 1
        discounted_returns.append(total)
        gridworld.reset()

    for label, value in (
        ('Mean = ', st.mean(discounted_returns)),
        ('Standard deviation = ', st.stdev(discounted_returns)),
        ('Max = ', max(discounted_returns)),
        ('Min = ', min(discounted_returns)),
    ):
        print(label, value)

    return discounted_returns
def problemB():
    """
    Run the optimal policy that you found for 10,000 episodes. Report the
    mean, standard deviation, maximum, and minimum of the observed
    returns.

    The policy is a lookup table from state index to action; entries of -1
    mark states for which no action is defined here.

    Returns:
        list: one accumulated return per episode, in episode order.
    """
    print("Problem B")

    optimal_policy_actions = [
        1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, -1, 2, 2, 0, 3, -1, 1, 2, 0, 3, 1,
        1, -1
    ]

    env = Gridworld()

    discounted_returns = []
    for _ in range(10000):
        discounted_return = 0.0
        while not env.isEnd:
            action = optimal_policy_actions[env.state]
            # step() yields (actual_action, new_state, reward); only the
            # reward contributes to the return.
            _, _, reward = env.step(action)
            # NOTE(review): rewards are summed without an explicit gamma
            # factor -- presumably the environment already discounts the
            # reward it returns; confirm against Gridworld.step.
            discounted_return += reward
        discounted_returns.append(discounted_return)
        env.reset()

    print("Mean ", np.mean(discounted_returns))
    print("Std Dev ", np.std(discounted_returns))
    print("Max ", np.max(discounted_returns))
    print("Min ", np.min(discounted_returns))

    return discounted_returns
    # plt.hist(sorted(discounted_returns), density = True, cumulative=True, label='CDF',
    #      histtype='step', alpha=0.8, color='k')
    # plt.show()
    """
示例#16
0
def problemE():
    """
    Estimate by simulation how often an episode started in state 19 is in
    state 22 once the environment's step counter reaches 11.

    10,000 episodes are run with the environment's own action source; each
    stops early if the episode ends. The printed message uses the
    assignment's state labels (18 and 21) rather than the indices tested in
    the code (19 and 22).
    """
    print("PROBLEM E...")
    episodes = 10000
    hits = 0
    G = Gridworld(startState=19)
    G.gamma = 0.9
    for _ in range(episodes):
        G.timeStep = 0
        while not (G.timeStep >= 11 or G.isEnd):
            G.step(G.action)
        if G.state == 22:
            hits += 1
        G.reset()
    estimate = hits / episodes
    print("The empirical probability of S19 = 21 given S8 = 18 is %f" %
          (estimate))
示例#17
0
 def __call__(self, parameters: np.array, numEpisodes: int):
     """
     Score ``parameters`` by running ``numEpisodes`` Gridworld episodes.

     A fresh tabular softmax policy and environment are built on every
     call. Each episode's return is the plain sum of its rewards; an
     episode that reaches 200 steps has 50 subtracted from its return and
     is cut off. Every return is also appended to
     ``self.curTrialReturns``. The mean return is printed and returned.
     """
     policy = TabularSoftmax(25, 4)
     policy.parameters = parameters
     env = Gridworld()
     episode_returns = []
     for _ in range(numEpisodes):
         env.reset()
         total = 0
         steps = 0
         while True:
             if env.isEnd:
                 break
             _, _, reward = env.step(policy.samplAction(env.state))
             total += reward
             steps += 1
             if steps == 200:
                 total += -50
                 break
         episode_returns.append(total)
         self.curTrialReturns.append(total)
     mean_return = np.mean(episode_returns)
     print("Mean Return ", mean_return)
     return mean_return
示例#18
0
def problemA():
    """
    Have the agent uniformly randomly select actions. Run 10,000 episodes.
    Report the mean, standard deviation, maximum, and minimum of the observed
    returns.

    Actions come from ``env.action``. Each episode's return is the sum of
    the rewards returned by ``env.step``. Progress is printed per episode,
    the four statistics are printed at the end, and the list of returns is
    written to ``./resultA.json``.
    """
    reward_list = []
    env = Gridworld()
    env.gamma = 0.9

    # Exactly 10,000 episodes, numbered from 1 for the progress output (the
    # previous ``<= 10000`` bound ran one episode too many).
    for episode in range(1, 10001):
        print('Episode {}'.format(episode))
        step = 0
        totalReward = 0
        # The loop can only be left through the terminal-state branch, so
        # no retry bookkeeping is needed.
        while True:
            step += 1
            _, reward, isEnd = env.step(env.action)
            totalReward += reward
            if isEnd:
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break
        env.reset()
        reward_list.append(totalReward)
    print('finished')

    reward_array = np.array(reward_list)
    mean_reward = reward_array.mean()
    std_reward = reward_array.std()
    max_reward = reward_array.max()  # named so builtin max() is not shadowed
    min_reward = reward_array.min()  # named so builtin min() is not shadowed

    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean_reward, std_reward, max_reward, min_reward))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultA.json', 'w') as file:
        json.dump(reward_list, file)
示例#19
0
def problem1():
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters,
    what hyperparameters you found worked best, and present a learning curve
    plot using these hyperparameters, as described in class. This plot may be
    over any number of episodes, but should show convergence to a nearly
    optimal policy. The plot should average over at least 500 trials and
    should include standard error or standard deviation error bars. Say which
    error bar variant you used.
    """

    #TODO

    # Hyperparameters. The trailing number on each line is presumably the
    # value used for the reported experiment -- TODO confirm.
    popSize = 10 #10
    numElite = 5 #5
    epsilon = 4.0 #4.0
    sigma = 1.0 #1.0
    numEpisodes = 20 #50
    numTrials = 5 #5
    numIterations = 50 #200

    # One row per trial, with numEpisodes * numIterations slots per row.
    returns = np.zeros((numTrials, numEpisodes * numIterations))

    for trial in range(numTrials):

        # Re-seed NumPy's global generator with a value drawn from that same
        # generator.
        np.random.seed(np.random.randint(10000))

        gridworld = Gridworld()

        tabular_softmax = TabularSoftmax(25, 4)
        # Random initial vector with the same length as the policy's
        # parameter vector.
        theta = np.random.randn(tabular_softmax.parameters.shape[0])

        # Counter shared with the nested evaluation function via ``nonlocal``.
        count = 0

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
            # NOTE(review): the remainder of this function is not visible in
            # this excerpt; the body is truncated here.
示例#20
0
def problem3():
    """
    Train a tabular softmax policy on the More-Watery 687-Gridworld with the
    genetic algorithm and plot the resulting learning curve.

    Runs 50 independent trials of 100 GA iterations each. Every episode
    return produced by the evaluation function is stored in ``returns``
    (one row per trial). After each iteration the returns gathered during
    that iteration, their mean, and a 5x5 arrow grid of the current
    most-probable action per state are printed.
    """
    populationSize = 40
    numElite = 20
    numEpisodes = 20
    numTrials = 50
    numIterations = 100
    Kp = 30
    alpha = 3.0

    returns = np.zeros((numTrials, numEpisodes * numIterations * populationSize))

    # Display symbol for each action index.
    arrows = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))

        gridworld = Gridworld()

        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])

        # Index of the next free slot in this trial's row of ``returns``.
        count = 0

        def evaluateFunction(theta, numEpisodes):
            """Return the mean discounted return of ``theta`` over ``numEpisodes`` episodes."""
            nonlocal count

            tabular_softmax.parameters = theta
            total = 0

            for _ in range(numEpisodes):
                state = gridworld.state
                G = 0
                discount = 1
                for t in range(10000):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = gridworld.step(action)
                    G += discount * reward
                    discount *= gridworld.gamma
                    if end == True:
                        break
                    if t == 200:
                        # Episode ran too long: score it -50 and stop.
                        G = -50
                        break
                    state = nextstate
                total += G
                returns[trial][count] = G
                gridworld.reset()
                count += 1

            return total / numEpisodes

        def initPopulation(populationSize : int) -> np.ndarray:
            """Draw ``populationSize`` standard-normal parameter vectors."""
            return np.random.randn(populationSize, tabular_softmax.parameters.shape[0])

        agent = GA(populationSize, evaluateFunction, initPopulation, numElite, numEpisodes, Kp, alpha)

        for iteration in range(numIterations):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()

            # Returns recorded since the nominal start of this iteration.
            recent = returns[trial][iteration * numEpisodes * populationSize : count]
            print(recent)
            print(np.mean(recent))

            grid = [[0] * 5 for _ in range(5)]
            for s in range(25):
                best = np.argmax(tabular_softmax.getActionProbabilities(s))
                if best in arrows:
                    grid[s // 5][s % 5] = arrows[best]

            for row in grid:
                print(row)
        print(p)

    plot(returns, 'More-Watery 687-Gridworld domain Genetic Algorithm (standard deviation error bars) - 50 trials', 3)
示例#21
0
def problem2():
    """
    Train a tabular softmax policy on the More-Watery 687-Gridworld with
    first-choice hill-climbing and plot the resulting learning curve.

    Runs 50 independent trials of 200 iterations each. Episode returns are
    recorded in ``returns`` (one row per trial), except those produced
    while the FCHC agent is being constructed. After each iteration the
    recently recorded returns, their mean, the action probabilities of
    every state, and a 5x5 arrow grid of the most-probable actions are
    printed.
    """
    sigma = 1.0

    numEpisodes = 200
    numTrials = 50
    numIterations = 200

    returns = np.zeros((numTrials, numEpisodes * numIterations))

    # Display symbol for each action index.
    arrows = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))

        gridworld = Gridworld()

        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])

        # -1 means "do not record": evaluations made before the counter is
        # set to 0 below are left out of ``returns``.
        count = -1

        def evaluateFunction(theta, numEpisodes):
            """Return the mean discounted return of ``theta`` over ``numEpisodes`` episodes."""
            nonlocal count

            tabular_softmax.parameters = theta
            total = 0

            for _ in range(numEpisodes):
                state = gridworld.state
                G = 0
                discount = 1
                for t in range(10000):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = gridworld.step(action)
                    G += discount * reward
                    discount *= gridworld.gamma
                    # Stop at the end of the episode or once t reaches 100;
                    # the return keeps whatever has accumulated so far.
                    if end == True or t == 100:
                        break
                    state = nextstate
                total += G
                if count != -1:
                    returns[trial][count] = G
                    count += 1
                gridworld.reset()

            return total / numEpisodes

        agent = FCHC(theta, sigma, evaluateFunction, numEpisodes)

        # From here on every evaluated episode is recorded.
        count = 0

        for iteration in range(numIterations):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()

            recent = returns[trial][iteration * numEpisodes : count]
            print(recent)
            print(np.mean(recent))

            grid = [[0] * 5 for _ in range(5)]
            for s in range(25):
                probs = tabular_softmax.getActionProbabilities(s)
                print(probs)
                best = np.argmax(probs)
                if best in arrows:
                    grid[s // 5][s % 5] = arrows[best]

            for row in grid:
                print(row)
        print(p)

    plot(returns, 'More-Watery 687-Gridworld domain First Choice Hill Climbing (standard deviation error bars) - 50 trials', 3)
示例#22
0
 def reset(self):
     """Replace the environment and policy with new instances and empty the stored returns."""
     self.environment = Gridworld()
     # Policy is constructed with the arguments (25, 4).
     self.policy = TabularSoftmax(25,4)
     # Start again with no recorded returns.
     self._G = []
示例#23
0
 def __init__(self):
     """Create the environment, the policy, and an empty list for returns."""
     self.environment = Gridworld()
     # Policy is constructed with the arguments (25, 4).
     self.policy = TabularSoftmax(25,4)
     # Starts empty; nothing in this method adds to it.
     self._G = []
示例#24
0
def problemB():
    """
    Run the optimal policy that you found for 10,000 episodes. Report the
    mean, standard deviation, maximum, and minimum of the observed
    returns.

    Policy: in a state listed in ``env.rightBounds`` move down (action 2);
    otherwise, in a state listed in ``env.upperBounds`` move right
    (action 3); anywhere else pick right or down with equal probability.

    An attempt that does not finish within 10,000 steps is discarded and
    the episode is retried. Progress is printed per episode, the four
    statistics are printed at the end, and the list of returns is written
    to ``./resultB.json``.
    """
    env = Gridworld()
    env.gamma = 0.9
    episode = 0
    reward_list = []

    while episode < 10000:
        episode += 1
        print('Episode {}'.format(episode))
        step = 0
        totalReward = 0
        reached = False
        while step < 10000:
            step += 1
            if env.currentState in env.rightBounds:
                act = 2  # Move down
            elif env.currentState in env.upperBounds:
                act = 3  # Move right
            else:
                act = 3 if random.random() < 0.5 else 2

            _, reward, isEnd = env.step(act)
            totalReward += reward
            if isEnd:
                reached = True
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break
        # Reset after every attempt, including one that timed out, so a
        # retry starts a fresh episode rather than continuing mid-episode
        # with a zeroed reward total.
        env.reset()
        if not reached:
            episode -= 1
            continue
        reward_list.append(totalReward)
    print('finished')

    reward_array = np.array(reward_list)
    mean_reward = reward_array.mean()
    std_reward = reward_array.std()
    max_reward = reward_array.max()  # named so builtin max() is not shadowed
    min_reward = reward_array.min()  # named so builtin min() is not shadowed

    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean_reward, std_reward, max_reward, min_reward))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultB.json', 'w') as file:
        json.dump(reward_list, file)