Example #1
def buildTwoMDPs():
    n = 10
    rewards = np.zeros(4)
    probabilities = np.zeros(2)
    myMDPs = []

    rewards[1] = penalty_handicap = -10
    rewards[2] = penalty_collision = -100
    probabilities[0] = prob_avail_handicap = 0.9
    probabilities[1] = prob_T = 5.0  # probability temperature: lower -> mostly occupied, higher -> mostly available
    """ 1st set of parameter values for less cost of driving and high reward for closest parking spot"""
    myMDP1 = mdp.MDP(n)
    MDPname = "myMDP1.txt"
    rewards[0] = penalty_driving = -1
    rewards[3] = best_reward = 100
    myMDP1.make_MDP(MDPname, rewards, probabilities)
    myMDPs.append(myMDP1)
    """ 2nd set of parameter values for high cost of driving and less reward for closest parking spot"""
    myMDP2 = mdp.MDP(n)
    MDPname = "myMDP2.txt"
    rewards[0] = penalty_driving = -10
    rewards[3] = best_reward = 10
    myMDP2.make_MDP(MDPname, rewards, probabilities)
    myMDPs.append(myMDP2)

    return myMDPs
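
A minimal usage sketch for the builder above (illustrative only; it relies just on the two-element list the function returns):

parking_mdps = buildTwoMDPs()
# parking_mdps[0]: low driving penalty (-1) and high best-spot reward (100)
# parking_mdps[1]: high driving penalty (-10) and low best-spot reward (10)
print(len(parking_mdps))  # 2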
Example #2
def generate_random_MDP(X,
                        U,
                        B,
                        std=0,
                        random_state=np.random.RandomState(0),
                        R_mode=DEPEND_ONLY_ON_START_STATE):
    '''
    :param X: state size
    :param U: action size
    :param B: branching factor
    :param std: standard deviation of the reward noise
    :param random_state: numpy RandomState used for sampling
    :param R_mode: how the reward depends on the transition (default: depends only on the start state)
    '''
    P = np.zeros(shape=(U, X, X))
    R = np.zeros(shape=(U, X, X))
    R_std = std * np.ones(shape=(U, X, X))

    for x in range(X):
        for u in range(U):
            P[u, x], R[u, x] = get_random_sparse_vector(X, B, random_state)
        if R_mode == DEPEND_ONLY_ON_START_STATE:
            # make the reward depend only on the start state x: copy a single
            # value across every (action, next state) entry
            for u in range(U):
                R[u, x, :] = R[0, x, 0]

    mdp = MDP.MDP(P=P, R=R, R_std=R_std)
    return mdp
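
A minimal usage sketch for the generator above (illustrative only; it assumes generate_random_MDP and its helpers are importable from the module this example comes from):

import numpy as np

# Hypothetical call: 20 states, 4 actions, branching factor 3, noisy rewards.
# Per the construction above, the resulting P and R arrays have shape (U, X, X).
rng = np.random.RandomState(42)
random_mdp = generate_random_MDP(X=20, U=4, B=3, std=0.1, random_state=rng)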
Example #3
    def __init__(self,
                 initial,
                 nrows=8,
                 ncols=8,
                 robotmdp=MDP(),
                 targets_path=[[]],
                 obstacles=[],
                 size=16,
                 task_type='sequential'):
        super(GridworldGui, self).__init__(initial, nrows, ncols, robotmdp,
                                           targets_path, obstacles, task_type)
        # compute the appropriate height and width (with room for cell borders)

        self.height = nrows * size + nrows + 1
        self.width = ncols * size + ncols + 1
        self.size = size

        # initialize pygame (SDL extensions)
        pygame.init()
        pygame.display.set_mode((self.width, self.height))
        pygame.display.set_caption('Gridworld')
        self.screen = pygame.display.get_surface()
        self.surface = pygame.Surface(self.screen.get_size())
        self.bg = pygame.Surface(self.screen.get_size())
        self.bg_rendered = False  # optimize background render

        self.background()
        self.screen.blit(self.surface, (0, 0))
        pygame.display.flip()

        self.build_templates()
        self.updategui = True  # set to False to stop updating the GUI when collecting a trace quickly

        self.current = self.mdp.init  # at the start, the current state is the initial state
        self.state2circle(self.current)
Example #4
    def __init__(self,
                 network_file='network_topology',
                 initial=0,
                 targets=[],
                 num_obstacles=0,
                 T=1,
                 task_type='sequential',
                 visualization=False,
                 decoys_set=[]):
        self.dg = nx.read_gml(network_file)

        edge_number = []
        dead_end = set()
        for index in range(self.dg.number_of_nodes()):
            if self.dg.out_degree(str(index)) == 0:
                dead_end.add(index)
            edge_number.append(len(list(self.dg.neighbors(str(index)))))
        self.action_number = max(edge_number)

        self.current = initial
        self.nstates = self.dg.number_of_nodes()
        self.actlist = ["a" + str(e) for e in range(self.action_number)]
        self.T = T
        self.num_obstacles = num_obstacles
        self.task_type = task_type
        self.visualization = visualization
        self.decoys_set = decoys_set

        self.obstacles_combo = []
        self.obstacles = np.asarray([])
        self.sample_obstacles()

        self.horizon = self.nstates - 1
        self.target_index = 0
        self.time_index = 0
        self.configuration_index = 0
        self.p = 0.5

        self.targets = np.asarray(targets)
        if self.task_type == 'sequential':
            self.dead_end = dead_end.difference(
                [self.targets[self.target_index]])
            acc = np.full(self.horizon,
                          self.targets[self.target_index],
                          dtype=int)
        else:
            self.dead_end = dead_end.difference(self.targets)
            acc = np.ones(
                (self.horizon, self.targets.size), dtype=int) * self.targets

        self.mdp = MDP(initial,
                       self.actlist,
                       range(self.nstates + 1),
                       acc=acc,
                       obstacles=np.ones(
                           (self.horizon, self.obstacles.size), dtype=int) *
                       self.obstacles,
                       horizon=self.horizon)

        self.mdp.prob = self.getProbs()
Example #5
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):
        '''Model-based Reinforcement Learning with epsilon greedy 
        exploration.  This function should use value iteration,
        policy iteration or modified policy iteration to update the policy at each step

        Inputs:
        s0 -- initial state
        defaultT -- default transition function when a state-action pair has not been visited
        initialR -- initial estimate of the reward function
        nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
        nSteps -- # of steps per episode
        epsilon -- probability with which an action is chosen at random

        Outputs: 
        V -- final value function
        policy -- final policy
        '''

        model = MDP.MDP(defaultT, initialR, self.mdp.discount)
        V = np.zeros(model.nStates)
        policy = np.zeros(model.nStates, int)

        count_sa = np.zeros((model.nStates, model.nActions)).astype(float)
        count_sas = np.zeros(
            (model.nStates, model.nActions, model.nStates)).astype(float)

        c_reward = np.zeros(nEpisodes)
        for i in range(nEpisodes):
            state = s0
            for j in range(nSteps):
                action = policy[state]
                if random.uniform(0, 1) < epsilon:
                    action = random.randint(0, model.nActions - 1)

                reward, nextState = self.sampleRewardAndNextState(
                    state, action)
                c_reward[i] += reward * (model.discount**j)

                count_sa[state, action] += 1.0
                count_sas[state, action, nextState] += 1.0

                model.T[action,
                        state, :] = np.divide(count_sas[state, action, :],
                                              count_sa[state, action])
                model.R[action,
                        state] = (reward + (count_sa[state, action] - 1.0) *
                                  model.R[action, state]) / count_sa[state,
                                                                     action]
                policy, V, _ = model.policyIteration(policy)

                state = nextState

        return [V, policy, c_reward]
Example #6
def find_schedule(M, LV, GV, N, delta, due_dates, release_dates, ALPHA, GAMMA,
                  EPSILON, EPOCHS, METHOD, STACT):

    # Generate heuristics for Q_learning rewards
    heur_job = heuristic_best_job(delta, LV, GV, N)
    heur_res = heuristic_best_resource(heur_job)
    heur_order = heuristic_order(delta, LV, GV, N)

    if STACT == "st_act":  # st_act for state-action pairs, act for only actions
        policy_init = np.zeros([2**N, N + 1])  # states, actions
    if STACT == "act":  # st_act for state-action pairs, act for only actions
        policy_init = np.zeros([N + 1])  # actions

    RL = MDP(LV, GV, N, policy_init, due_dates,
             release_dates)  # initialize MDP
    r_best = 99999
    best_schedule = []
    best_policy = np.zeros([LV, N + 1])
    epoch_best_found = 0
    timer_start = time.time()
    for epoch in range(EPOCHS):
        # if epoch%100==0:
        #     print(epoch)

        DONE = False
        z = 0
        RL.reset(due_dates, release_dates, LV, GV, N)

        # take timesteps until processing of all jobs is finished
        while not DONE:
            RL, DONE = RL.step(z, GV, N, METHOD, delta, ALPHA, GAMMA, EPSILON,
                               STACT, heur_job, heur_res, heur_order)
            z += 1

        schedule = RL.schedule.objectives()
        r = schedule.Cmax
        if r < r_best:
            r_best = r
            best_schedule = schedule
            epoch_best_found = epoch

            for i in range(len(RL.resources)):
                best_policy[i] = RL.resources[i].policy

            if METHOD == "JEPS":
                resources = RL.resources
                states = RL.states
                actions = RL.actions

                for i in range(len(resources)):
                    resource = update_policy_JEPS(resources[i], states,
                                                  actions, r_best, z, GAMMA,
                                                  STACT)
                    RL.resources[i] = resource

    timer_finish = time.time()
    calc_time = timer_finish - timer_start
    return r_best, best_schedule, best_policy, epoch_best_found, calc_time, RL
Example #7
    def calc_policy(self, main_index, other_policy=None):
        if other_policy is None:
            other_policy = np.ones((self.mdp_a[1 - main_index], self.s))
            other_policy /= np.sum(other_policy, axis=0)
        t, r = self.get_tr_with_others_policy(main_index, other_policy)

        mdp = MDP.MDP(t.shape[1], t.shape[0], self.d)
        mdp.t = t
        mdp.r = r
        return self.solver.get_greedy_policy(mdp)
Example #8
def evaluate_TD():
    print("Creating MDP.")
    mdp = MDP.MDP(10, 3)
    print("Running TD State-Value Estimation.")
    ts = []
    for i in range(1, 20):
        print(str(i) + "...")
        v = Temporal_difference.estimate(mdp, 0, 0.01, 30, 100, lambda x: 1)
        print("\t" + str(v[0]))
        ts.append(v[0])
    plot(ts)
Example #9
def test():
    '''Create the MDP, then run an episode of random actions for 10 steps.'''
    rubiks_MDP = MDP.MDP()
    rubiks_MDP.register_start_state("wwoobbggrrrryyyyoowwggbb")
    rubiks_MDP.register_actions(Test_Rubiks.ACTIONS)
    rubiks_MDP.register_operators(Test_Rubiks.OPERATORS)
    #rubiks_MDP.generateAllStates()
    #print("Total number of generated states: " + str(len(rubiks_MDP.known_states)))
    rubiks_MDP.register_transition_function(Test_Rubiks.T)
    rubiks_MDP.register_reward_function(Test_Rubiks.R)
    #rubiks_MDP.random_episode(1000)
    rubiks_MDP.QLearning(0.98, 1, 0.1)
Example #10
    def __init__(self):

        self.MDP = Diagram.MDP()

        self.lambdaQ: float = 0.99

        self.alpha: float = 0.1

        self.threshold: float = 0.001

        self.change_rate: float = 0.99

        self.q_value_dict = self.MDP.get_States()
Example #11
def evaluate_policy():
    print("Creating MDP.")
    mdp = MDP.MDP(10, 3)
    print("Running Monte Carlo State-Value Estimation.")
    TS = []

    for i in range(5, 8):
        print(str(i) + "...")
        v0 = Monte_Carlo.first_visit_eval(mdp, 0, 0.01, 10, i * 10,
                                          lambda x: 1)[0]
        print(" " + str(v0))
        TS.append(v0[0])
    plot(TS)
Example #12
def test():
    cube_MDP = MDP.MDP()
    cube_MDP.register_start_state(createInitialState())
    cube_MDP.register_actions(ACTIONS)
    cube_MDP.register_operators(OPERATORS)
    cube_MDP.register_transition_function(T)
    cube_MDP.register_reward_function(R)
    cube_MDP.register_describe_state(describeState)
    cube_MDP.register_goal_test(goalTest)
    cube_MDP.register_action_to_op(ACTION_TO_OP)
    cube_MDP.generateAllStates()
    cube_MDP.QLearning(0.8, 1000, 0.2)
    displayOptimalPolicy(cube_MDP)
Example #13
def test():
    '''Create the MDP, then run an episode of random actions for 10 steps.'''
    grid_MDP = MDP.MDP()
    grid_MDP.register_start_state((0, 0))
    grid_MDP.register_actions(ACTIONS)
    grid_MDP.register_operators(OPERATORS)
    grid_MDP.register_transition_function(T)
    grid_MDP.register_reward_function(R)
    grid_MDP.random_episode(100)
    grid_MDP.generateAllStates()
    grid_MDP.ValueIterations(0.9, 100)
    grid_print(grid_MDP.V)
    grid_MDP.QLearning(0.9, 1000, 0.05)
    QPrinter(grid_MDP.QValues)
    policyPrint(grid_MDP)
Example #14
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):

        cum_rewards = np.zeros((nEpisodes))

        cumActProb = np.cumsum(np.ones(self.mdp.nActions) / self.mdp.nActions)
        freq = np.zeros(
            [self.mdp.nActions, self.mdp.nStates, self.mdp.nStates])
        T = defaultT
        R = initialR
        model = MDP.MDP(T, R, self.mdp.discount)
        [policy, V, _] = model.policyIteration(np.zeros(model.nStates, int))
        for episId in xrange(nEpisodes):
            state = s0
            for iterId in xrange(nSteps):

                # choose action
                if epsilon > np.random.rand(1):
                    action = np.where(cumActProb >= np.random.rand(1))[0][0]
                else:
                    action = policy[state]

                # sample reward and next state
                [reward,
                 nextState] = self.sampleRewardAndNextState(state, action)
                cum_rewards[episId] += (self.mdp.discount**iterId) * reward

                # update counts
                freq[action, state, nextState] += 1
                asFreq = freq[action, state, :].sum()

                # update transition
                T[action, state, :] = freq[action, state, :] / asFreq

                # update reward
                R[action,
                  state] = (reward + (asFreq - 1) * R[action, state]) / asFreq

                # update policy
                [policy, V, _] = model.policyIteration(policy)

                state = nextState
        return [V, policy, cum_rewards]
Example #15
def generate_investment_sim(p_noise=0, **kwargs):
    P = np.array([[[1 - p_noise, p_noise], [1 - p_noise, p_noise]],
                  [[p_noise, 1 - p_noise], [p_noise, 1 - p_noise]]])
    R_state_1 = kwargs.get("R_state_1", 2)
    R1_std = kwargs.get("R1_std", math.sqrt(2))
    R1D = np.array([1, R_state_1])
    R1D_std = np.array([0, R1_std])
    R = OneDVec2ThreeDVec(R1D, U=2)
    R_std = OneDVec2ThreeDVec(R1D_std, U=2)

    sparse_flag = kwargs.get("sparse_flag", False)
    if sparse_flag:
        # a sparse construction is not provided in this example
        raise NotImplementedError("sparse_flag=True is not supported here")
    mdp = MDP.MDP(P=P, R=R, R_std=R_std)
    return mdp
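
A minimal usage sketch for the investment-chain generator above (keyword names taken from the snippet; leaving sparse_flag at its default avoids the unimplemented sparse branch):

# Two-state, two-action chain with 5% transition noise and a reward of 3
# (std 1.0) in state 1, as parameterized through the kwargs above.
investment_mdp = generate_investment_sim(p_noise=0.05, R_state_1=3, R1_std=1.0)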
Example #16
def test():
    '''Create the MDP, then run an episode of random actions for 10 steps.'''
    grid_MDP = MDP.MDP()
    grid_MDP.register_start_state((0, 0))
    grid_MDP.register_actions(ACTIONS)
    grid_MDP.register_operators(OPERATORS)
    grid_MDP.register_transition_function(T)
    grid_MDP.register_reward_function(R)
    #grid_MDP.random_episode(100)
    grid_MDP.generateAllStates()
    grid_MDP.valueIterations(0.9, 10)
    displayV(grid_MDP.V)
    grid_MDP.QLearning(0.9, 20000, 0.05)
    displayQ(grid_MDP.QValues)
    grid_MDP.extractPolicy(grid_MDP.QValues)
    displayOptimalPolicy(grid_MDP.optPolicy)
Example #17
def test():
    '''Create the MDP, then run an episode of random actions for 10 steps.'''
    global grid_MDP
    grid_MDP = MDP.MDP()
    grid_MDP.register_start_state((0, 0))
    grid_MDP.register_actions(ACTIONS)
    grid_MDP.register_operators(OPERATORS)
    grid_MDP.register_transition_function(T)
    grid_MDP.register_reward_function(R)
    grid_MDP.random_episode(100)
    grid_MDP.generateAllStates()
    grid_MDP.ValueIterations(0.9, 100)
    draw_grid_with_V_values(grid_MDP.V, 3, 4)
    grid_MDP.QLearning(0.8, 50, 0.8)
    draw_grid_with_Q_values(grid_MDP.QValues, 3, 4)
    grid_MDP.extractPolicy()
    extractPolicy(grid_MDP.optPolicy)
    return grid_MDP
Example #18
def test():
    cube_MDP = MDP.MDP()
    cube_MDP.register_start_state(createInitialState())
    cube_MDP.register_actions(ACTIONS)
    cube_MDP.register_operators(OPERATORS)
    cube_MDP.register_transition_function(T)
    cube_MDP.register_reward_function(R)
    cube_MDP.register_describe_state(describeState)
    cube_MDP.register_goal_test(goalTest)
    cube_MDP.register_action_to_op(ACTION_TO_OP)
    cube_MDP.generateAllStates()
    cube_MDP.random_episode(10)
    cube_MDP.QLearning(0.8, 1000, 0.2)
    displayOptimalPolicy(cube_MDP)


# DO NOT USE Q LEARNING. IT WILL TAKE FOREVER TO GENERATE ALL POSSIBLE STATES. NOT WORTH YOUR TIME.
#test()
Example #19
    def BuildPlanner(self, Validate=True):
        """
        Build the planner using the object's map, agent, and path details.

        Args:
            Validate (bool): Check that the objects have all the information to run.
        """
        if Validate:
            # Check that map has all it needs
            if not self.Map.Validate():
                print("ERROR: Map failed to validate. PLANNER-001")
                return None
            # Check that agent's cost and reward dimensions match map.
            if self.Agent.CostDimensions != len(np.unique(
                    self.Map.StateTypes)):
                print(
                    "ERROR: Agent's cost dimensions do not match map object. PLANNER-002"
                )
                return None
            if self.Agent.RewardDimensions != len(set(
                    self.Map.ObjectLocations)):
                print(
                    "ERROR: Agent's reward dimensions do not match map object. PLANNER-003"
                )
                return None
        # Create main MDP object.
        # This assumes that the Map object has a dead exit state.
        # Map's Validate checks this.
        self.MDP = MDP.MDP(self.Map.S + [max(self.Map.S) + 1], self.Map.A,
                           self.Map.T, self.BuildCostFunction(), self.gamma,
                           self.Agent.actionTau)
        self.CriticalStates = [self.Map.StartingPoint]
        self.CriticalStates.extend(self.Map.ObjectLocations)
        self.CriticalStates.extend([self.Map.ExitState])
        # build the costmatrix and store the policies
        [Policies, CostMatrix, DistanceMatrix] = self.Plan(Validate)
        self.Policies = Policies
        self.CostMatrix = CostMatrix
        self.DistanceMatrix = DistanceMatrix
        self.Utilities = None
        self.goalindices = None
Example #20
def test():
    '''Create the MDP, then run an episode of random actions for 10 steps.'''

    # USING RULES FOR 3x3x3
    rubik_MDP = MDP.MDP()
    rubik_MDP.register_start_state(Cubes.START_STATE)
    rubik_MDP.register_actions(Cubes.ActionOps)
    rubik_MDP.register_operators(Cubes.OPERATORS)
    rubik_MDP.register_transition_function(Cubes.T)
    rubik_MDP.register_reward_function(Cubes.threesReward)
    rubik_MDP.register_goal_state(Cubes.GOAL_STATE_THREE)
    rubik_MDP.register_features([
        Cubes.one_side, Cubes.level1complete, Cubes.crosses_complete,
        Cubes.corners_complete
    ])
    rubik_MDP.register_weights([7, 12, 5, 5])
    # grid_MDP.random_episode(100)
    rubik_MDP.generateAllStates()
    # Uncomment the following, when you are ready...
    print("=== Q LEARNING ===")
    rubik_MDP.QLearning(0.3, 2, 0.6)
Example #21
def test():
    '''Create the MDP, then run an episode of random actions for 10 steps.'''
    grid_MDP = MDP.MDP()
    grid_MDP.register_start_state((0, 0))
    grid_MDP.register_actions(Grid.ACTIONS)
    grid_MDP.register_operators(Grid.OPERATORS)
    grid_MDP.register_transition_function(Grid.T)
    grid_MDP.register_reward_function(Grid.R)
    #grid_MDP.random_episode(100)
    grid_MDP.generateAllStates()

    # Uncomment the following, when you are ready...

    grid_MDP.valueIteration(0.9, 6)
    print(GW_Values_string(grid_MDP.V))

    grid_MDP.QLearning(0.1, 2, 0.1)
    print(GW_QValues_string(grid_MDP.Q))

    grid_MDP.extractPolicy()
    print(GW_Policy_string(grid_MDP.optPolicy))
Example #22
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):
        '''Model-based Reinforcement Learning with epsilon greedy 
        exploration.  This function should use value iteration,
        policy iteration or modified policy iteration to update the policy at each step

        Inputs:
        s0 -- initial state
        defaultT -- default transition function when a state-action pair has not been visited
        initialR -- initial estimate of the reward function
        nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
        nSteps -- # of steps per episode
        epsilon -- probability with which an action is chosen at random

        Outputs: 
        V -- final value function
        policy -- final policy
        '''

        # temporary values to ensure that the code compiles until this
        # function is coded
        gamma = self.mdp.discount
        model_mdp = MDP.MDP(defaultT, initialR, gamma)
        nActions = model_mdp.nActions
        nStates = model_mdp.nStates
        V = np.zeros(nStates)
        policy = np.zeros(nStates, int)
        policy, V, _, _ = model_mdp.modifiedPolicyIteration(policy,
                                                            V,
                                                            nIterations=1000)
        n_sa = np.zeros((nStates, nActions))
        n_sa_s_next = np.zeros((nStates, nActions, nStates))
        cumRewards = np.zeros(nEpisodes)
        for epoch in range(nEpisodes):
            s = s0
            a = None
            for t in range(nSteps):
                if np.random.rand() > epsilon:
                    # a = np.argmax(policy)
                    a = policy[s]
                else:
                    a = np.random.choice(nActions)

                n_sa[s, a] += 1
                r, s_next = self.sampleRewardAndNextState(s, a)
                cumRewards[epoch] += r * (gamma**t)
                n_sa_s_next[s, a, s_next] += 1
                model_mdp.T[a, s, :] = n_sa_s_next[s, a, :] / n_sa[s, a]
                model_mdp.R[
                    a, s] = (r +
                             (n_sa[s, a] - 1) * model_mdp.R[a, s]) / n_sa[s, a]
                policy, V, _, _ = model_mdp.modifiedPolicyIteration(
                    policy, V, nIterations=1000)
                s = s_next

        # print('transition function = {}'.format(model_mdp.T))
        # print('reward function = {}'.format(model_mdp.R))
        return [V, policy, cumRewards]
Example #23
import numpy as np
import MDP
import RL
''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
T = np.array([[[0.5, 0.5, 0, 0], [0, 1, 0, 0], [0.5, 0.5, 0, 0], [0, 1, 0, 0]],
              [[1, 0, 0, 0], [0.5, 0, 0, 0.5], [0.5, 0, 0.5, 0],
               [0, 0, 0.5, 0.5]]])
R = np.array([[0, 0, 10, 10], [0, 0, 10, 10]])
discount = 0.9
mdp = MDP.MDP(T, R, discount)
rlProblem = RL.RL(mdp, np.random.normal)

# Test Q-learning
[Q,
 policy] = rlProblem.qLearning(s0=0,
                               initialQ=np.zeros([mdp.nActions, mdp.nStates]),
                               nEpisodes=1000,
                               nSteps=100,
                               epsilon=0.3)
print("\nQ-learning results")
print(Q)
print(policy)

# import numpy as np
# import MDP
# import RL
#
#
# ''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
# T = np.array([[[0.5,0.5,0,0],[0,1,0,0],[0.5,0.5,0,0],[0,1,0,0]],[[1,0,0,0],[0.5,0,0,0.5],[0.5,0,0.5,0],[0,0,0.5,0.5]]])
# R = np.array([[0,0,10,10],[0,0,10,10]])
Example #24
    def __init__(self):

        self.MDP = Diagram.MDP()
        # Learning Rate
        self.alpha = .1
Example #25
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):
        '''Model-based Reinforcement Learning with epsilon greedy 
        exploration

        Inputs:
        s0 -- initial state
        defaultT -- default transition function when a state-action pair has not been visited
        initialR -- initial estimate of the reward function
        nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
        nSteps -- # of steps per episode
        epsilon -- probability with which an action is chosen at random

        Outputs: 
        V -- final value function
        policy -- final policy
        '''

        cum_rewards = np.zeros((nEpisodes))

        cumActProb = np.cumsum(np.ones(self.mdp.nActions) / self.mdp.nActions)
        freq = np.zeros(
            [self.mdp.nActions, self.mdp.nStates, self.mdp.nStates])
        T = defaultT
        R = initialR
        model = MDP.MDP(T, R, self.mdp.discount)
        [policy, V, _] = model.policyIteration(np.zeros(model.nStates, int))
        for episId in xrange(nEpisodes):
            state = s0
            for iterId in xrange(nSteps):

                # choose action
                if epsilon > np.random.rand(1):
                    action = np.where(cumActProb >= np.random.rand(1))[0][0]
                else:
                    action = policy[state]

                # sample reward and next state
                [reward,
                 nextState] = self.sampleRewardAndNextState(state, action)
                cum_rewards[episId] += (self.mdp.discount**iterId) * reward

                # update counts
                freq[action, state, nextState] += 1
                asFreq = freq[action, state, :].sum()

                # update transition
                T[action, state, :] = freq[action, state, :] / asFreq

                # update reward
                R[action,
                  state] = (reward + (asFreq - 1) * R[action, state]) / asFreq

                # update policy
                [policy, V, _] = model.policyIteration(policy)

                state = nextState
        return [V, policy, cum_rewards]
Example #26
    def play_game(self):
        '''
        Simulate an actual game till the agent loses.
        
        '''
        # Create a MDP to track state transitions
        mdp = MDP.MDP()

        curr_state = mdp.discretize_state()
        last_state = mdp.discretize_state()
        last_action = 1
        # self.draw_gui(curr_state, last_state)
        reward = 0
        special_state_key = -1
        # Call simulate_one_time_step in a loop until the game fails (the ball passes the paddle)
        counter = 0
        last_reward = 0
        curr_reward = mdp.read_curr_reward()
        while 1:

            last_reward = curr_reward

            # Select action and simulate time step
            action_selected = self.f_function(curr_state)
            # print "Action selected is " + self.action_strs[action_selected]
            mdp.simulate_one_time_step(action_selected)
            curr_state = mdp.discretize_state()
            #if curr_state[0] != last_state[0] or curr_state[1] != last_state[1]:
            #    self.draw_gui(curr_state, last_state)
            # print "the current positioin: ", curr_state[0], curr_state[1]
            # Update Q Table
            reward = mdp.read_curr_reward()
            if reward == 1:
                counter = counter + 1
                if counter > self.best_score:
                    self.best_score = counter
                    print "best", self.best_score

            if mdp.is_in_special_state():
                err = self.alpha_value * (reward + self.QTable[-1] -
                                          self.QTable[last_state +
                                                      (last_action, )])
                self.QTable[last_state + (
                    last_action, )] = self.QTable[last_state +
                                                  (last_action, )] + err
                # print "errrrrrrrrenddddddddd"
            else:
                err = self.alpha_value * (reward +
                                          self.QTable[curr_state +
                                                      (action_selected, )] -
                                          self.QTable[last_state +
                                                      (last_action, )])
                self.QTable[last_state + (
                    last_action, )] = self.QTable[last_state +
                                                  (last_action, )] + err

            if mdp.is_in_special_state():
                break

            last_state = curr_state
            last_action = action_selected
            curr_reward = mdp.read_curr_reward()
        pass
Example #27
"""
Created on Sun Oct  14 21:09:06 2018

@author: Victor Zuanazzi
"""

import random
from MDP import *
import matplotlib.pyplot as plt
import numpy as np

if __name__ == '__main__':
    mdp = MDP()
    mdp.print_MDP()
Example #28
# This file is responsible for handling all user input
# The Graphical interface displays a 2-D view of the
# Rubik's cube as well as displaying the max Q-Value at
# each state in a comprehensive state map.

#######################################################
## How to use:                                       ##
## 1)
## 2)
## 3)
#######################################################
from tkinter import *
import random
import MDP, Test_Rubiks

rubiks_MDP = MDP.MDP()

master = Tk()

# Canvas to represent 2x2 rubik's cube
w = Canvas(master, width=1000, height=460)
w.pack()

# Creates box where max Q-value to state will be plotted
w.create_line(600, 34, 600, 415, fill="#000000", width=3)
w.create_line(987, 34, 987, 415, fill="#000000", width=3)
w.create_line(600, 34, 987, 34, fill="#000000", width=3)
w.create_line(600, 415, 987, 415, fill="#000000", width=3)

# Label for Q-Value Map
w.create_text(790, 15, font=("Purisa", 16), text="Q-Values")
Example #29
def generate_clean_2d_maze(x_size=4,
                           y_size=3,
                           reward_coord=(3, 2),
                           start_method="random"):
    actions = {
        0: "increase_y",
        1: "increase_x",
        2: "decrease_y",
        3: "decrease_x"
    }
    coords2states = {}
    states2coords = {}
    states_cnt = 0
    for x in range(x_size):
        for y in range(y_size):
            coords2states[(x, y)] = states_cnt
            states2coords[states_cnt] = (x, y)
            states_cnt += 1

    X = len(coords2states)
    U = len(actions)

    P = np.zeros(shape=(U, X, X))
    r = np.zeros(shape=(U, X, X))
    # Based on the current cell and the chosen action, these loops determine the next state
    for x_origin in range(x_size):
        for y_origin in range(y_size):
            for u, u_str in actions.items():
                x_target = x_origin
                y_target = y_origin
                # doing the dynamics
                if u_str == "increase_y" and y_origin < y_size - 1:
                    y_target = y_origin + 1
                if u_str == "increase_x" and x_origin < x_size - 1:
                    x_target = x_origin + 1
                if u_str == "decrease_y" and y_origin > 0:
                    y_target = y_origin - 1
                if u_str == "decrease_x" and x_origin > 0:
                    x_target = x_origin - 1

                state_origin = coords2states[(x_origin, y_origin)]
                state_target = coords2states[(x_target, y_target)]

                P[u, state_origin, state_target] = 1.0
                if (x_origin, y_origin) == reward_coord:
                    r[u, state_origin, state_target] = 1.0
    # If we reached the reward, we randomly go to other places in the grid.
    for u in range(U):
        for y in range(X):
            P[u, coords2states[reward_coord], y] = 1 / X
    r_std = np.zeros_like(r)
    mdp = MDP.MDP(P,
                  r,
                  r_std,
                  info={
                      "coords2states": coords2states,
                      "states2coords": states2coords,
                      "actions": actions
                  })
    return mdp
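
A minimal usage sketch for the maze generator above (illustrative only; argument names come straight from the snippet's signature):

# 4 x 3 grid -> 12 states and 4 move actions; taking any action from cell (3, 2)
# pays reward 1 and sends the agent to a uniformly random cell, per the
# construction above.
maze = generate_clean_2d_maze(x_size=4, y_size=3, reward_coord=(3, 2))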
Example #30
    if question_number == 2:
        # Creating a new agent
        new_agent = Agent(beta, gamma)
        # Display the simple "Up Policy"
        new_agent.simpleUpPolicyDisplay(nb_iterations)
    elif question_number == 3:
        # Creating a new agent
        new_agent = Agent(beta, gamma)
        # Creating a simple policy of going down:
        policy = new_agent.getSimplePolicy(DOWN)
        print("Displaying J{} for the simple policy of"
              " always going down:".format(nb_iterations))
        # Display the Jn for the {nb_iterations}th first iterations
        new_agent.computeJ(policy, nb_iterations)
    elif question_number == 4:
        new_mdp = MDP(beta, gamma)
        # Computing Q
        real_Q = new_mdp.getQ(nb_iterations)
        # Getting optimal policy
        opt_policy = new_mdp.getPolicyFromQ(real_Q)
        print("Printing J{} using the"
              " optimal policy:".format(format(nb_iterations)))
        new_agent = Agent(beta, gamma)
        # Corresponding JN
        J_opt = new_agent.computeJ(opt_policy, nb_iterations)

    elif question_number == 5:
        new_mdp_est = MDP(beta, gamma)
        new_mdp = MDP(beta, gamma)
        # Creating random history
        history = new_mdp_est.createHistory(t)