Example #1
    def test_create_new_mdp_init_num_states_populates_state_list(self):

        mdp = MDP(5)

        self.assertIn(mdp.get_state(0), mdp.get_state_list())
        self.assertIn(mdp.get_state(2), mdp.get_state_list())
        self.assertIn(mdp.get_state(4), mdp.get_state_list())
Example #2
from copy import copy  # needed for copy(self.MDP.A) in initVs


class BellmanDPSolver(object):
    def __init__(self, discountRate=0.99):
        self.MDP = MDP()
        self._gamma = discountRate
        self.initVs()

    def initVs(self):
        self.v = dict()
        self.pi = dict()
        for s in self.MDP.S:
            self.v[s] = 0
            self.pi[s] = copy(self.MDP.A)

    def BellmanUpdate(self):
        for s in self.MDP.S:
            max_val = float("-inf")  # handles states whose action values are all negative
            opt_act = []
            for a in self.MDP.A:
                probs = self.MDP.probNextStates(s, a)
                val = 0
                for s_prime, p in probs.items():
                    r = self.MDP.getRewards(s, a, s_prime)
                    val += p * (r + self._gamma * self.v[s_prime])
                if val > max_val:
                    max_val = val
                    opt_act = [a]
                elif val == max_val:
                    opt_act.append(a)
            self.v[s] = max_val
            self.pi[s] = opt_act
        return self.v, self.pi
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.Actions = self.MDP.A
        self.state_values = self.initVs()
        self.policy = {}
        self.gamma = discountRate
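Most of the Bellman solvers in these examples assume an MDP class that exposes the state list S, the action list A, and the methods probNextStates(state, action) and getRewards(state, action, next_state). The original class (a 5x5 dribble/shoot gridworld in several snippets) is not shown on this page, so the stub below is only a placeholder with made-up dynamics and rewards; it exists so that snippets such as Example #2 can be run end to end.

# Minimal placeholder MDP (an assumption, not the original class from these
# repositories): it only exposes the interface the solvers above rely on.
class MDP(object):
    def __init__(self):
        self.S = [(x, y) for x in range(5) for y in range(5)] + ["GOAL", "OUT"]
        self.A = ["DRIBBLE_UP", "DRIBBLE_DOWN", "DRIBBLE_LEFT",
                  "DRIBBLE_RIGHT", "SHOOT"]
        self._moves = {"DRIBBLE_UP": (0, -1), "DRIBBLE_DOWN": (0, 1),
                       "DRIBBLE_LEFT": (-1, 0), "DRIBBLE_RIGHT": (1, 0)}

    def probNextStates(self, state, action):
        # Returns {next_state: probability}. Placeholder dynamics: terminal
        # states are absorbing, SHOOT scores or misses with equal probability,
        # dribbling moves deterministically and bounces off the walls.
        if state in ("GOAL", "OUT"):
            return {state: 1.0}
        if action == "SHOOT":
            return {"GOAL": 0.5, "OUT": 0.5}
        dx, dy = self._moves[action]
        x, y = state[0] + dx, state[1] + dy
        next_state = (x, y) if 0 <= x < 5 and 0 <= y < 5 else state
        return {next_state: 1.0}

    def getRewards(self, state, action, next_state):
        # Placeholder reward: +1 for scoring, -1 for shooting out, else 0.
        if state in ("GOAL", "OUT"):
            return 0.0
        if next_state == "GOAL":
            return 1.0
        if next_state == "OUT":
            return -1.0
        return 0.0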
Example #4
def getRandomPolicyValue():
    values = [0.0 for _ in range(10)]
    num = 1000000
    echoEpoch = 10000

    mdp = MDP()

    for k in range(1, num + 1):  # run exactly num episodes per start state (matches the division below)
        for initState in range(1, 6):
            state = initState
            isTerminal = False
            gamma = 1.0
            value = 0.0

            while not isTerminal:
                action = mdp.randomAction()
                isTerminal, state, reward = mdp.transform(state, action)
                value += gamma * reward
                gamma *= mdp.gamma

            values[initState] += value

        if k % echoEpoch == 0:
            print('k = %d, Average values of state 1-5 are:\n' % k,
                  [value / k for value in values[1:6]])

    for i in range(len(values)):
        values[i] /= num

    return values
class BellmanDPSolver(object):
    def __init__(self, discountRate=1):
        self.MDP = MDP()
        self.gamma = discountRate
        self.initVs()

    def initVs(self):
        self.state_values = {pair: 0 for pair in self.MDP.S}

    def BellmanUpdate(self):
        prev_version = self.state_values.copy()
        for state in self.MDP.S:
            total_val = dict()
            for action in self.MDP.A:
                sub_total = 0
                for next_state, prob in self.MDP.probNextStates(state, action).items():
                    sub_total += prob*(self.MDP.getRewards(state, action, next_state)
                                       + self.gamma * prev_version.get(next_state))
                total_val[action] = sub_total
            self.state_values[state] = max(total_val.values())
        return self.state_values, self.compute_greedy_policy()

    def compute_greedy_policy(self):
        policy = dict()
        for state in self.MDP.S:
            q_sa = dict()
            for action in self.MDP.A:
                q_sa[action] = sum(prob*(self.MDP.getRewards(state, action, next_state) +
                                         self.gamma * self.state_values[next_state])
                                   for next_state, prob in self.MDP.probNextStates(state, action).items())
            policy[state] = [action for action in self.MDP.A if q_sa[action] == max(q_sa.values())]
        return policy
Example #6
class BellmanDPSolver(object):
	def __init__(self, discountRate):
		self.MDP = MDP()
		self.discountRate = discountRate
		self.initVs()

	def initVs(self):
		self.values = {}
		self.policy = {}
		for state in self.MDP.S:
			self.values[state] = 0

	def BellmanUpdate(self):
		for state in self.MDP.S:
			self.policy[state] = []
			values_all = []
			for action in self.MDP.A:
				s_r_sum = 0
				prob_next_states = self.MDP.probNextStates(state,action)
				for state_2 in prob_next_states.keys():
					s_r_sum = s_r_sum + prob_next_states[state_2] * (self.MDP.getRewards(state,action,state_2)+self.discountRate*self.values[state_2])
				values_all.append(s_r_sum)
				
			self.values[state] = max(values_all)
			for i in range(len(values_all)):
				if values_all[i] == self.values[state]:
					self.policy[state].append(self.MDP.A[i])

		return (self.values,self.policy)

Example #7
class BellmanDPSolver(object):
	def __init__(self,discountRate):
		self.MDP = MDP()
		self.gamma = discountRate
		self.initVs()
	def initVs(self):
		self.values = {s: 0 for s in self.MDP.S}
		self.policy = {s: self.MDP.A for s in self.MDP.S}
	
	def BellmanUpdate(self):
		
		for s in self.MDP.S:
			best_v = -10**20
			best_a = []
			n_value = {a:0 for a in self.MDP.A}
			for a in self.MDP.A:
				
				for s_ in self.MDP.probNextStates(s,a).keys():
					n_value[a] += self.MDP.probNextStates(s,a)[s_] * (self.MDP.getRewards(s,a,s_) + self.gamma * self.values[s_])
				if n_value[a] > best_v:
					best_v = n_value[a]
			
			self.values[s] = best_v
			for a in self.MDP.A:
				if n_value[a] == best_v:
					best_a += [a]
			self.policy[s] = best_a
		return self.values, self.policy
class BellmanDPSolver(object):
    def __init__(self):
        self.MDP = MDP()
        self.initVs()

    def initVs(self):
        self.stateValueTable = {state: 0 for state in self.MDP.S}
        self.statePolicyTable = {state: self.MDP.A for state in self.MDP.S}

    def BellmanUpdate(self, discount_rate):
        for state in self.MDP.S:
            action_dict = {
                action: sum([
                    prob * (self.MDP.getRewards(state, action, nextState) +
                            discount_rate * self.stateValueTable[nextState])
                    for nextState, prob in self.MDP.probNextStates(
                        state, action).items()
                ])
                for action in self.MDP.A
            }

            self.stateValueTable[state] = max(action_dict.values())
            self.statePolicyTable[state] = [
                action for action, value in action_dict.items()
                if value == self.stateValueTable[state]
            ]
        return self.stateValueTable, self.statePolicyTable
Example #9
    def test_get_non_existing_state(self):
        """
        Test that you can't get a non-existent state
        """
        mdp = MDP(5)

        with self.assertRaises(IndexError):
            mdp.get_state(5)
Example #10
    def test_create_state_populates_state_list(self):
        mdp = MDP()

        mdp.add_state(0)
        mdp.add_state(2)

        self.assertIn(mdp.get_state(0), mdp.get_state_list())
        self.assertIn(mdp.get_state(2), mdp.get_state_list())
Example #11
    def test_create_new_mdp_no_initial_states(self):
        """
        I'm not sure what the create MDP method should actually do.
        """

        # there isn't very much we can tell about an mdp that is completely devoid
        # of states
        mdp = MDP()
        self.assertEqual(mdp.num_states(), 0)
Example #12
    def test_get_action_list(self):
        mdp = MDP()

        mdp.add_action(0)
        mdp.add_action(1)

        action_list = mdp.get_action_list()
        self.assertEqual(len(action_list), 2)
        self.assertIn(mdp.get_action(0), action_list)
    def initialise_mdp(self, blocks):
        start_config = [-1, -1, -1]
        startingState = State(0, blocks, start_config)
        self.initialise_lists()
        self.success_config[-1].append(startingState)
        label = len(self.mdp_list[-1])
        mdp = MDP(label, blocks)
        mdp.statelist.append(startingState)
        mdp.initMDP(startingState)
        self.mdp_list[-1].append(mdp)
Example #14
    def __init__(self, grid, goalVals, discount=.99, tau=.01, epsilon=.001):

        MDP.__init__(self, discount=discount, tau=tau, epsilon=epsilon)

        self.goalVals = goalVals
        self.grid = grid

        self.setGridWorld()
        self.valueIteration()
        self.extractPolicy()
Example #15
    def test_get_state_list(self):
        """
        It might be helpful to be able to get a list of all the states
        """

        mdp = MDP(5)
        state_list = mdp.get_state_list()

        self.assertEqual(len(state_list), 5)
        self.assertIn(mdp.get_state(0), state_list)
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.action = self.MDP.A
        self.discountRate = discountRate
        self.Values = {}
        self.Values['GOAL'] = 0
        self.Values['OUT'] = 0
        self.Policy = {}
        self.initVs()
	def __init__(self, grid, goalVals, discount=.99, tau=.01, epsilon=.001):

		MDP.__init__(self, discount=discount, tau=tau, epsilon=epsilon)

		self.goalVals = goalVals
		self.grid = grid

		self.setGridWorld()
		self.valueIteration()
		self.extractPolicy()
	def __init__(self, discountRate):
		self.MDP = MDP()
		self.dr = discountRate
		self.S = [(x,y) for x in range(5) for y in range(5)]
		self.S.append("GOAL")
		self.S.append("OUT")
		self.A = ["DRIBBLE_UP","DRIBBLE_DOWN","DRIBBLE_LEFT","DRIBBLE_RIGHT","SHOOT"]
		self.oppositions = [(2,2), (4,2)]
		self.goalProbs = [[0.00,0.00,0.0,0.00,0.00],[0.0, 0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0],[0.0,0.3,0.5,0.3,0.0],[0.0,0.8,0.9,0.8,0.0]]
		self.vs ={}
		self.act = {}
Example #19
    def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
        MDP.__init__(self, init, actlist=orientations, terminals=terminals, gamma=gamma)
        grid.reverse()  ## because we want row 0 on bottom, not on top
        self.grid = grid
        self.rows = len(grid)
        self.cols = len(grid[0])
        for x in range(self.cols):
            for y in range(self.rows):
                self.reward[x, y] = grid[y][x]  # each reward is from the grid
                if grid[y][x] is not None:
                    self.states.add((x, y))     # each state is a tuple of indices
Example #20
class BellmanDPSolver(object):
	def __init__(self, discountRate=0.9):
		self.MDP = MDP()
		self.discountRate = discountRate
		self.initVs()

	def initVs(self):		
		self.Vs = dict()
		self.policy = dict()
		for state in self.MDP.S:
			self.Vs[state] = 0
			self.policy[state] = self.MDP.A


	def action_return(self, state, action):		
		# for each next state:
		# get the state probability given current state and action
		# get the reward for the s, r, s' combination
		# sum the s, r, s' rewards by weighting them by their probability

		state_prob = self.MDP.probNextStates(state, action)

		expected_reward = 0
		for next_state in state_prob:
			prob = state_prob[next_state]
			reward = self.MDP.getRewards(state, action, next_state)
			expected_reward += prob * (reward + self.discountRate * self.Vs[next_state])

		return expected_reward


	def max_action_return(self, state):
		# finds actions with the highest expected reward
		# and returns the action and its expected reward

		max_return = None
		best_actions = []
		for action in self.MDP.A:
			# get expected return for the action
			a_return = self.action_return(state, action)
			if max_return is None or max_return < a_return:
				max_return = a_return
				best_actions = [action]
			elif max_return == a_return:
				best_actions.append(action)

		return best_actions, max_return


	def BellmanUpdate(self):		
		for state in self.MDP.S:
			self.policy[state], self.Vs[state] = self.max_action_return(state)

		return self.Vs, self.policy
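The comments in action_return and max_action_return above spell out the one-step lookahead. A driver loop is not part of the original snippet, but an illustrative sketch (assuming the BellmanDPSolver defined directly above and any compatible MDP class) would repeat BellmanUpdate sweeps until the value table stops changing:

# Illustrative driver sketch: sweep until the state values converge.
solver = BellmanDPSolver(discountRate=0.9)
for sweep in range(1000):
    old_values = dict(solver.Vs)              # snapshot before the sweep
    values, policy = solver.BellmanUpdate()   # one full Bellman sweep
    if max(abs(values[s] - old_values[s]) for s in values) < 1e-6:
        print("converged after", sweep + 1, "sweeps")
        break
print(policy)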
Example #21
    def test_create_new_mdp_initial_num_states(self):
        """
        Test initializing MDPs with an explicit number of states
        """

        mdp = MDP(5)
        self.assertEqual(mdp.num_states(), 5)

        # this MDP should have 5 states
        self.assertEqual(type(mdp.get_state(0)), State)
        self.assertEqual(type(mdp.get_state(2)), State)
        self.assertEqual(type(mdp.get_state(4)), State)
Example #22
def run_simulation(MDP, policy):
    print "Starting simulation for given MDP"

    while MDP.get_parked() == False:
        action = policy.choose_action(MDP.get_time())
        print "[TIME", MDP.get_time() ,"]:", policy.get_name(), "chose action", action
        MDP.take_action(action)
        print "[TIME", MDP.get_time() ,"]: Moved to state", MDP.get_state(), "Current reward %.3f." % MDP.get_reward()
    print "Exited in (spot, handicapped, available):", MDP.get_spot(), MDP.get_handicapped(), MDP.get_available()
Example #23
def evaluate_policies(policy, MDP):
    total_reward, handicapped, crashed = 0,0,0
    num_sims = 10000
    for i in range(num_sims):
        run_simulation(MDP, policy)
        #maybe do something fancier
        total_reward += MDP.get_reward()
        if MDP.get_handicapped():
            handicapped += 1
        if not MDP.get_available():
            crashed += 1
        MDP.reset()
    print policy.get_name(), total_reward / num_sims, handicapped, crashed
Example #24
def SARSA_lambda(process: MDP,
                 env: Environment,
                 lambda_: float = 0.7,
                 alpha: float = 0.01,
                 n_iter: int = 5000,
                 max_ep_len: int = 200):

    Q_value = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):

        epsilon = 1 / i

        # ---- Init Eligibility Trace ----
        E_t = np.zeros((process.nb_states, process.nb_actions))
        current_state = env.get_random_state()
        counter = 1
        ep_finished = False
        while not ep_finished:

            # ---- Update Policy ----
            policy = process.get_Q_policy(Q_value, epsilon)
            env.policy = policy

            # ---- MDP stepping ----
            current_action = env.generate_action(current_state)
            reward = env.generate_return(current_state, current_action)
            next_state = env.step(current_state, current_action)
            next_action = env.generate_action(next_state)

            # ---- Updating Eligibility Trace ----
            E_t[current_state.index, current_action.index] += 1

            # ---- Updating Q_value function ----
            error = reward + process.disc_fact * Q_value[next_state.index, next_action.index] - \
                Q_value[current_state.index, current_action.index]
            Q_value += alpha * error * E_t
            # decay traces after the update (textbook SARSA(lambda) ordering)
            E_t *= process.disc_fact * lambda_
            current_state = next_state
            current_action = next_action

            # ---- Stop Condition ----
            counter += 1
            if next_state.terminal:
                ep_finished = True
            if counter > max_ep_len:
                ep_finished = True

    return (process.get_Q_policy(Q_value))
Example #25
def init():
    """
    Asks for a Gridworld file, initializes an MDP environment and a Q-learning object with it, then calls the menu.
    """
    print_headline("Gridworld Selection")
    gridworld = read_gridworld_file()

    environment = MDP(state_list=gridworld,
                      field_rewards=Default.FIELD_REWARDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      transition_probabilities=Default.TRANSITION_PROBABILITIES)

    q_learning = QLearning(env_perform_action=environment.perform_action,
                           state_list=gridworld,
                           goal_fields=Default.GOAL_FIELDS,
                           obstacle_fields=Default.OBSTACLE_FIELDS,
                           actions=Default.ACTIONS,
                           discount_factor=Default.DISCOUNT_FACTOR,
                           learning_rate=Default.LEARNING_RATE,
                           epsilon=Default.EPSILON,
                           convergence_threshold=Default.CONVERGENCE_THRESHOLD)

    print("Your input Gridworld:")
    print_gridworld(gridworld)

    while show_menu(q_learning):
        pass

    print_headline("See you later")
Example #26
def test():
    mdp = MDP(0.5)

    vFunc = MonteCarlo(mdp, *mdp.randomWalkSamples(100))

    print('Monte Carlo:')
    for i in range(1, 6):
        print('%d: %f\t' % (i, vFunc[i]), end='')
    print()

    vFunc = temporalDifference(mdp, 0.15, *mdp.randomWalkSamples(100))

    print('Temporal Difference:')
    for i in range(1, 6):
        print('%d: %f\t' % (i, vFunc[i]), end='')
    print()
Example #27
    def test_create_state(self):
        """
        For implementation simplification I'm imagining creating all of the states separately and then
        connecting them afterwards by specifying the actions.
        """

        # States should be able to be identified by numbers or by strings, I suppose.
        # I don't imagine that strings will ever be used.
        mdp = MDP()
        mdp.add_state(0)
        mdp.add_state(1)
        mdp.add_state(2)
        mdp.add_state(3)
        mdp.add_state(4)
        mdp.add_state(5, terminal=True)
        self.assertEqual(mdp.num_states(), 6)
Example #28
def run_simulation(MDP, policy, horizon):
    print "Starting simulation for", MDP

    while MDP.get_time() > 0:
        action = policy.choose_action(MDP.get_time())
        print "[TIME", MDP.get_time(), "]:", policy.get_name(), "chose action", action
        MDP.take_action(action)
        print "[TIME", MDP.get_time(), "]: Moved to state", MDP.get_state(), "Current reward %.3f." % MDP.get_reward()
Example #29
    def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
        MDP.__init__(self,
                     init,
                     actlist=orientations,
                     terminals=terminals,
                     gamma=gamma)

        self.grid = grid
        self.rows = len(grid)
        self.cols = len(grid[0])

        # print(self.rows,self.cols)

        for x in range(self.cols):
            for y in range(self.rows):
                self.reward[y, x] = grid[y][x]
                if self.state_check((y, x)):
                    self.states.add((y, x))
Example #30
def main():
    if len(sys.argv) < 5:
        print >> sys.stderr, "Usage: Simulator.py\t<MDP.txt>\t<RandomPolicy|OptimalPolicy>\t<Epsilon>\t<discount>\t<alpha>\t<training>"
        #sys.exit(-1)
    else:
        filename = sys.argv[1]
        training = float(sys.argv[6])
        alpha = float(sys.argv[5])
        discount = float(sys.argv[4])
        epsilon = float(sys.argv[3])
        user_policy = sys.argv[2]


    # NOTE: these hard-coded values override whatever was parsed from the command line above
    filename, epsilon, discount, alpha, training, user_policy = "example_1.mdp", .5, .9, .3, 200, "OptimalPolicy"

    transition_p, rewards, nStates, nActions = ReadMDP(filename)
    states = range(nStates)
    actions = range(nActions)
    initial_state = 0
    transition_function = Transition(transition_p)
    MyMDP = MDP(states, actions, transition_function, rewards, initial_state)
    policy = ValueIterationPolicy(MyMDP, user_policy , epsilon, discount)
    policy.display_policy()
    print ""
    policy.display_value_f()
    #print policy.bellman_backup(initial_state, 10)
    MyMDP = MDP(states, actions, transition_function, rewards, initial_state)
    policy = RandomPolicy(MyMDP, "RandomPolicy")
    evaluate_policies(policy, MyMDP)
    #run_simulation(MyMDP, policy)

    MyMDP = MDP(states, actions, transition_function, rewards, initial_state)
    policy = GreedyPolicy(MyMDP, .3)
    #evaluate_policies(policy, MyMDP)

    #run_simulation(MyMDP, policy)

    MyMDP = MDP(states, actions, transition_function, rewards, initial_state)
    policy = ImpatientPolicy(MyMDP)
#    evaluate_policies(policy, MyMDP)

    #run_simulation(MyMDP, policy)

    MyMDP = MDP(states, actions, transition_function, rewards, initial_state)
    policy = NoHandicapPolicy(MyMDP, .3)
   # evaluate_policies(policy, MyMDP)

    #run_simulation(MyMDP, policy)

    MyMDP = MDP(states, actions, transition_function, rewards, initial_state)
    policy = QLearningPolicy(MyMDP, alpha, discount)
    run_training(MyMDP, policy, training)
    MyMDP.reset()
    evaluate_policies(policy, MyMDP)
Example #31
def run_simulation(MDP, policy, epsilon):
    print "Starting simulation for", MDP

    while MDP.get_time() < epsilon:
        action = policy.choose_action(MDP.get_time())
        print "[TIME", MDP.get_time() ,"]:", policy.get_name(), "chose action", action
        MDP.take_action(action)
        print "[TIME", MDP.get_time() ,"]: Moved to state", MDP.get_state(), "Current reward %.3f." % MDP.get_reward()
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.Actions = self.MDP.A
        self.state_values = self.initVs()
        self.policy = {}
        self.gamma = discountRate

    def initVs(self):
        state_values = {}
        for state in self.states:
            state_values[state] = 0.0
        return state_values

    def one_step_ahead(self, state):
        """
        Calculates the value of every available action in the given state.
        Args: the state to be considered
        Returns: a dictionary mapping each action to its expected value
        """
        action_values = {}
        for action in self.Actions:
            transition_prob = self.MDP.probNextStates(state, action)
            total = 0.0
            for next_state, probability in transition_prob.items():
                reward = self.MDP.getRewards(state, action, next_state)
                total += probability * (
                    reward + self.gamma * self.state_values[next_state])
            action_values[action] = total
        return action_values

    def BellmanUpdate(self):
        for state in self.states:
            Action_values = self.one_step_ahead(state)
            max_value = max(Action_values.values())
            self.state_values[state] = max_value
            actions = []
            for action in self.Actions:
                if Action_values[action] == max_value:
                    actions.append(action)
            self.policy[state] = actions
        return (self.state_values, self.policy)
class BellmanDPSolver(object):
	def __init__(self,discount_rate):
		self.mpd = MDP()
		self.actions = self.mpd.A
		self.gamma = discount_rate
		self.policy = {}
		self.current_position = -1
	
	def initVs(self):
		self.values = {(1, 3): 0, (3, 0): 0, (2, 1): 0, (0, 3): 0, (4, 0): 0, (1, 2): 0, (3, 3): 0, (4, 4): 0, (2, 2): 0, (4, 1): 0,
		               (1, 1): 0, 'OUT': 0, (3, 2): 0, (0, 0): 0, (0, 4): 0, (1, 4): 0, (2, 3): 0, (4, 2): 0, (1, 0): 0, (0, 1): 0,
		               'GOAL': 0, (3, 1): 0, (2, 4): 0, (2, 0): 0, (4, 3): 0, (3, 4): 0, (0, 2): 0}
		self.policy = {(1, 3): [], (3, 0): [], (2, 1): [], (0, 3): [], (4, 0): [], (1, 2): [], (3, 3): [], (4, 4): [], (2, 2): [],
		                (4, 1): [], (1, 1): [], 'OUT': [],  (3, 2): [], (0, 0): [], (0, 4): [], (1, 4): [], (2, 3): [], (4, 2): [],
		                (1, 0): [], (0, 1): [], 'GOAL': [], (3, 1): [], (2, 4): [], (2, 0): [], (4, 3): [], (3, 4): [], (0, 2): []}

	
	def BellmanUpdate(self):
		for init_state, value_f in self.values.items():
			max=None
			for action in self.actions:
				temp = 0
				# Transition Table p(s',r | s,a)
				next_states = self.mpd.probNextStates(init_state,action)
				for new_state,prob in next_states.items():
					temp += prob*(self.mpd.getRewards(init_state, action, new_state)+self.gamma*self.values[new_state])
				if max is None or temp >= max:
					max = temp
			self.values[init_state]= max
			
			# Greedily compute new policy
			policy_list = []
			max = self.values[init_state]
			for action in self.actions:
				temp = 0
				next_states = self.mpd.probNextStates(init_state,action)
				for new_state,prob in next_states.items():
					temp += prob*(self.mpd.getRewards(init_state, action, new_state)+self.gamma*self.values[new_state])
				if temp == max:
					policy_list.append(action)
			self.policy[init_state]= policy_list
			
		return self.values,self.policy
Example #34
	def fullMDP( self ):
		rewards = np.zeros( self.nstates * self.nactions )
		for i in range( self.nrewardfactors ):
			rewards = rewards + self.rewardstruct[i].mapping @ self.rewardstruct[i].params

		kernel = np.ones( ( self.nstates * self.nactions, self.nstates ) )
		for i in range( self.nstatefactors ):
			kernel = kernel * ( self.transitionstruct[i].mapping @ self.transitionstruct[i].params @ self.statemappings[i] )

		return MDP( self.nstates, self.nactions, rewards / self.nrewardfactors, kernel )
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.action = self.MDP.A
        self.discountRate = discountRate
        self.Values = {}
        self.Values['GOAL'] = 0
        self.Values['OUT'] = 0
        self.Policy = {}
        self.initVs()

    def initVs(self):
        for x in range(5):
            for y in range(5):
                self.Values[(x, y)] = 0
                self.Policy[(x, y)] = [
                    "DRIBBLE_UP", "DRIBBLE_DOWN", "DRIBBLE_LEFT",
                    "DRIBBLE_RIGHT", "SHOOT"
                ]

    def BellmanUpdate(self):
        values = {}
        for i in range(5):
            for j in range(5):
                values[(i, j)] = -np.inf
                for action in self.action:
                    tmp_values = 0.0
                    nextstateprob = self.MDP.probNextStates((i, j), action)
                    for nextstate in nextstateprob.keys():
                        reward = self.MDP.getRewards((i, j), action, nextstate)
                        prob = nextstateprob[nextstate]
                        tmp_values = tmp_values + (
                            prob *
                            (reward +
                             self.discountRate * self.Values[nextstate]))
                    if values[(i, j)] < tmp_values:
                        values[(i, j)] = tmp_values
                        self.Policy[(i, j)] = [action]
                    elif values[(i, j)] == tmp_values:
                        self.Policy[(i, j)].append(action)
                self.Values[(i, j)] = values[(i, j)]
        return self.Values, self.Policy
Example #36
class BellmanDPSolver(object):
    def __init__(self, discount=0.9, theta=1e-4):
        self.MDP = MDP()
        self.discount = discount
        self.theta = theta
        self.initval, self.policy = self.initVs()

    def initVs(self):
        initval = {}
        policy = {}
        L1 = self.MDP.S
        for i in L1:
            initval[i] = 0
            # all the action
            policy[i] = self.MDP.A

        return initval, policy

    def BellmanUpdate(self):
        for states in self.MDP.S:
            nextV = {}
            for action in self.MDP.A:
                nextStateProb = self.MDP.probNextStates(states, action)

                value = 0
                for nextsta in nextStateProb:
                    immr = self.MDP.getRewards(states, action, nextsta)

                    value += nextStateProb[nextsta] * (
                        immr + self.discount * self.initval[nextsta])

                nextV[action] = value

            self.initval[states] = max(nextV.values())
            # select the corresponding optimal action and fill in the policy dic
            self.policy[states] = [
                key for key, value in nextV.items()
                if value == max(nextV.values())
            ]

        return self.initval, self.policy
Example #37
def start_grid_mdp():
    """
    starts the program, restarts if the user wants to
    """
    grid = load_grid(get_file_path())
    world = GridWorld(grid)
    move_costs = get_move_cost()
    gamma = get_gamma()
    eval_steps = get_evaluation_steps()
    MDP(world, eval_steps, gamma, move_costs)
    if start_again():
        start_grid_mdp()
Example #38
    def test_add_action(self):
        """
        Test that you can add an action. Named actions make more sense than named states
        """
        mdp = MDP()

        mdp.add_action(0)
        self.assertEqual(mdp.num_actions(), 1)
        self.assertEqual(type(mdp.get_action(0)), Action)
        self.assertIn(mdp.get_action(0), mdp.get_action_list())
Example #39
    def initialise_mdp(self, state):
        try:
            blocks = []
            for prop in state.initial_state.block_properties:
                blocks.append(Block(prop.label, prop.shape, prop.colour, prop.size))
            start_config = state.initial_state.configuration.config
            startingState = State(0, start_config)
            self.initialise_lists()
            self.success_config[-1].append(startingState)
            label = len(self.mdp_list[-1])
            print ""
            print label
            print ""
            mdp = MDP(label, blocks)
            mdp.statelist.append(startingState)
            mdp.initMDP(startingState)
            self.mdp_list[-1].append(mdp)
            print "MDP initialised"
            return True
        except:
            return False
Example #40
def GLIE(process: MDP,
         env: Environment,
         n_iter: int = 5000,
         eps: float = 0.01):

    Q_value = np.zeros((process.nb_states, process.nb_actions))
    count_state_action = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):
        epsilon = 1 / i
        policy = process.get_Q_policy(Q_value, epsilon)
        env.policy = policy
        states, actions, returns = env.generate_episode()
        G = 0
        for j in range(len(returns) - 1, -1, -1):  # walk the episode backwards, including step 0
            G = process.disc_fact * G + returns[j]
            current_state = states[j]
            current_action = actions[j]
            count_state_action[current_state.index, current_action.index] += 1
            Q_value[current_state.index, current_action.index] += \
                (G - Q_value[current_state.index, current_action.index]) / \
                count_state_action[current_state.index, current_action.index]
    return process.get_Q_policy(Q_value)
Example #41
    def test_mdp_size(self):
        """
        Probably nice to be able to tell the size of the MDP. If not only for tests
        """
        mdp = MDP()
        self.assertEqual(mdp.num_states(), 0)

        mdp = MDP(5)
        self.assertEqual(mdp.num_states(), 5)

        mdp = MDP()
        mdp.add_state(0)
        mdp.add_state(1)
        self.assertEqual(mdp.num_states(), 2)
Example #42
def Q_learning(process: MDP,
               env: Environment,
               lambda_: float = 0.7,  # note: unused in this Q-learning variant
               alpha: float = 0.01,
               n_iter: int = 5000,
               max_ep_len: int = 200):

    Q_value = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):

        epsilon = 1 / i
        current_state = env.get_random_state()
        counter = 1
        ep_finished = False
        while not ep_finished:

            # ---- Update Policy ----
            policy = process.get_Q_policy(Q_value, epsilon)
            env.policy = policy

            # ---- MDP stepping ----
            current_action = env.generate_action(current_state)
            reward = env.generate_return(current_state, current_action)
            next_state = env.step(current_state, current_action)

            # ---- Updating Q_value function ----

            Q_value[current_state.index, current_action.index] += alpha * (reward + \
                   process.disc_fact * Q_value[next_state.index,:].max() - \
                   Q_value[current_state.index, current_action.index])
            current_state = next_state

            # ---- Stop Condition ----
            counter += 1
            if next_state.terminal:
                ep_finished = True
            if counter > max_ep_len:
                ep_finished = True

    return (process.get_Q_policy(Q_value))
Example #43
    def test_add_transition(self):
        """
        S, A, S', P
        Takes in a state, one of the actions, the state that we'll end up in after taking an action, and the probability
        that this transition occurs.
        """

        mdp = MDP(5)
        mdp.add_action(0)

        s = mdp.get_state(0)
        a = mdp.get_action(0)
        s_prime = mdp.get_state(1)

        mdp.add_transition(s, a, s_prime, 1.0)

        transition_function = mdp.get_transition_function()
        self.assertEqual(s.id, a.id)
class BellmanDPSolver(object):
	def __init__(self, discountRate):
		self.MDP = MDP()
		self.dr = discountRate
		self.S = [(x,y) for x in range(5) for y in range(5)]
		self.S.append("GOAL")
		self.S.append("OUT")
		self.A = ["DRIBBLE_UP","DRIBBLE_DOWN","DRIBBLE_LEFT","DRIBBLE_RIGHT","SHOOT"]
		self.oppositions = [(2,2), (4,2)]
		self.goalProbs = [[0.00,0.00,0.0,0.00,0.00],[0.0, 0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0],[0.0,0.3,0.5,0.3,0.0],[0.0,0.8,0.9,0.8,0.0]]
		self.vs ={}
		self.act = {}


	def initVs(self):
		for s in self.S:
			self.vs[s] = 0.0

	def BellmanUpdate(self):
		for s in self.S:
			value_max= []
			action_max = []
			for a in self.A:
				nextState = self.MDP.probNextStates(s,a)
				value_a = 0.0
				for s_next,prob in nextState.items():
					reward = self.MDP.getRewards(s,a,s_next)
					value_next = self.vs[s_next]
					value_a += prob*(reward + self.dr*value_next)
				value_max.append(value_a)
			self.vs[s]= np.max(value_max)
			for i in range(len(self.A)):
				if value_max[i] == np.max(value_max):
					action_max.append(self.A[i])			
			self.act[s] = action_max
		return self.vs,self.act
Example #45
def valueIterationActions(MDP, gamma, delta):
    """
    The value iteration algorithm calculates the utility of each state in the MDP. This
    utility is then used to determine which action a robot should take, given its current
    state and possible actions from that state. Once the change in utility is less than
    delta, the while loop terminates
    """

    # this will hold all utility information
    U = util.Counter()
    allActions = MDP.get_actions()
    # the loop terminates once this flag becomes False
    keep_iterating = True
    while keep_iterating:
        maxDeltU = 0
        # loop through each state on each iteration
        for mdpState in MDP.states:
            # will store the maximum utility out of all possible actions
            maxVal = 0

            for action in allActions[mdpState]:
                # will sum the utilities over all possible states from that action
                total = 0
                for (nextState, prob) in MDP.transModel[(mdpState, action)].items():
                    total += prob * (MDP.calc_rewards(mdpState, nextState) + gamma * U[nextState])
                maxVal = max(maxVal, total)
            Uprev = U[mdpState]

            # update the utility for this state
            U[mdpState] = maxVal

            # if the utility value changes by less than delta, then stop iterating
            deltU = abs(Uprev - U[mdpState])
            maxDeltU = max(maxDeltU, deltU)
        if maxDeltU < delta:
            keep_iterating = False
    return U
Example #46
    def test_get_state(self):
        """
        Should probably be some nice way to get states by a state id or state name
        """

        # test that you can get a state by numerical id
        mdp = MDP()
        mdp.add_state(0)
        self.assertEqual(type(mdp.get_state(0)), State)
        self.assertIn(mdp.get_state(0), mdp.get_state_list())
def get_lily_pads_mdp(n: int) -> MDP:
    data = {
        i: {
            'A': ({
                i - 1: i / n,
                i + 1: 1. - i / n
            }, 1 / n if i == n - 1 else 0.),
            'B': ({j: 1 / n
                   for j in range(n + 1) if j != i}, 1 / n)
        }
        for i in range(1, n)
    }
    data[0] = {'A': ({0: 1.}, 0.), 'B': ({0: 1.}, 0.)}
    data[n] = {'A': ({n: 1.}, 0.), 'B': ({n: 1.}, 0.)}

    gamma = 1.0
    return MDP(data, gamma)
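Here the data mapping passed to MDP is state -> action -> (next-state distribution, reward term). For concreteness, this is what the comprehension above produces for n = 3, worked out by hand (the MDP class that wraps it is not shown on this page):

# get_lily_pads_mdp(3) builds this dictionary before wrapping it in MDP:
# action 'A' moves to a neighbouring pad with position-dependent probability,
# action 'B' jumps uniformly to any other pad; pads 0 and n are absorbing.
data_for_n_3 = {
    1: {'A': ({0: 1/3, 2: 2/3}, 0.0),
        'B': ({0: 1/3, 2: 1/3, 3: 1/3}, 1/3)},
    2: {'A': ({1: 2/3, 3: 1/3}, 1/3),
        'B': ({0: 1/3, 1: 1/3, 3: 1/3}, 1/3)},
    0: {'A': ({0: 1.0}, 0.0), 'B': ({0: 1.0}, 0.0)},
    3: {'A': ({3: 1.0}, 0.0), 'B': ({3: 1.0}, 0.0)},
}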
Example #48
    def generateMDPs(self):
        self.mdps = []
        results = []

        p = Pool(processes=(self.num_agents + 2))

        for i in xrange(0, self.num_agents):
            print "Generating MDP for Agent" + str(i)
            a = MDP(i, self.config)
            self.mdps.append(a)

        res = p.amap(self._instance_method_alias_call, self.mdps)
        self.mdps = res.get()

        sum = 0
        for m in self.mdps:
            sum += m.numberVariables
        print "Total Number of Variables: ", sum
Example #49
    def test_3states_1action_init2(self):
        self.assertMDP(
            MDP(transition=[{"a1": {1: 1}},
                            {"a1": {2: 1}},
                            {"a1": {0: .2, 1: .8}}],
                rewards=[{"a1": {}},
                         {"a1": {}},
                         {"a1": {0: 10}}],
                gamma=0.9,
                default_reward=0),
            {0: 2.891, 1: 4.018, 2: 4.804},
            {0: 'a1', 1: 'a1', 2: 'a1'},
            init={0: -7, 2: -9},  # test with dict init (+ bad init values)
            max_rounds=10,
            places=3)
Example #50
def runValueIteration(world, robot1, robot2):
    """
    This function creates robot states, actions, a transition model, and
    a gamma. Then, using these models, it creates 2 robot classes. Then,
    it develops an MDP for the two robots. Finally, it runs a value iteration
    algorithm on the MDP to develop a utility function for each MDP state.
    :param world: grid world; also supplies the gamma and delta used by value iteration
    :param robot1:
    :param robot2:
    :return: the utility function U over MDP states
    """
    # MDP object
    mdp = MDP(world, robot1, robot2)

    # utility function for the MDP
    U = valueIteration(mdp, world.gamma, world.delta)

    return U
Example #51
def main():
    if len(sys.argv) < 4:
        print >> sys.stderr, "Usage: Simulator.py\t<MDP.txt>\t<RandomPolicy|OptimalPolicy>\t<Horizon>"
        sys.exit(-1)

    filename = sys.argv[1]

    transition_p, rewards, nStates, nActions = ReadMDP(filename)
    horizon = int(sys.argv[3])
    states = range(nStates)
    actions = range(nActions)
    initial_state = 0
    user_policy = sys.argv[2]
    transition_function = Transition(transition_p)
    MyMDP = MDP(states, actions, transition_function, rewards, initial_state,
                horizon)
    policy = ValueIterationPolicy(MyMDP, user_policy, horizon)
    policy.display_policy()
    print ""
    policy.display_value_f()
Example #52
def run_training(MDP, policy, horizon):
    t = 0
    trajectory = []
    while t < horizon:
        if MDP.get_parked():
            # we need to make it do one more update.
            action = policy.choose_training_action()
            state = MDP.get_state()
            trajectory.append((state, action, MDP.get_state()))
            #reset our simulator
            MDP.reset()
            policy.q_updates(trajectory)
            trajectory = []
        else:
            #record trajectory
            action = policy.choose_training_action()
            state = MDP.get_state()
            policy.take_action(action)
            trajectory.append((state, action, MDP.get_state()))
        t += 1
Example #53
    def test_add_string_action(self):
        mdp = MDP()
        mdp.add_action("jump")
        self.assertEqual(mdp.num_actions(), 1)
        self.assertEqual(type(mdp.get_action("jump")), Action)
        self.assertIn(mdp.get_action("jump"), mdp.get_action_list())
Example #54
def run_simulation(MDP, policy):
    #print "Starting simulation for given MDP"
    while not MDP.get_parked():
        action = policy.choose_action(MDP.get_time())
        #print "[TIME", MDP.get_time() ,"]:", policy.get_name(), "chose action", action
        policy.take_action(action)
Example #55
    def test_get_num_actions(self):
        """
        It will be helpful to be able to return the number of distinct actions that an MDP
        has.
        """
        mdp = MDP()
        mdp.add_action(0)
        mdp.add_action(1)
        mdp.add_action(2)
        mdp.add_action(3)
        mdp.add_action(4)
        mdp.add_action(5)
        mdp.add_action(6)

        action_list = mdp.get_action_list()
        self.assertEqual(len(action_list), 7)
        self.assertIn(mdp.get_action(0), action_list)
Example #56
                    value_fcn = pickle.load(handle)
                '''

                #set up grid world mdp
                '''
                grid_mdp = GridWorldMDP(map_struct['seed_map'], map_struct['goal'])
                '''
                grid_mdp = GridWorldMDP(map_struct['seed_map'], map_struct['goal'], 
                    map_struct['start'], map_struct['bridge_probabilities'], 
                    map_struct['bridge_locations'])

                init_value = {}
                for s in grid_mdp.states:
                    init_value[s.tostring()] = np.linalg.norm(s - grid_mdp.goal_state)

                mdp = MDP(grid_mdp.states, grid_mdp.valid_actions_function, grid_mdp.cost_function)
                #value_fcn = mdp.value_iteration(value = value_fcn, plot=True, world_size = 50)
                value_fcn = mdp.value_iteration(value = init_value, plot=True, world_size = 50)

                #set up dubins astar
                dub = dubins_astar(world_points, value_fcn)
                astar = AStar(motion_primitives, dub.cost_function, dub.heuristic,
                    dub.valid_edge, dub.state_equality, plot = False)

                astar_state = np.array([state['x'],state['y'],state['theta']])
            else:
                '''
                following_dist = 0.0
                temp_idx = dub.last_idx
                while following_dist < dub.look_ahead_dist
                    temp_idx -= 1
Example #57
    def test_get_non_existent_state(self):
        mdp = MDP(5)

        with self.assertRaises(IndexError):
            mdp.get_state(11)
Example #58
    def test_add_duplicate_state(self):
        mdp = MDP()

        mdp.add_state(0)
        with self.assertRaises(KeyError):
            mdp.add_state(0)
Example #59
    def test_add_duplicate_action(self):
        mdp = MDP()
        mdp.add_action("jump")
        with self.assertRaises(KeyError):  # a RuntimeError is probably more appropriate?
            mdp.add_action("jump")