def test_create_new_mdp_init_num_states_populates_state_list(self):
    mdp = MDP(5)
    self.assertIn(mdp.get_state(0), mdp.get_state_list())
    self.assertIn(mdp.get_state(2), mdp.get_state_list())
    self.assertIn(mdp.get_state(4), mdp.get_state_list())
from copy import copy


class BellmanDPSolver(object):
    def __init__(self, discountRate=0.99):
        self.MDP = MDP()
        self._gamma = discountRate
        self.initVs()

    def initVs(self):
        self.v = dict()
        self.pi = dict()
        for s in self.MDP.S:
            self.v[s] = 0
            self.pi[s] = copy(self.MDP.A)

    def BellmanUpdate(self):
        for s in self.MDP.S:
            max_val = float("-inf")
            opt_act = []
            for a in self.MDP.A:
                probs = self.MDP.probNextStates(s, a)
                val = 0
                # probs maps successor states to their transition probabilities
                for s_prime, p in probs.items():
                    r = self.MDP.getRewards(s, a, s_prime)
                    val += p * (r + self._gamma * self.v[s_prime])
                if val > max_val:
                    max_val = val
                    opt_act = [a]
                elif val == max_val:
                    opt_act.append(a)
            self.v[s] = max_val
            self.pi[s] = opt_act
        return self.v, self.pi
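All of the BellmanDPSolver variants in this collection assume an MDP object exposing S, A, probNextStates(s, a) and getRewards(s, a, s_next). The sketch below is purely illustrative: ToyMDP is an invented two-state stand-in for that interface (not the hand-crafted grid the snippets actually use), and the driver loop shows how such a solver's update would typically be iterated.

class ToyMDP(object):
    """Hypothetical stand-in for the MDP interface assumed by the solvers above."""
    S = ["s0", "s1"]
    A = ["stay", "move"]

    def probNextStates(self, state, action):
        # deterministic toy dynamics: "move" flips the state, "stay" keeps it
        if action == "stay":
            return {state: 1.0}
        return {"s1" if state == "s0" else "s0": 1.0}

    def getRewards(self, state, action, next_state):
        # reward for ending up in s1
        return 1.0 if next_state == "s1" else 0.0


def value_iteration(mdp, gamma=0.9, sweeps=100):
    """Repeated Bellman optimality backups over a toy MDP (illustration only)."""
    V = {s: 0.0 for s in mdp.S}
    for _ in range(sweeps):
        for s in mdp.S:
            V[s] = max(
                sum(p * (mdp.getRewards(s, a, s2) + gamma * V[s2])
                    for s2, p in mdp.probNextStates(s, a).items())
                for a in mdp.A)
    return V


if __name__ == "__main__":
    print(value_iteration(ToyMDP()))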
def getRandomPolicyValue():
    values = [0.0 for _ in range(10)]
    num = 1000000
    echoEpoch = 10000
    mdp = MDP()
    for k in range(1, num + 1):
        for initState in range(1, 6):
            state = initState
            isTerminal = False
            gamma = 1.0
            value = 0.0
            while not isTerminal:
                action = mdp.randomAction()
                isTerminal, state, reward = mdp.transform(state, action)
                value += gamma * reward
                gamma *= mdp.gamma
            values[initState] += value
        if k % echoEpoch == 0:
            print('k = %d, Average values of state 1-5 are:\n' % k,
                  [value / k for value in values[1:6]])
    for i in range(len(values)):
        values[i] /= num
    return values
class BellmanDPSolver(object):
    def __init__(self, discountRate=1):
        self.MDP = MDP()
        self.gamma = discountRate
        self.initVs()

    def initVs(self):
        self.state_values = {pair: 0 for pair in self.MDP.S}

    def BellmanUpdate(self):
        prev_version = self.state_values.copy()
        for state in self.MDP.S:
            total_val = dict()
            for action in self.MDP.A:
                sub_total = 0
                for next_state, prob in self.MDP.probNextStates(state, action).items():
                    sub_total += prob * (self.MDP.getRewards(state, action, next_state)
                                         + self.gamma * prev_version.get(next_state))
                total_val[action] = sub_total
            self.state_values[state] = max(total_val.values())
        return self.state_values, self.compute_greedy_policy()

    def compute_greedy_policy(self):
        policy = dict()
        for state in self.MDP.S:
            q_sa = dict()
            for action in self.MDP.A:
                q_sa[action] = sum(
                    prob * (self.MDP.getRewards(state, action, next_state)
                            + self.gamma * self.state_values[next_state])
                    for next_state, prob in self.MDP.probNextStates(state, action).items())
            policy[state] = [action for action in self.MDP.A
                             if q_sa[action] == max(q_sa.values())]
        return policy
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.discountRate = discountRate
        self.initVs()

    def initVs(self):
        self.values = {}
        self.policy = {}
        for state in self.MDP.S:
            self.values[state] = 0

    def BellmanUpdate(self):
        for state in self.MDP.S:
            self.policy[state] = []
            values_all = []
            for action in self.MDP.A:
                s_r_sum = 0
                prob_next_states = self.MDP.probNextStates(state, action)
                for state_2 in prob_next_states.keys():
                    s_r_sum = s_r_sum + prob_next_states[state_2] * (
                        self.MDP.getRewards(state, action, state_2)
                        + self.discountRate * self.values[state_2])
                values_all.append(s_r_sum)
            self.values[state] = max(values_all)
            for i in range(len(values_all)):
                if values_all[i] == self.values[state]:
                    self.policy[state].append(self.MDP.A[i])
        return (self.values, self.policy)
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.gamma = discountRate
        self.initVs()

    def initVs(self):
        self.values = {s: 0 for s in self.MDP.S}
        self.policy = {s: self.MDP.A for s in self.MDP.S}

    def BellmanUpdate(self):
        for s in self.MDP.S:
            best_v = -10**20
            best_a = []
            n_value = {a: 0 for a in self.MDP.A}
            for a in self.MDP.A:
                probs = self.MDP.probNextStates(s, a)
                for s_, p in probs.items():
                    n_value[a] += p * (self.MDP.getRewards(s, a, s_)
                                       + self.gamma * self.values[s_])
                if n_value[a] > best_v:
                    best_v = n_value[a]
            self.values[s] = best_v
            for a in self.MDP.A:
                if n_value[a] == best_v:
                    best_a += [a]
            self.policy[s] = best_a
        return self.values, self.policy
class BellmanDPSolver(object):
    def __init__(self):
        self.MDP = MDP()
        self.initVs()

    def initVs(self):
        self.stateValueTable = {state: 0 for state in self.MDP.S}
        self.statePolicyTable = {state: self.MDP.A for state in self.MDP.S}

    def BellmanUpdate(self, discount_rate):
        for state in self.MDP.S:
            action_dict = {
                action: sum([
                    prob * (self.MDP.getRewards(state, action, nextState)
                            + discount_rate * self.stateValueTable[nextState])
                    for nextState, prob in self.MDP.probNextStates(state, action).items()
                ])
                for action in self.MDP.A
            }
            self.stateValueTable[state] = max(action_dict.values())
            self.statePolicyTable[state] = [
                action for action, value in action_dict.items()
                if value == self.stateValueTable[state]
            ]
        return self.stateValueTable, self.statePolicyTable
def test_get_non_existing_state(self):
    """ Test that you can't get a non-existing state """
    mdp = MDP(5)
    with self.assertRaises(IndexError):
        mdp.get_state(5)
def test_create_state_populates_state_list(self):
    mdp = MDP()
    mdp.add_state(0)
    mdp.add_state(2)
    self.assertIn(mdp.get_state(0), mdp.get_state_list())
    self.assertIn(mdp.get_state(2), mdp.get_state_list())
def test_create_new_mdp_no_initial_states(self):
    """ I'm not sure what the create MDP method should actually do. """
    # there isn't very much we can tell about an mdp that is completely devoid
    # of states
    mdp = MDP()
    self.assertEqual(mdp.num_states(), 0)
def test_get_action_list(self):
    mdp = MDP()
    mdp.add_action(0)
    mdp.add_action(1)
    action_list = mdp.get_action_list()
    self.assertEqual(len(action_list), 2)
    self.assertIn(mdp.get_action(0), action_list)
def initialise_mdp(self, blocks):
    start_config = [-1, -1, -1]
    startingState = State(0, blocks, start_config)
    self.initialise_lists()
    self.success_config[-1].append(startingState)
    label = len(self.mdp_list[-1])
    mdp = MDP(label, blocks)
    mdp.statelist.append(startingState)
    mdp.initMDP(startingState)
    self.mdp_list[-1].append(mdp)
def __init__(self, grid, goalVals, discount=.99, tau=.01, epsilon=.001):
    MDP.__init__(self, discount=discount, tau=tau, epsilon=epsilon)
    self.goalVals = goalVals
    self.grid = grid
    self.setGridWorld()
    self.valueIteration()
    self.extractPolicy()
def test_get_state_list(self):
    """ It might be helpful to be able to get a list of all the states """
    mdp = MDP(5)
    state_list = mdp.get_state_list()
    self.assertEqual(len(state_list), 5)
    self.assertIn(mdp.get_state(0), state_list)
def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
    MDP.__init__(self, init, actlist=orientations,
                 terminals=terminals, gamma=gamma)
    grid.reverse()  # because we want row 0 on bottom, not on top
    self.grid = grid
    self.rows = len(grid)
    self.cols = len(grid[0])
    for x in range(self.cols):
        for y in range(self.rows):
            self.reward[x, y] = grid[y][x]  # each reward is from the grid
            if grid[y][x] is not None:
                self.states.add((x, y))  # each state is a tuple of indices
class BellmanDPSolver(object):
    def __init__(self, discountRate=0.9):
        self.MDP = MDP()
        self.discountRate = discountRate
        self.initVs()

    def initVs(self):
        self.Vs = dict()
        self.policy = dict()
        for state in self.MDP.S:
            self.Vs[state] = 0
            self.policy[state] = self.MDP.A

    def action_return(self, state, action):
        # For each next state:
        #   get the transition probability given the current state and action,
        #   get the reward for the (s, a, s') combination,
        #   and sum the discounted returns weighted by their probabilities.
        state_prob = self.MDP.probNextStates(state, action)
        expected_reward = 0
        for next_state in state_prob:
            prob = state_prob[next_state]
            reward = self.MDP.getRewards(state, action, next_state)
            expected_reward += prob * (reward + self.discountRate * self.Vs[next_state])
        return expected_reward

    def max_action_return(self, state):
        # Finds the actions with the highest expected return
        # and returns those actions together with that return.
        max_return = None
        best_actions = []
        for action in self.MDP.A:
            # get the expected return for this action
            a_return = self.action_return(state, action)
            if max_return is None or max_return < a_return:
                max_return = a_return
                best_actions = [action]
            elif max_return == a_return:
                best_actions.append(action)
        return best_actions, max_return

    def BellmanUpdate(self):
        for state in self.MDP.S:
            self.policy[state], self.Vs[state] = self.max_action_return(state)
        return self.Vs, self.policy
def test_create_new_mdp_initial_num_states(self):
    """ Test initializing MDPs with an explicit number of states """
    mdp = MDP(5)
    self.assertEqual(mdp.num_states(), 5)  # this MDP should have 5 states
    self.assertEqual(type(mdp.get_state(0)), State)
    self.assertEqual(type(mdp.get_state(2)), State)
    self.assertEqual(type(mdp.get_state(4)), State)
def run_simulation(MDP, policy): print "Starting simulation for given MDP" while MDP.get_parked() == False: action = policy.choose_action(MDP.get_time()) print "[TIME", MDP.get_time() ,"]:", policy.get_name(), "chose action", action MDP.take_action(action) print "[TIME", MDP.get_time() ,"]: Moved to state", MDP.get_state(), "Current reward %.3f." % MDP.get_reward() print "Exited in (spot, handicapped, available):", MDP.get_spot(), MDP.get_handicapped(), MDP.get_available()
def evaluate_policies(policy, MDP):
    total_reward, handicapped, crashed = 0, 0, 0
    num_sims = 10000
    for i in range(num_sims):
        run_simulation(MDP, policy)  # maybe do something fancier
        total_reward += MDP.get_reward()
        if MDP.get_handicapped():
            handicapped += 1
        if not MDP.get_available():
            crashed += 1
        MDP.reset()
    print policy.get_name(), total_reward / num_sims, handicapped, crashed
def SARSA_lambda(process: MDP, env: Environment, lambda_: float = 0.7,
                 alpha: float = 0.01, n_iter: int = 5000, max_ep_len: int = 200):
    Q_value = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):
        epsilon = 1 / i
        # ---- Init Eligibility Trace ----
        E_t = np.zeros((process.nb_states, process.nb_actions))
        current_state = env.get_random_state()
        counter = 1
        ep_finished = False
        while not ep_finished:
            # ---- Update Policy ----
            policy = process.get_Q_policy(Q_value, epsilon)
            env.policy = policy
            # ---- MDP stepping ----
            current_action = env.generate_action(current_state)
            reward = env.generate_return(current_state, current_action)
            next_state = env.step(current_state, current_action)
            next_action = env.generate_action(next_state)
            # ---- Updating Eligibility Trace ----
            E_t[current_state.index, current_action.index] += 1
            E_t *= process.disc_fact * lambda_
            # ---- Updating Q_value function ----
            error = (reward
                     + process.disc_fact * Q_value[next_state.index, next_action.index]
                     - Q_value[current_state.index, current_action.index])
            Q_value += alpha * error * E_t
            current_state = next_state
            current_action = next_action
            # ---- Stop Condition ----
            counter += 1
            if next_state.terminal:
                ep_finished = True
            if counter > max_ep_len:
                ep_finished = True
    return process.get_Q_policy(Q_value)
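get_Q_policy is referenced by the SARSA(lambda), Q-learning and GLIE snippets but is not shown in this collection. The function below is only an illustrative guess at its behaviour: a tabular epsilon-greedy extraction from a Q array, with the name and return shape assumed rather than taken from the original code.

import numpy as np


def epsilon_greedy_policy(Q_value, epsilon):
    """Hypothetical sketch of what get_Q_policy might compute: for every state,
    a probability vector over actions that is epsilon-greedy w.r.t. Q_value."""
    n_states, n_actions = Q_value.shape
    # start with uniform exploration mass epsilon spread over all actions
    policy = np.full((n_states, n_actions), epsilon / n_actions)
    # put the remaining 1 - epsilon on the greedy action of each state
    policy[np.arange(n_states), Q_value.argmax(axis=1)] += 1.0 - epsilon
    return policy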
def init():
    """
    Asks for a Gridworld file, initializes an MDP environment and a
    Q-learning object with it, then calls the menu.
    """
    print_headline("Gridworld Selection")
    gridworld = read_gridworld_file()
    environment = MDP(state_list=gridworld,
                      field_rewards=Default.FIELD_REWARDS,
                      obstacle_fields=Default.OBSTACLE_FIELDS,
                      actions=Default.ACTIONS,
                      transition_probabilities=Default.TRANSITION_PROBABILITIES)
    q_learning = QLearning(env_perform_action=environment.perform_action,
                           state_list=gridworld,
                           goal_fields=Default.GOAL_FIELDS,
                           obstacle_fields=Default.OBSTACLE_FIELDS,
                           actions=Default.ACTIONS,
                           discount_factor=Default.DISCOUNT_FACTOR,
                           learning_rate=Default.LEARNING_RATE,
                           epsilon=Default.EPSILON,
                           convergence_threshold=Default.CONVERGENCE_THRESHOLD)
    print("Your input Gridworld:")
    print_gridworld(gridworld)
    while show_menu(q_learning):
        pass
    print_headline("See you later")
def test():
    mdp = MDP(0.5)
    vFunc = MonteCarlo(mdp, *mdp.randomWalkSamples(100))
    print('Monte Carlo:')
    for i in range(1, 6):
        print('%d: %f\t' % (i, vFunc[i]), end='')
    print()
    vFunc = temporalDifference(mdp, 0.15, *mdp.randomWalkSamples(100))
    print('Temporal Difference:')
    for i in range(1, 6):
        print('%d: %f\t' % (i, vFunc[i]), end='')
    print()
def test_create_state(self):
    """
    For implementation simplification I'm imagining creating all of the
    states separately and then connecting them afterwards by specifying
    the actions.
    """
    # States should be able to be identified by numbers or by strings, I suppose.
    # I don't imagine that strings will ever be used.
    mdp = MDP()
    mdp.add_state(0)
    mdp.add_state(1)
    mdp.add_state(2)
    mdp.add_state(3)
    mdp.add_state(4)
    mdp.add_state(5, terminal=True)
    self.assertEqual(mdp.num_states(), 6)
def run_simulation(MDP, policy, horizon):
    print "Starting simulation for", MDP
    while MDP.get_time() > 0:
        action = policy.choose_action(MDP.get_time())
        print "[TIME", MDP.get_time(), "]:", policy.get_name(), "chose action", action
        MDP.take_action(action)
        print "[TIME", MDP.get_time(), "]: Moved to state", MDP.get_state(), "Current reward %.3f." % MDP.get_reward()
def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
    MDP.__init__(self, init, actlist=orientations,
                 terminals=terminals, gamma=gamma)
    self.grid = grid
    self.rows = len(grid)
    self.cols = len(grid[0])
    # print(self.rows, self.cols)
    for x in range(self.cols):
        for y in range(self.rows):
            self.reward[y, x] = grid[y][x]
            if self.state_check((y, x)):
                self.states.add((y, x))
def main(): if len(sys.argv) < 5: print >> sys.stderr, "Usage: Simulator.py\t<MDP.txt>\t<RandomPolicy|OptimalPolicy>\t<Epsilon>\t<discount>\t<alpha>\t<training>" #sys.exit(-1) else: filename = sys.argv[1] training = float(sys.argv[6]) alpha = float(sys.argv[5]) discount = float(sys.argv[4]) epsilon = float(sys.argv[3]) user_policy = sys.argv[2] filename, epsilon, discount, alpha, training, user_policy = "example_1.mdp", .5, .9, .3, 200, "OptimalPolicy" transition_p, rewards, nStates, nActions = ReadMDP(filename) states = range(nStates) actions = range(nActions) initial_state = 0 transition_function = Transition(transition_p) MyMDP = MDP(states, actions, transition_function, rewards, initial_state) policy = ValueIterationPolicy(MyMDP, user_policy , epsilon, discount) policy.display_policy() print "" policy.display_value_f() #print policy.bellman_backup(initial_state, 10) MyMDP = MDP(states, actions, transition_function, rewards, initial_state) policy = RandomPolicy(MyMDP, "RandomPolicy") evaluate_policies(policy, MyMDP) #run_simulation(MyMDP, policy) MyMDP = MDP(states, actions, transition_function, rewards, initial_state) policy = GreedyPolicy(MyMDP, .3) #evaluate_policies(policy, MyMDP) #run_simulation(MyMDP, policy) MyMDP = MDP(states, actions, transition_function, rewards, initial_state) policy = ImpatientPolicy(MyMDP) # evaluate_policies(policy, MyMDP) #run_simulation(MyMDP, policy) MyMDP = MDP(states, actions, transition_function, rewards, initial_state) policy = NoHandicapPolicy(MyMDP, .3) # evaluate_policies(policy, MyMDP) #run_simulation(MyMDP, policy) MyMDP = MDP(states, actions, transition_function, rewards, initial_state) policy = QLearningPolicy(MyMDP, alpha, discount) run_training(MyMDP, policy, training) MyMDP.reset() evaluate_policies(policy, MyMDP)
def run_simulation(MDP, policy, epsilon):
    print "Starting simulation for", MDP
    while MDP.get_time() < epsilon:
        action = policy.choose_action(MDP.get_time())
        print "[TIME", MDP.get_time(), "]:", policy.get_name(), "chose action", action
        MDP.take_action(action)
        print "[TIME", MDP.get_time(), "]: Moved to state", MDP.get_state(), "Current reward %.3f." % MDP.get_reward()
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.Actions = self.MDP.A
        self.state_values = self.initVs()
        self.policy = {}
        self.gamma = discountRate

    def initVs(self):
        state_values = {}
        for state in self.states:
            state_values[state] = 0.0
        return state_values

    def one_step_ahead(self, state):
        """
        Function that calculates the value for all actions in a given state.

        Args:
            state: state to be considered
        Returns:
            A dictionary with keys the actions that can be taken and as values
            the expected value of each action
        """
        action_values = {}
        for action in self.Actions:
            transition_prob = self.MDP.probNextStates(state, action)
            total = 0.0
            for next_state, probability in transition_prob.items():
                reward = self.MDP.getRewards(state, action, next_state)
                total += probability * (reward + self.gamma * self.state_values[next_state])
            action_values[action] = total
        return action_values

    def BellmanUpdate(self):
        for state in self.states:
            Action_values = self.one_step_ahead(state)
            max_value = max(Action_values.values())
            self.state_values[state] = max_value
            actions = []
            for action in self.Actions:
                if Action_values[action] == max_value:
                    actions.append(action)
            self.policy[state] = actions
        return (self.state_values, self.policy)
class BellmanDPSolver(object):
    def __init__(self, discount_rate):
        self.mdp = MDP()
        self.actions = self.mdp.A
        self.gamma = discount_rate
        self.policy = {}
        self.current_position = -1

    def initVs(self):
        # value function and policy over every cell of the 5x5 pitch plus the
        # two absorbing states
        states = [(x, y) for x in range(5) for y in range(5)] + ['GOAL', 'OUT']
        self.values = {s: 0 for s in states}
        self.policy = {s: [] for s in states}

    def BellmanUpdate(self):
        for init_state in self.values:
            best_value = None
            for action in self.actions:
                temp = 0
                # transition table p(s' | s, a)
                next_states = self.mdp.probNextStates(init_state, action)
                for new_state, prob in next_states.items():
                    temp += prob * (self.mdp.getRewards(init_state, action, new_state)
                                    + self.gamma * self.values[new_state])
                if best_value is None or temp >= best_value:
                    best_value = temp
            self.values[init_state] = best_value
            # greedily compute the new policy
            policy_list = []
            for action in self.actions:
                temp = 0
                next_states = self.mdp.probNextStates(init_state, action)
                for new_state, prob in next_states.items():
                    temp += prob * (self.mdp.getRewards(init_state, action, new_state)
                                    + self.gamma * self.values[new_state])
                if temp == best_value:
                    policy_list.append(action)
            self.policy[init_state] = policy_list
        return self.values, self.policy
def fullMDP(self):
    rewards = np.zeros(self.nstates * self.nactions)
    for i in range(self.nrewardfactors):
        rewards = rewards + self.rewardstruct[i].mapping @ self.rewardstruct[i].params
    kernel = np.ones((self.nstates * self.nactions, self.nstates))
    for i in range(self.nstatefactors):
        kernel = kernel * (self.transitionstruct[i].mapping
                           @ self.transitionstruct[i].params
                           @ self.statemappings[i])
    return MDP(self.nstates, self.nactions, rewards / self.nrewardfactors, kernel)
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.action = self.MDP.A
        self.discountRate = discountRate
        self.Values = {}
        self.Values['GOAL'] = 0
        self.Values['OUT'] = 0
        self.Policy = {}
        self.initVs()

    def initVs(self):
        for x in range(5):
            for y in range(5):
                self.Values[(x, y)] = 0
                self.Policy[(x, y)] = [
                    "DRIBBLE_UP", "DRIBBLE_DOWN", "DRIBBLE_LEFT",
                    "DRIBBLE_RIGHT", "SHOOT"
                ]

    def BellmanUpdate(self):
        values = {}
        for i in range(5):
            for j in range(5):
                values[(i, j)] = -np.inf
                for action in self.action:
                    tmp_values = 0.0
                    nextstateprob = self.MDP.probNextStates((i, j), action)
                    for nextstate in nextstateprob.keys():
                        reward = self.MDP.getRewards((i, j), action, nextstate)
                        prob = nextstateprob[nextstate]
                        tmp_values = tmp_values + (
                            prob * (reward + self.discountRate * self.Values[nextstate]))
                    if values[(i, j)] < tmp_values:
                        values[(i, j)] = tmp_values
                        self.Policy[(i, j)] = [action]
                    elif values[(i, j)] == tmp_values:
                        self.Policy[(i, j)].append(action)
                self.Values[(i, j)] = values[(i, j)]
        return self.Values, self.Policy
class BellmanDPSolver(object):
    def __init__(self, discount=0.9, theta=1e-4):
        self.MDP = MDP()
        self.discount = discount
        self.theta = theta
        self.initval, self.policy = self.initVs()

    def initVs(self):
        initval = {}
        policy = {}
        L1 = self.MDP.S
        for i in L1:
            initval[i] = 0
            # all the actions
            policy[i] = self.MDP.A
        return initval, policy

    def BellmanUpdate(self):
        for states in self.MDP.S:
            nextV = {}
            for action in self.MDP.A:
                nextStateProb = self.MDP.probNextStates(states, action)
                value = 0
                for nextsta in nextStateProb:
                    immr = self.MDP.getRewards(states, action, nextsta)
                    value += nextStateProb[nextsta] * (
                        immr + self.discount * self.initval[nextsta])
                nextV[action] = value
            self.initval[states] = max(nextV.values())
            # select the corresponding optimal actions and fill in the policy dict
            self.policy[states] = [
                key for key, value in nextV.items()
                if value == max(nextV.values())
            ]
        return self.initval, self.policy
def start_grid_mdp():
    """ starts the program, restarts if the user wants to """
    grid = load_grid(get_file_path())
    world = GridWorld(grid)
    move_costs = get_move_cost()
    gamma = get_gamma()
    eval_steps = get_evaluation_steps()
    MDP(world, eval_steps, gamma, move_costs)
    if start_again():
        start_grid_mdp()
def test_add_action(self):
    """
    Test that you can add an action. Named actions make more sense than
    named states.
    """
    mdp = MDP()
    mdp.add_action(0)
    self.assertEqual(mdp.num_actions(), 1)
    self.assertEqual(type(mdp.get_action(0)), Action)
    self.assertIn(mdp.get_action(0), mdp.get_action_list())
def initialise_mdp(self, state):
    try:
        blocks = []
        for prop in state.initial_state.block_properties:
            blocks.append(Block(prop.label, prop.shape, prop.colour, prop.size))
        start_config = state.initial_state.configuration.config
        startingState = State(0, start_config)
        self.initialise_lists()
        self.success_config[-1].append(startingState)
        label = len(self.mdp_list[-1])
        print ""
        print label
        print ""
        mdp = MDP(label, blocks)
        mdp.statelist.append(startingState)
        mdp.initMDP(startingState)
        self.mdp_list[-1].append(mdp)
        print "MDP initialised"
        return True
    except:
        return False
def GLIE(process: MDP, env: Environment, n_iter: int = 5000, eps: float = 0.01):
    Q_value = np.zeros((process.nb_states, process.nb_actions))
    count_state_action = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):
        epsilon = 1 / i
        policy = process.get_Q_policy(Q_value, epsilon)
        env.policy = policy
        states, actions, returns = env.generate_episode()
        G = 0
        for j in range(len(returns) - 1, 0, -1):
            G = process.disc_fact * G + returns[j]
            current_state = states[j]
            current_action = actions[j]
            count_state_action[current_state.index, current_action.index] += 1
            Q_value[current_state.index, current_action.index] += \
                (G - Q_value[current_state.index, current_action.index]) \
                / count_state_action[current_state.index, current_action.index]
    return process.get_Q_policy(Q_value)
def test_mdp_size(self):
    """
    Probably nice to be able to tell the size of the MDP. If not only
    for tests.
    """
    mdp = MDP()
    self.assertEqual(mdp.num_states(), 0)
    mdp = MDP(5)
    self.assertEqual(mdp.num_states(), 5)
    mdp = MDP()
    mdp.add_state(0)
    mdp.add_state(1)
    self.assertEqual(mdp.num_states(), 2)
def Q_learning(process: MDP, env: Environment, lambda_: float = 0.7,
               alpha: float = 0.01, n_iter: int = 5000, max_ep_len: int = 200):
    Q_value = np.zeros((process.nb_states, process.nb_actions))
    for i in range(1, n_iter + 1):
        epsilon = 1 / i
        current_state = env.get_random_state()
        counter = 1
        ep_finished = False
        while not ep_finished:
            # ---- Update Policy ----
            policy = process.get_Q_policy(Q_value, epsilon)
            env.policy = policy
            # ---- MDP stepping ----
            current_action = env.generate_action(current_state)
            reward = env.generate_return(current_state, current_action)
            next_state = env.step(current_state, current_action)
            # ---- Updating Q_value function ----
            Q_value[current_state.index, current_action.index] += alpha * (
                reward
                + process.disc_fact * Q_value[next_state.index, :].max()
                - Q_value[current_state.index, current_action.index])
            current_state = next_state
            # ---- Stop Condition ----
            counter += 1
            if next_state.terminal:
                ep_finished = True
            if counter > max_ep_len:
                ep_finished = True
    return process.get_Q_policy(Q_value)
def test_add_transition(self):
    """
    S, A, S', P

    Takes in a state, one of the actions, the state that we'll end up in
    after taking an action, and the probability that this transition
    occurs.
    """
    mdp = MDP(5)
    mdp.add_action(0)
    s = mdp.get_state(0)
    a = mdp.get_action(0)
    s_prime = mdp.get_state(1)
    mdp.add_transition(s, a, s_prime, 1.0)
    transition_function = mdp.get_transition_function()
    self.assertEqual(s.id, a.id)
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.dr = discountRate
        self.S = [(x, y) for x in range(5) for y in range(5)]
        self.S.append("GOAL")
        self.S.append("OUT")
        self.A = ["DRIBBLE_UP", "DRIBBLE_DOWN", "DRIBBLE_LEFT", "DRIBBLE_RIGHT", "SHOOT"]
        self.oppositions = [(2, 2), (4, 2)]
        self.goalProbs = [[0.0, 0.0, 0.0, 0.0, 0.0],
                          [0.0, 0.0, 0.0, 0.0, 0.0],
                          [0.0, 0.0, 0.0, 0.0, 0.0],
                          [0.0, 0.3, 0.5, 0.3, 0.0],
                          [0.0, 0.8, 0.9, 0.8, 0.0]]
        self.vs = {}
        self.act = {}

    def initVs(self):
        for s in self.S:
            self.vs[s] = 0.0

    def BellmanUpdate(self):
        for s in self.S:
            value_max = []
            action_max = []
            for a in self.A:
                nextState = self.MDP.probNextStates(s, a)
                value_a = 0.0
                for s_next, prob in nextState.items():
                    reward = self.MDP.getRewards(s, a, s_next)
                    value_next = self.vs[s_next]
                    value_a += prob * (reward + self.dr * value_next)
                value_max.append(value_a)
            self.vs[s] = np.max(value_max)
            for i in range(len(self.A)):
                if value_max[i] == np.max(value_max):
                    action_max.append(self.A[i])
            self.act[s] = action_max
        return self.vs, self.act
def valueIterationActions(MDP, gamma, delta):
    """
    The value iteration algorithm calculates the utility of each state in
    the MDP. This utility is then used to determine which action a robot
    should take, given its current state and the possible actions from that
    state. Once the change in utility is less than delta, the while loop
    terminates.
    """
    # this will hold all utility information
    U = util.Counter()
    allActions = MDP.get_actions()
    # when this flag becomes False, the while loop terminates
    keep_iterating = True
    while keep_iterating:
        maxDeltU = 0
        # loop through each state on each iteration
        for mdpState in MDP.states:
            # will store the maximum utility out of all possible actions
            maxVal = 0
            for action in allActions[mdpState]:
                # sum the utilities over all possible next states for this action
                total = 0
                for (nextState, prob) in MDP.transModel[(mdpState, action)].items():
                    total += prob * (MDP.calc_rewards(mdpState, nextState)
                                     + gamma * U[nextState])
                maxVal = max(maxVal, total)
            Uprev = U[mdpState]
            # update the utility for this state
            U[mdpState] = maxVal
            # track how much the utility of this state changed
            deltU = abs(Uprev - U[mdpState])
            maxDeltU = max(maxDeltU, deltU)
        # if no utility value changed by more than delta, stop iterating
        if maxDeltU < delta:
            keep_iterating = False
    return U
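valueIterationActions above indexes MDP.transModel by (state, action) pairs and treats each entry as a mapping from next states to probabilities, with MDP.get_actions() giving a per-state action list. The fragment below is only a hypothetical illustration of that shape; the grid coordinates and action names are invented and not taken from the original code.

# Hypothetical shape of the data consumed by valueIterationActions above.
example_transModel = {
    # (state, action) -> {next_state: probability}
    ((0, 0), "EAST"): {(0, 1): 0.8, (0, 0): 0.2},
    ((0, 0), "SOUTH"): {(1, 0): 0.8, (0, 0): 0.2},
}
# what MDP.get_actions() presumably returns: available actions per state
example_actions = {(0, 0): ["EAST", "SOUTH"]}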
def test_get_state(self):
    """
    Should probably be some nice way to get states by a state id or
    state name.
    """
    # test that you can get a state by numerical id
    mdp = MDP()
    mdp.add_state(0)
    self.assertEqual(type(mdp.get_state(0)), State)
    self.assertIn(mdp.get_state(0), mdp.get_state_list())
def get_lily_pads_mdp(n: int) -> MDP:
    data = {
        i: {
            'A': ({i - 1: i / n, i + 1: 1. - i / n},
                  1 / n if i == n - 1 else 0.),
            'B': ({j: 1 / n for j in range(n + 1) if j != i}, 1 / n)
        }
        for i in range(1, n)
    }
    data[0] = {'A': ({0: 1.}, 0.), 'B': ({0: 1.}, 0.)}
    data[n] = {'A': ({n: 1.}, 0.), 'B': ({n: 1.}, 0.)}
    gamma = 1.0
    return MDP(data, gamma)
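For reference, the nested structure that get_lily_pads_mdp builds for n = 2, written out by hand from the comprehension above: each state maps each action to a (next-state distribution, expected reward) pair, and gamma is 1.0.

# Equivalent to the `data` dict inside get_lily_pads_mdp(2):
lily_pads_n2 = {
    1: {'A': ({0: 0.5, 2: 0.5}, 0.5),   # i == n - 1, so the reward is 1/n
        'B': ({0: 0.5, 2: 0.5}, 0.5)},
    0: {'A': ({0: 1.0}, 0.0), 'B': ({0: 1.0}, 0.0)},  # absorbing end states
    2: {'A': ({2: 1.0}, 0.0), 'B': ({2: 1.0}, 0.0)},
}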
def generateMDPs(self):
    self.mdps = []
    results = []
    p = Pool(processes=(self.num_agents + 2))
    for i in xrange(0, self.num_agents):
        print "Generating MDP for Agent" + str(i)
        a = MDP(i, self.config)
        self.mdps.append(a)
    res = p.amap(self._instance_method_alias_call, self.mdps)
    self.mdps = res.get()
    total = 0
    for m in self.mdps:
        total += m.numberVariables
    print "Total Number of Variables: ", total
def test_3states_1action_init2(self):
    self.assertMDP(
        MDP(transition=[{"a1": {1: 1}},
                        {"a1": {2: 1}},
                        {"a1": {0: .2, 1: .8}}],
            rewards=[{"a1": {}},
                     {"a1": {}},
                     {"a1": {0: 10}}],
            gamma=0.9,
            default_reward=0),
        {0: 2.891, 1: 4.018, 2: 4.804},
        {0: 'a1', 1: 'a1', 2: 'a1'},
        init={0: -7, 2: -9},  # test with dict init (+ bad init values)
        max_rounds=10,
        places=3)
def runValueIteration(world, robot1, robot2):
    """
    This function creates robot states, actions, a transition model, and a
    gamma. Then, using these models, it creates 2 robot classes and develops
    an MDP for the two robots. Finally, it runs a value iteration algorithm
    on the MDP to develop a utility function for each MDP state.

    :param world:
    :param robot1:
    :param robot2:
    :return:
    """
    # MDP object
    mdp = MDP(world, robot1, robot2)
    # utility function for the MDP
    U = valueIteration(mdp, world.gamma, world.delta)
    return U
def main(): if len(sys.argv) < 4: print >> sys.stderr, "Usage: Simulator.py\t<MDP.txt>\t<RandomPolicy|OptimalPolicy>\t<Horizon>" sys.exit(-1) filename = sys.argv[1] transition_p, rewards, nStates, nActions = ReadMDP(filename) horizon = int(sys.argv[3]) states = range(nStates) actions = range(nActions) initial_state = 0 user_policy = sys.argv[2] transition_function = Transition(transition_p) MyMDP = MDP(states, actions, transition_function, rewards, initial_state, horizon) policy = ValueIterationPolicy(MyMDP, user_policy, horizon) policy.display_policy() print "" policy.display_value_f()
def run_training(MDP, policy, horizon):
    t = 0
    trajectory = []
    while t < horizon:
        if MDP.get_parked():
            # we need to make it do one more update
            action = policy.choose_training_action()
            state = MDP.get_state()
            trajectory.append((state, action, MDP.get_state()))
            # reset our simulator
            MDP.reset()
            policy.q_updates(trajectory)
            trajectory = []
        else:
            # record trajectory
            action = policy.choose_training_action()
            state = MDP.get_state()
            policy.take_action(action)
            trajectory.append((state, action, MDP.get_state()))
        t += 1
def test_add_string_action(self):
    mdp = MDP()
    mdp.add_action("jump")
    self.assertEqual(mdp.num_actions(), 1)
    self.assertEqual(type(mdp.get_action("jump")), Action)
    self.assertIn(mdp.get_action("jump"), mdp.get_action_list())
def run_simulation(MDP, policy):
    #print "Starting simulation for given MDP"
    while not MDP.get_parked():
        action = policy.choose_action(MDP.get_time())
        #print "[TIME", MDP.get_time(), "]:", policy.get_name(), "chose action", action
        policy.take_action(action)
def test_get_num_actions(self):
    """
    It will be helpful to be able to return the number of distinct actions
    that an MDP has.
    """
    mdp = MDP()
    mdp.add_action(0)
    mdp.add_action(1)
    mdp.add_action(2)
    mdp.add_action(3)
    mdp.add_action(4)
    mdp.add_action(5)
    mdp.add_action(6)
    action_list = mdp.get_action_list()
    self.assertEqual(len(action_list), 7)
    self.assertIn(mdp.get_action(0), action_list)
        value_fcn = pickle.load(handle)

        # set up grid world mdp
        # (older single-goal construction kept for reference)
        # grid_mdp = GridWorldMDP(map_struct['seed_map'], map_struct['goal'])
        grid_mdp = GridWorldMDP(map_struct['seed_map'], map_struct['goal'],
                                map_struct['start'],
                                map_struct['bridge_probabilities'],
                                map_struct['bridge_locations'])
        init_value = {}
        for s in grid_mdp.states:
            init_value[s.tostring()] = np.linalg.norm(s - grid_mdp.goal_state)
        mdp = MDP(grid_mdp.states, grid_mdp.valid_actions_function, grid_mdp.cost_function)
        #value_fcn = mdp.value_iteration(value=value_fcn, plot=True, world_size=50)
        value_fcn = mdp.value_iteration(value=init_value, plot=True, world_size=50)

        # set up dubins astar
        dub = dubins_astar(world_points, value_fcn)
        astar = AStar(motion_primitives, dub.cost_function, dub.heuristic,
                      dub.valid_edge, dub.state_equality, plot=False)
        astar_state = np.array([state['x'], state['y'], state['theta']])
    else:
        following_dist = 0.0
        temp_idx = dub.last_idx
        while following_dist < dub.look_ahead_dist:
            temp_idx -= 1
def test_get_non_existent_state(self):
    mdp = MDP(5)
    with self.assertRaises(IndexError):
        mdp.get_state(11)
def test_add_duplicate_state(self):
    mdp = MDP()
    mdp.add_state(0)
    with self.assertRaises(KeyError):
        mdp.add_state(0)
def test_add_duplicate_action(self):
    mdp = MDP()
    mdp.add_action("jump")
    with self.assertRaises(KeyError):
        # a RuntimeError is probably more appropriate?
        mdp.add_action("jump")
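The unit tests in this collection pin down a construction interface (add_state, add_action, get_state, get_transition_function, KeyError on duplicates, IndexError on missing states) without showing the class itself. The sketch below is one minimal implementation that would satisfy those assertions; it is an assumption for illustration, not the original code.

class State(object):
    def __init__(self, state_id, terminal=False):
        self.id = state_id
        self.terminal = terminal


class Action(object):
    def __init__(self, action_id):
        self.id = action_id


class MDP(object):
    """Minimal sketch satisfying the tests above; not the original implementation."""

    def __init__(self, num_states=0):
        self._states = {i: State(i) for i in range(num_states)}
        self._actions = {}
        # (state_id, action_id) -> {next_state_id: probability}
        self._transitions = {}

    def add_state(self, state_id, terminal=False):
        if state_id in self._states:
            raise KeyError("duplicate state %r" % state_id)
        self._states[state_id] = State(state_id, terminal)

    def add_action(self, action_id):
        if action_id in self._actions:
            raise KeyError("duplicate action %r" % action_id)
        self._actions[action_id] = Action(action_id)

    def get_state(self, state_id):
        if state_id not in self._states:
            raise IndexError("no state %r" % state_id)
        return self._states[state_id]

    def get_action(self, action_id):
        return self._actions[action_id]

    def get_state_list(self):
        return list(self._states.values())

    def get_action_list(self):
        return list(self._actions.values())

    def num_states(self):
        return len(self._states)

    def num_actions(self):
        return len(self._actions)

    def add_transition(self, state, action, next_state, prob):
        self._transitions.setdefault((state.id, action.id), {})[next_state.id] = prob

    def get_transition_function(self):
        return self._transitions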