def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    mdp.computeStates()
    allStates = mdp.states

    # Run value iteration.
    solver = util.ValueIteration()
    solver.solve(mdp)
    optimalVIPolicy = solver.pi

    # Run the Q-learning algorithm and extract its greedy policy.
    ql = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=30000, maxIterations=10000)
    ql.explorationProb = 0.0
    optimalQLPolicy = {state: ql.getAction(state) for state in allStates}

    # Compute some statistics.
    numDifferent = sum(1 for state in allStates
                       if optimalQLPolicy[state] != optimalVIPolicy[state])
    print("{} out of {} states have different actions".format(numDifferent, len(allStates)))
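# Example invocation (a minimal sketch; it assumes the assignment's smallMDP / largeMDP
# instances and the identityFeatureExtractor referenced elsewhere in this file are in scope):
#
#   simulate_QL_over_MDP(smallMDP, identityFeatureExtractor)
#   simulate_QL_over_MDP(largeMDP, identityFeatureExtractor)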
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)  # actions, discount, feature extractor
    util.simulate(mdp, rl, numTrials=30000)
    rl.explorationProb = 0
    valueIter = util.ValueIteration()
    valueIter.solve(mdp)
    numberOfStates = 0
    numberOfDifferentStates = 0
    for state in mdp.states:
        if state not in valueIter.pi:
            print('Pi does not contain state {}'.format(state))
        else:
            if valueIter.pi[state] != rl.getAction(state) and state[2] is not None:
                numberOfDifferentStates += 1
                print('In state {} Pi gives action {}, but RL gives action {}'.format(
                    state, valueIter.pi[state], rl.getAction(state)))
            numberOfStates += 1
    print('\n% of different actions = {}%'.format(numberOfDifferentStates / numberOfStates * 100))
    for weight in rl.weights:
        print('weight ({}) = {}'.format(weight, rl.weights[weight]))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    val = util.ValueIteration()
    val.solve(original_mdp)
    val_policy = val.pi

    RL1 = util.FixedRLAlgorithm(val_policy)
    result1 = util.simulate(modified_mdp, RL1, numTrials=50000, maxIterations=1000,
                            verbose=False, sort=False)
    avg1 = sum(result1) / float(len(result1))
    print(avg1)

    RL2 = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor,
                             explorationProb=0.2)
    result2 = util.simulate(modified_mdp, RL2, numTrials=50000, maxIterations=1000,
                            verbose=False, sort=False)
    avg2 = sum(result2) / float(len(result2))
    print(avg2)
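# Example invocation (a minimal sketch for 4d; the two BlackjackMDP instances mirror the
# original / new-threshold pair constructed in Q4d elsewhere in this file, and
# blackjackFeatureExtractor is assumed to be defined as in the rest of the assignment):
#
#   originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
#   newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=9, peekCost=1)
#   compare_changed_MDP(originalMDP, newThresholdMDP, blackjackFeatureExtractor)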
def simulate_QL_over_MDP(MDP, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    RL = QLearningAlgorithm(MDP.actions, MDP.discount(), featureExtractor, explorationProb=0)
    util.simulate(MDP, RL, numTrials=30000, maxIterations=1000, verbose=False, sort=False)

    MDP.computeStates()
    RL_policy = {}
    for state in MDP.states:
        RL_policy[state] = RL.getAction(state)

    val = util.ValueIteration()
    val.solve(MDP)
    val_policy = val.pi

    sum_ = []
    for key in RL_policy:
        if RL_policy[key] == val_policy[key]:
            sum_.append(1)
        else:
            sum_.append(0)
    print(float(sum(sum_)) / len(RL_policy))
    return RL_policy, val_policy
def test_hidden(self):
    """3a-hidden: Hidden test for ValueIteration.
    Run ValueIteration on BlackjackMDP, then test if V[startState] is correct."""
    mdp = submission.BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3,
                                  threshold=40, peekCost=1)
    startState = mdp.startState()
    alg = util.ValueIteration()
    alg.solve(mdp, .0001)
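# A possible continuation of the hidden test (sketch only: the grader's reference value is not
# shown here, so EXPECTED_V is a hypothetical placeholder, not the real answer):
#
#   self.assertAlmostEqual(alg.V[startState], EXPECTED_V, places=3)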
def test3aHidden():
    mdp = submission.BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3,
                                  threshold=40, peekCost=1)
    startState = mdp.startState()
    alg = util.ValueIteration()
    alg.solve(mdp, .0001)
def Q4c():
    # s = (3, None, (3, 4, 0))
    # fv = blackjackFeatureExtractor(s, 'Take')
    # print("for state %s , action 'Take' ... \n ... feature vector returned: %s" % (s, fv))
    print("Comparing value iteration against simulated Q-learning as in 4b, but using the better featureExtractor:")
    phi = blackjackFeatureExtractor
    mdp = smallMDP  # TOGGLE THIS (smallMDP or largeMDP)
    numqtrials = 100  # CHANGE THIS: e.g. 10, 10000, 300000
    print("...comparison for %s x %s MDP; Q-learning numtrials : %s" %
          (mdp.cardValues, mdp.multiplicity, numqtrials))

    # Value iteration:
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(mdp)  # algorithm applied to the MDP problem

    # Q-learning simulation:
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.2)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # list of total rewards for each trial
    print(" ........ # non-zero weights = %s" % sum(1 for k, v in rl.weights.items() if v))
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
                   for a in rl.actions(mdp.startState()))

    print("\n...Comparison of Vopt : ")
    print(" ... value iteration = expected optimal PV :: optimal utility of startState, stdev: ( %s, 0 )" %
          solver.V[mdp.startState()])
    print(" ... q-learning: avg PV :: utility, stdev over all trials: ( %s, %s ) (see note * below)" %
          (statistics.mean(totPVs), statistics.stdev(totPVs)))
    print(" ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est)
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    print("\n...Comparison of policies (rerun with explorationProb = 0) : ")
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # reruns simulation
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
                   for a in rl.actions(mdp.startState()))
    print(" ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est)

    diffs = 0  # counts number of differences in policy between VI and QL
    for s, p in solver.pi.items():  # using the value-iteration policy as the starting point
        rlp = max((dotProduct(rl.weights, dict(phi(s, a))), a) for a in rl.actions(s))[1]
        if rlp != p:
            diffs += 1
            print("rlp : %s does not equal VIp : %s for state %s" % (rlp, p, s))
    print("number of different policies btw VI and QL, out of total : %s / %s = %4.2f" %
          (diffs, len(solver.pi), diffs / (1.0 * len(solver.pi))))
def test_util():
    print("Testing util module : ")
    print("...creating simple mdp instance ... ")
    mdp = util.NumberLineMDP()  # instance of an MDP problem
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(mdp)  # algorithm applied to the MDP problem
    print("Vopt : %s " % solver.V)
    print("optimal_policy : %s " % solver.pi)
    print("... done test_util.\n")
def simulate_QL_over_MDP(mdp, featureExtractor, verbose=False):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # 4b: with identityFeatureExtractor, RL performs poorly because the chosen feature function
    # Phi generalizes particularly badly (it is an indicator function of (s, a)).
    # BEGIN_YOUR_CODE
    print("simulate_QL_over_MDP")

    # Solve via value iteration
    vi = util.ValueIteration()
    vi.solve(mdp, .0001)
    pi_vi = vi.pi  # pi computed with value iteration
    if verbose:
        print('len pi_vi : {}'.format(len(pi_vi)))

    # Solve via Q-learning
    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, 0.05)  # better than an exploration rate of 0.2
    util.simulate(mdp, rl, 30000)
    # util.simulate(mdp, rl, numTrials=30000, maxIterations=1000)

    # The full set of states of our MDP is available through mdp.states (an attribute of mdp).
    # This attribute is initialized by the call to computeStates.
    pi_rl = rl.get_pi_opt(mdp.states)  # pi computed with Q-learning (RL)
    if verbose:
        print('len pi_rl : {}'.format(len(pi_rl)))

    if verbose:
        print('pi : ')
        print('Value Iteration')
        print('Reinforcement Learning')
        print('---')
        for state in mdp.states:
            print('{} : {}'.format(state, pi_vi[state]))
            print('{} : {}'.format(state, pi_rl[state]))
            print('---')

    print('Stats')
    print('Number of possible states : ', len(mdp.states))
    equal = 0.
    for state in mdp.states:  # keys of pi_rl (a subset of pi_vi, since pi_vi is exhaustive)
        if pi_vi[state] == pi_rl[state]:
            equal += 1
    print('Matching actions : {0:.2f} %'.format(equal / len(mdp.states) * 100))
    print('---')
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    print("compare_changed_MDP")

    # Solve via value iteration
    # pi_original_mdp
    vio = util.ValueIteration()
    vio.solve(original_mdp, .0001)
    pi_original_mdp = vio.pi  # pi computed with value iteration
    print('Reward on original_mdp computed via VI for startState : ')
    print(vio.V[original_mdp.startState()])

    # modified_mdp
    vim = util.ValueIteration()
    vim.solve(modified_mdp, .0001)
    pi_modified_mdp = vim.pi  # pi computed with value iteration
    print('Reward on modified_mdp computed via VI for startState : ')
    print(vim.V[modified_mdp.startState()])

    # Exploit the policy pi computed on original_mdp by applying it, through simulation,
    # to the new MDP modified_mdp.
    fixed_rl = util.FixedRLAlgorithm(pi_original_mdp)
    # totalRewards = util.simulate(newThresholdMDP, fixed_rl, 30000)
    totalRewards = util.simulate(modified_mdp, fixed_rl, numTrials=30000, maxIterations=1000,
                                 verbose=False, sort=False)
    print('Average reward on the new MDP using the old MDP\'s pi : ')
    print(sum(totalRewards) / len(totalRewards))

    # Solve via Q-learning on original_mdp
    original_mdp.computeStates()
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor, 0.2)
    totalRewards = util.simulate(original_mdp, rl, 30000)
    print('Average reward on the old MDP using RL: ')
    print(sum(totalRewards) / len(totalRewards))

    totalRewards = util.simulate(modified_mdp, rl, 30000)
    print('Average reward on the new MDP using RL trained on the old one: ')
    print(sum(totalRewards) / len(totalRewards))
def Q4d():
    origMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
    newThreshMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=9, peekCost=1)

    # Run VI on the original MDP to obtain a policy:
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(origMDP)  # algorithm applied to the MDP problem
    print(" ... VI Vopt(startState) = %s ." % solver.V[origMDP.startState()])
    pi0 = solver.pi

    # Apply this policy to an agent (in a simulated MDP) playing the **new** MDP:
    numqtrials = 30000
    rl = util.FixedRLAlgorithm(pi0)
    mdp = origMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print(" ... QL: avg PV, stdev using above VI opt policy on same mdp: ( %s, %s ) " %
          (statistics.mean(totPVs), statistics.stdev(totPVs)))
    mdp = newThreshMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print("\n ... QL: avg PV, stdev using above VI opt policy on *NEW* mdp: ( %s, %s ) " %
          (statistics.mean(totPVs), statistics.stdev(totPVs)))

    # Now skip the fixed policy and use QL:
    phi = identityFeatureExtractor  # blackjackFeatureExtractor
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.5)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
                   for a in rl.actions(mdp.startState()))
    print(" ... QL: est. Vopt of startState : %s " % Vopt_est)
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # reruns simulation
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
                   for a in rl.actions(mdp.startState()))
    print(" ... QL: est. Vopt of startState re-run (with eps = 0) : %s " % Vopt_est)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIterOriginal = util.ValueIteration()
    valueIterOriginal.solve(original_mdp)
    fixedRL = util.FixedRLAlgorithm(valueIterOriginal.pi)
    rewards = util.simulate(modified_mdp, fixedRL)
    print("Fixed RL")
    for reward in rewards:
        print(reward)

    rewardsFromQ = util.simulate(modified_mdp,
                                 QLearningAlgorithm(modified_mdp.actions,
                                                    modified_mdp.discount(),
                                                    featureExtractor))
    print('QLearn')
    for reward in rewardsFromQ:
        print(reward)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    ValIter = util.ValueIteration()
    ValIter.solve(original_mdp)
    policyVal = ValIter.pi
    Fixed = util.FixedRLAlgorithm(policyVal)
    print('Rewards for value iteration: ', util.simulate(modified_mdp, Fixed))
    QL = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor,
                            explorationProb=0)
    print('Rewards for Q-learning iteration: ', util.simulate(modified_mdp, QL))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    solver = util.ValueIteration()
    solver.solve(original_mdp)
    optimalVIOriginalMDPPolicy = solver.pi
    fixedRL = util.FixedRLAlgorithm(optimalVIOriginalMDPPolicy)
    rewards = util.simulate(modified_mdp, fixedRL, numTrials=30000, maxIterations=10000)
    print("Sampled average reward for optimal policy from original MDP is {}.".format(
        sum(rewards) / float(len(rewards))))

    # Train Q-learning.
    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    trainingRewards = util.simulate(modified_mdp, ql, numTrials=30000, maxIterations=10000)
    print("Sampled average reward for Q-Learning during training is {}.".format(
        sum(trainingRewards) / float(len(trainingRewards))))

    ql.explorationProb = 0.0
    modified_mdp.computeStates()
    learnedQLPolicy = {state: ql.getAction(state) for state in modified_mdp.states}
    fixedQLRL = util.FixedRLAlgorithm(learnedQLPolicy)
    rewardsQL = util.simulate(modified_mdp, fixedQLRL, numTrials=30000, maxIterations=10000)
    print("Sampled average reward for policy learned directly on new problem with Q-Learning is {}.".format(
        sum(rewardsQL) / float(len(rewardsQL))))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = util.ValueIteration()
    vi.solve(original_mdp)

    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000):
        summ += i
        events += 1
    print(summ * 1.0 / events)

    Qlearning = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor)
    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, Qlearning, 100):
        summ += i
        events += 1
    print(summ * 1.0 / events)
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    QL = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, explorationProb=0)
    util.simulate(mdp, QL, numTrials=30000, maxIterations=1000)

    # Extract the policy learned by Q-learning over all states.
    mdp.computeStates()
    policyQL = {}
    for s in mdp.states:
        policyQL[s] = QL.getAction(s)

    # Value iteration
    ValIter = util.ValueIteration()
    ValIter.solve(mdp)
    policyVal = ValIter.pi

    Intersection = [1 if policyQL[k] == policyVal[k] else 0 for k in policyQL]
    print('MDP accuracy is: ', sum(Intersection) / len(policyQL))
    return
def main():
    vi = util.ValueIteration()
    vi.solve(MDP())
            num_all_cards = sum(deck)
            for idx, num in enumerate(deck):
                if num == 0:
                    continue
                prob = num / num_all_cards
                succ_prob_reward_list.append(((card_sum, idx, deck), prob, -self.peekCost))  # HINT: has the form (new_card_sum, new_peek_idx, new_deck)
            # ---------------------------------------- Peek implementation
        elif action == 'Quit':
            succ_prob_reward_list.append(((card_sum, None, None), 1, card_sum))
        else:
            raise ValueError("Undefined action '{}'".format(action))
        return succ_prob_reward_list
        # END_YOUR_CODE

    def discount(self):
        return 1


if __name__ == '__main__':
    mdp = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
    algorithm = util.ValueIteration()
    algorithm.solve(mdp, verbose=0)
    for s in algorithm.pi:
        print(f'pi({s}) = {algorithm.pi[s]}')
def Q4b():
    print("Comparing value iteration against simulated Q-learning :")
    mdp = largeMDP  # TOGGLE THIS (smallMDP or largeMDP)
    numqtrials = 30000  # CHANGE THIS: e.g. 10, 10000, 300000
    print("...comparison for %s x %s MDP; Q-learning numtrials : %s" %
          (mdp.cardValues, mdp.multiplicity, numqtrials))

    # Value iteration
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(mdp)  # algorithm applied to the MDP problem

    # Q-learning simulation:
    phi = identityFeatureExtractor
    # phi = blackjackFeatureExtractor
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.2)
    # simulate_QL_over_MDP(mdp, rl)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # list of total rewards for each trial
    # print(" ........ totPVs : %s " % totPVs)
    print(" ........ # non-zero weights = %s" % sum(1 for k, v in rl.weights.items() if v))
    Vopt_est = max(rl.weights[(mdp.startState(), a)] for a in rl.actions(mdp.startState()))

    print("...Comparison of Vopt : ")
    print(" ... value iteration = expected optimal PV :: optimal utility of startState, stdev: ( %s, 0 )" %
          solver.V[mdp.startState()])
    print(" ... q-learning: avg PV :: utility, stdev over all trials: ( %s, %s ) (see note * below)" %
          (statistics.mean(totPVs), statistics.stdev(totPVs)))
    print(" ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est)
    # plotQL(totPVs)

    print("...Comparison of policies (rerun with explorationProb = 0) : ")
    # Rerun QL now with 0 exploration prob (since learned)
    rl.explorationProb = 0
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # reruns simulation
    Vopt_est = max(rl.weights[(mdp.startState(), a)] for a in rl.actions(mdp.startState()))
    print(" ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est)
    print(" ... # non-zero weights = %s" % sum(1 for k, v in rl.weights.items() if v))

    # Sample weights:
    # s = mdp.startState()
    # print("weights for startState : %s" % {k: v for k, v in rl.weights.items() if k[0] == s})
    # print("--> vip = %s" % max((rl.weights[(s, a)], a) for a in rl.actions(s))[1])

    diffs = 0  # counts number of differences in policy between VI and QL
    for s, p in solver.pi.items():  # using the value-iteration policy as the starting point
        vip = max((rl.weights[(s, a)], a) for a in rl.actions(s))[1]
        if vip != p:
            diffs += 1
    print("number of different policies btw VI and QL, out of total : %s / %s = %4.2f" %
          (diffs, len(solver.pi), diffs / (1.0 * len(solver.pi))))
def mdpsolve(mdp):
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(mdp)  # algorithm applied to the MDP problem
    print("Vopt : %s " % solver.V)
    print("optimal_policy : %s " % solver.pi)
policy_filename = results.output_policy_fn
value_filename = results.output_value_fn

print('loading pkl')
all_flights = {}
with open(r"airport_to_flights_dict.pkl", "rb") as input_file:
    all_flights = pickle.load(input_file)
print('done loading pkl')

np.random.seed(11)
mdp = FlightMDP(initial_origin=initial_origin,
                start_time=datetime.datetime(2015, 1, 11, 8, 30),
                final_destination=destination,
                prune_direct=prune_direct)
alg = util.ValueIteration()
alg.solve(mdp, epsilon)

with open(value_filename, 'wb') as handle:
    pickle.dump(alg.V, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(policy_filename, 'wb') as handle:
    pickle.dump(alg.pi, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('dumped new policies')

print('printing final path')
state = mdp.startState()
path = [(state, None)]
while True:
    print('\n')
    featureKey = (state, action)
    featureValue = 1
    return [(featureKey, featureValue)]

############################################################
# Problem 4b: convergence of Q-learning

# Small test case
smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
smallMDP.computeStates()

ValueIterationSolution = util.ValueIteration()
ValueIterationSolution.solve(smallMDP)

rl = QLearningAlgorithm(smallMDP.actions, smallMDP.discount(), identityFeatureExtractor)
rl.explorationProb = 0
util.simulate(smallMDP, rl, numTrials=30000, maxIterations=1000, verbose=False, sort=False)

similar = 0.0
total = 0.0
for s in smallMDP.states: