def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ql = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    q_rewards = util.simulate(mdp, ql, 30000)
    avg_reward_q = float(sum(q_rewards)) / len(q_rewards)

    vi = ValueIteration()
    vi.solve(mdp)
    rl = util.FixedRLAlgorithm(vi.pi)
    vi_rewards = util.simulate(mdp, rl, 30000)
    avg_reward_vi = float(sum(vi_rewards)) / len(vi_rewards)

    ql.explorationProb = 0
    ql_pi = {}
    for state, _ in vi.pi.items():
        ql_pi[state] = ql.getAction(state)

    p_vi = vi.pi
    diff = 0
    for state in vi.pi.keys():
        if vi.pi[state] != ql_pi[state]:
            diff += 1
    print("difference", diff, "over " + str(len(p_vi.keys())) + " states")
    print("percentage diff ", float(diff) / len(p_vi.keys()))
    print("avg_reward_diff", avg_reward_q - avg_reward_vi)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()
    vi = ValueIteration()
    vi.solve(original_mdp)
    rl = util.FixedRLAlgorithm(vi.pi.copy())
    vi_rewards = util.simulate(modified_mdp, rl, numTrials=10000, maxIterations=1000,
                               verbose=False, sort=False)
    #print(vi_rewards)

    modified_mdp.computeStates()
    rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor, 0.2)
    ql_rewards = util.simulate(modified_mdp, rl, numTrials=10000, maxIterations=1000,
                               verbose=False, sort=False)
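# The function above keeps the two reward lists from util.simulate but never reduces them to
# numbers that can be compared in the 4d write-up. A minimal sketch of such a reduction,
# assuming Python 3 and its standard "statistics" module; the helper name summarize_rewards
# is ours, not part of the assignment code.
import statistics

def summarize_rewards(label, rewards):
    # Reduce a list of per-trial total rewards (as returned by util.simulate)
    # to a printable mean and standard deviation.
    print("%s: mean=%.3f, stdev=%.3f over %d trials"
          % (label, statistics.mean(rewards), statistics.pstdev(rewards), len(rewards)))

# Example use with the reward lists computed above:
#   summarize_rewards("fixed VI policy on modified MDP", vi_rewards)
#   summarize_rewards("Q-learning on modified MDP", ql_rewards)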
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    val = util.ValueIteration()
    val.solve(original_mdp)
    val_policy = val.pi

    RL1 = util.FixedRLAlgorithm(val_policy)
    result1 = util.simulate(modified_mdp, RL1, numTrials=50000, maxIterations=1000,
                            verbose=False, sort=False)
    avg1 = sum(result1) / float(len(result1))
    print(avg1)

    RL2 = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor,
                             explorationProb=0.2)
    result2 = util.simulate(modified_mdp, RL2, numTrials=50000, maxIterations=1000,
                            verbose=False, sort=False)
    avg2 = sum(result2) / float(len(result2))
    print(avg2)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    solver = util.ValueIteration()
    solver.solve(original_mdp)
    optimalVIOriginalMDPPolicy = solver.pi
    fixedRL = util.FixedRLAlgorithm(optimalVIOriginalMDPPolicy)
    rewards = util.simulate(modified_mdp, fixedRL, numTrials=30000, maxIterations=10000)
    print("Sampled average reward for optimal policy from original MDP is {}.".format(
        sum(rewards) / float(len(rewards))))

    # Train Q-learning.
    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    trainingRewards = util.simulate(modified_mdp, ql, numTrials=30000, maxIterations=10000)
    print("Sampled average reward for Q-Learning during training is {}.".format(
        sum(trainingRewards) / float(len(trainingRewards))))

    ql.explorationProb = 0.0
    modified_mdp.computeStates()
    learnedQLPolicy = {state: ql.getAction(state) for state in modified_mdp.states}
    fixedQLRL = util.FixedRLAlgorithm(learnedQLPolicy)
    rewardsQL = util.simulate(modified_mdp, fixedQLRL, numTrials=30000, maxIterations=10000)
    print("Sampled average reward for policy learned directly on new problem with Q-Learning is {}.".format(
        sum(rewardsQL) / float(len(rewardsQL))))
def Q4d():
    origMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
    newThreshMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=9, peekCost=1)

    # run VI on original MDP to obtain policy:
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(origMDP)  # algo applied to the MDP problem
    print(" ... VI Vopt(startState) = %s ." % (solver.V[origMDP.startState()]))
    pi0 = solver.pi

    # apply this policy to an agent (in simulated mdp) playing the **new** MDP:
    # (assumes "import statistics" at the top of the file)
    numqtrials = 30000
    rl = util.FixedRLAlgorithm(pi0)
    mdp = origMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print(" ... QL: avg PV, stdev using above VI opt policy on same mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs)))
    mdp = newThreshMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print("\n ... QL: avg PV, stdev using above VI opt policy on *NEW* mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs)))

    # now skip the fixed policy and use QL:
    phi = identityFeatureExtractor  # blackjackFeatureExtractor
    rl = QLearningAlgorithm(actions=mdp.actions, discount=mdp.discount(),
                            featureExtractor=phi, explorationProb=0.5)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print(" ... QL: est. Vopt of startState : %s " % Vopt_est)
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # reruns simulation
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print(" ... QL: est. Vopt of startState re-run (with eps = 0) : %s " % Vopt_est)
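# Q4d() above relies on a dotProduct() helper (and on "import statistics") that is not shown
# here. If dotProduct is not already defined elsewhere in the file, a minimal sparse dot
# product over weight/feature dictionaries could look like the sketch below; this is an
# assumption about the missing helper, not the assignment's own utility code.
def dotProduct(d1, d2):
    # Iterate over the smaller dict; keys missing from the other dict contribute 0.
    if len(d1) > len(d2):
        d1, d2 = d2, d1
    return sum(value * d2.get(key, 0) for key, value in d1.items())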
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(original_mdp)
    rl = util.FixedRLAlgorithm(valueIteration.pi)
    rewards = util.simulate(modified_mdp, rl)
    print(sum(rewards) / len(rewards))

    # Train Q-learning on the original MDP, then measure its reward on the modified MDP.
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor)
    rewards = util.simulate(original_mdp, rl, numTrials=30000)
    rewards = util.simulate(modified_mdp, rl, numTrials=30000)
    print(sum(rewards) / len(rewards))
    # END_YOUR_CODE
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    rewards = util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000)
    print "Expected Reward on modified mdp using original mdp policy: %s" % (
        float(sum(r for r in rewards)) / len(rewards))

    rewards_new = util.simulate(
        modified_mdp,
        QLearningAlgorithm(modified_mdp.actions, original_mdp.discount(), featureExtractor, 0.1),
        10000)
    print "Expected Reward on modified mdp using Q Learning: %s" % (
        float(sum(r for r in rewards_new)) / len(rewards_new))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIterOriginal = util.ValueIteration()
    valueIterOriginal.solve(original_mdp)
    fixedRL = util.FixedRLAlgorithm(valueIterOriginal.pi)
    rewards = util.simulate(modified_mdp, fixedRL)
    print("Fixed RL")
    for reward in rewards:
        print(reward)

    rewardsFromQ = util.simulate(modified_mdp,
                                 QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(),
                                                    featureExtractor))
    print('QLearn')
    for reward in rewardsFromQ:
        print(reward)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    viter = ValueIteration()
    viter.solve(original_mdp)
    fixed_rl = util.FixedRLAlgorithm(viter.pi)
    print "Expected reward value iteration: ", \
        sum(util.simulate(modified_mdp, fixed_rl, numTrials=30000)) / 30000.0

    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    print "Expected reward q-learn: ", \
        sum(util.simulate(modified_mdp, ql, numTrials=30000)) / 30000.0
def problem4d():
    originalMDP.computeStates()
    newThresholdMDP.computeStates()

    vi = ValueIteration()
    vi.solve(originalMDP)
    fixedVi = util.FixedRLAlgorithm(vi.pi)
    vi_reward = util.simulate(newThresholdMDP, fixedVi, numTrials=30000)

    QL = QLearningAlgorithm(newThresholdMDP.actions, newThresholdMDP.discount(),
                            blackjackFeatureExtractor, 0.2)
    util.simulate(newThresholdMDP, QL, numTrials=30000)
    QL.explorationProb = 0.0
    QLreward = util.simulate(newThresholdMDP, QL, numTrials=1000)

    print('\n 4d now:')
    print('Value Iteration Reward:{}'.format(sum(vi_reward) / float(len(vi_reward))))
    print('Q-learn Reward:{}'.format(sum(QLreward) / float(len(QLreward))))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    ValIter = ValueIteration()
    ValIter.solve(original_mdp, .0001)
    pi_val = ValIter.pi
    fix_policy = util.FixedRLAlgorithm(pi_val)
    old_reward = util.simulate(modified_mdp, fix_policy, 30000, 1000, False, False)

    RL = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor)
    new_reward = util.simulate(modified_mdp, RL, 30000, 1000, False, False)

    print("Reward from old policy:", sum(old_reward),
          "\nReward from new QL policy:", sum(new_reward))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    fixedRLAlgorithm = util.FixedRLAlgorithm(vi.pi)
    num_trials = 90
    total_rewards = util.simulate(modified_mdp, fixedRLAlgorithm, num_trials)
    expected_reward_fixed_rl = sum(total_rewards) / len(total_rewards)

    ql = QLearningAlgorithm(actions=modified_mdp.actions, discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    util.simulate(modified_mdp, ql, numTrials=30000, maxIterations=1000)
    ql.explorationProb = 0
    total_rewards = util.simulate(modified_mdp, ql, num_trials)
    expected_reward_ql = sum(total_rewards) / len(total_rewards)

    print("Expected reward fixed rl: " + str(expected_reward_fixed_rl))
    print("Expected reward ql: " + str(expected_reward_ql))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(original_mdp)
    orig_pi = valiter.pi
    print valiter.pi

    vi_rl = util.FixedRLAlgorithm(orig_pi)
    vi_result = util.simulate(modified_mdp, vi_rl, 30000, verbose=False)

    orig_rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(),
                                 featureExtractor, explorationProb=0.2)
    orig_result = util.simulate(modified_mdp, orig_rl, 30000, verbose=False)
    return vi_result, orig_result
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    print("compare_changed_MDP")

    # Solve via Value Iteration
    # pi_original_mdp
    vio = util.ValueIteration()
    vio.solve(original_mdp, .0001)
    pi_original_mdp = vio.pi  # pi computed with value iteration
    print 'Expected reward of original_mdp computed via VI for startState:'
    print vio.V[original_mdp.startState()]

    # modified_mdp
    vim = util.ValueIteration()
    vim.solve(modified_mdp, .0001)
    pi_modified_mdp = vim.pi  # pi computed with value iteration
    print 'Expected reward of modified_mdp computed via VI for startState:'
    print vim.V[modified_mdp.startState()]

    # Exploit the policy pi defined on original_mdp
    # by applying it, via simulation, to the new MDP modified_mdp
    fixed_rl = util.FixedRLAlgorithm(pi_original_mdp)
    # totalRewards = util.simulate(newThresholdMDP, fixed_rl, 30000)
    totalRewards = util.simulate(modified_mdp, fixed_rl, numTrials=30000, maxIterations=1000,
                                 verbose=False, sort=False)
    print("Average reward on the new MDP using the old MDP's pi:")
    print sum(totalRewards) / float(len(totalRewards))

    # Solve via Q-learning on original_mdp
    original_mdp.computeStates()
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor, 0.2)
    totalRewards = util.simulate(original_mdp, rl, 30000)
    print("Average reward on the old MDP using RL:")
    print sum(totalRewards) / float(len(totalRewards))

    totalRewards = util.simulate(modified_mdp, rl, 30000)
    print("Average reward on the new MDP using RL trained on the old one:")
    print sum(totalRewards) / float(len(totalRewards))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = util.ValueIteration()
    vi.solve(original_mdp)
    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000):
        summ += i
        events += 1
    print(summ * 1.0 / events)

    Qlearning = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor)
    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, Qlearning, 100):
        summ += i
        events += 1
    print(summ * 1.0 / events)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()  # to get the whole state space of the MDP
    modified_mdp.computeStates()
    algorithm = ValueIteration()
    algorithm.solve(original_mdp, .001)
    # algorithm.solve(modified_mdp, .001)
    frl = util.FixedRLAlgorithm(algorithm.pi)

    random.seed(123)
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30)
    print("*** Expected return for FixedRLAlgorithm (numTrials=30): %.4f \t***" %
          (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30000)
    print("*** Expected return for FixedRLAlgorithm (numTrials=30000): %.4f \t***" %
          (sum(totalRewards) / len(totalRewards)))

    random.seed(123)
    rlQ = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor)
    # Model-free: the first 30 of 30000 total training trials; after all of them,
    # Q-learning has been learned.
    totalRewards = util.simulate(mdp=modified_mdp, rl=rlQ, numTrials=30)
    print("*** Expected return for QLearningRLAlgorithm (numTrials=30): %.4f \t ***" %
          (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=rlQ, numTrials=29970)
    print("*** Expected return for QLearningRLAlgorithm (numTrials=30000): %.4f \t ***" %
          (sum(totalRewards) / len(totalRewards)))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    value_iteration = ValueIteration()
    value_iteration.solve(original_mdp)
    fixed_RL = util.FixedRLAlgorithm(value_iteration.pi)
    rewards1 = util.simulate(modified_mdp, fixed_RL, numTrials=50000, verbose=False, sort=False)

    q_learning = QLearningAlgorithm(actions=modified_mdp.actions,
                                    discount=modified_mdp.discount(),
                                    featureExtractor=featureExtractor)
    rewards2 = util.simulate(modified_mdp, q_learning, numTrials=50000, verbose=False, sort=False)

    print('fixed_RL reward :', sum(rewards1) / len(rewards1))
    print('q_learning reward :', sum(rewards2) / len(rewards2))
    features.append((key3, 1))
    return features
    # END_YOUR_CODE

############################################################
# Problem 4d: changing mdp

# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)

originalMDP.computeStates()
algo = ValueIteration()
algo.solve(originalMDP)
algorithm = util.FixedRLAlgorithm(algo.pi)
viRewards = util.simulate(newThresholdMDP, algorithm)
print(sum(viRewards) / float(len(viRewards)))

algorithm = QLearningAlgorithm(newThresholdMDP.actions, newThresholdMDP.discount(),
                               identityFeatureExtractor, 0.2)
qlRewards = util.simulate(newThresholdMDP, algorithm)
print(sum(qlRewards) / float(len(qlRewards)))
    feature = [(('totalAndAction', total, action), 1)]
    if counts is not None:
        featureKey = map(lambda x: 1 if x != 0 else 0, counts)
        feature += [(tuple(['cardPresence'] + featureKey + [action]), 1)]
        for i in range(len(counts)):
            featureKey = ('cardAndAction', i, counts[i], action)
            feature += [(featureKey, 1)]
    return feature
    # END_YOUR_CODE

############################################################
# Problem 4d: What happens when the MDP changes underneath you?!

# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)

piVI = simulateVI(originalMDP)
fixedRL = util.FixedRLAlgorithm(piVI)
rewards = util.simulate(newThresholdMDP, fixedRL)
print 'Rewards for value iteration in 10 trials:'
print rewards

newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)
QLAlgorithm = QLearningAlgorithm(newThresholdMDP.actions, newThresholdMDP.discount(),
                                 identityFeatureExtractor)
rewards = util.simulate(newThresholdMDP, QLAlgorithm)
print 'Rewards for Q-learning in 10 trials:'
print rewards
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)

ValueIterationSolution = util.ValueIteration()
ValueIterationSolution.solve(originalMDP)
policy = ValueIterationSolution.pi
rl1 = util.FixedRLAlgorithm(policy)
print policy
total = 10000
print float(
    sum(
        util.simulate(newThresholdMDP, rl1, numTrials=total, maxIterations=1000,
                      verbose=False, sort=False))) / total

rl2 = QLearningAlgorithm(originalMDP.actions, newThresholdMDP.discount(), identityFeatureExtractor)
print float(
    sum(
        util.simulate(newThresholdMDP, rl2, numTrials=total, maxIterations=1000,
                      verbose=False, sort=False))) / total
import util
import submission

vi = submission.ValueIteration()
vi.solve(submission.originalMDP)
fixedRLA = util.FixedRLAlgorithm(vi.pi)
rewards = util.simulate(submission.newThresholdMDP, fixedRLA, numTrials=30000)
print "average utility (original-MDP VI policy on new MDP) " + str(sum(rewards) / float(len(rewards)))
rewards = util.simulate(submission.originalMDP, fixedRLA, numTrials=30000)
print "average utility (original-MDP VI policy on original MDP) " + str(sum(rewards) / float(len(rewards)))

mdp2 = submission.newThresholdMDP
learning = submission.QLearningAlgorithm(mdp2.actions, 1, submission.blackjackFeatureExtractor)
rewards = util.simulate(mdp2, learning, numTrials=30000)
print "average utility (Q-learning on new MDP) " + str(sum(rewards) / float(len(rewards)))

vi2 = submission.ValueIteration()
vi2.solve(submission.newThresholdMDP)
fixed2 = util.FixedRLAlgorithm(vi2.pi)
rewards = util.simulate(submission.newThresholdMDP, fixed2, numTrials=30000)
print "average utility (new-MDP VI policy on new MDP) " + str(sum(rewards) / float(len(rewards)))
# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)

if __name__ == '__main__' and args.p_4d:
    # value iteration
    value = ValueIteration()
    value.solve(originalMDP)

    # simulation
    rl = util.FixedRLAlgorithm(value.pi)
    simulated = util.simulate(newThresholdMDP, rl, 30000, verbose=False)
    print sum(simulated)

    rl = QLearningAlgorithm(originalMDP.actions, originalMDP.discount(), identityFeatureExtractor, 0.2)
    simulated = util.simulate(newThresholdMDP, rl, 30000, verbose=False)
    print sum(simulated)

    rl = QLearningAlgorithm(originalMDP.actions, originalMDP.discount(), identityFeatureExtractor, 0)
    simulated = util.simulate(newThresholdMDP, rl, 30000, verbose=False)
    print sum(simulated)
############################################################
# Problem 4d: What happens when the MDP changes underneath you?!

# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

random.seed(1)
algo = ValueIteration()
algo.solve(originalMDP)
print "pi of Value iteration is:"
print algo.pi
states = algo.pi.keys()

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)

s = util.simulate(newThresholdMDP, util.FixedRLAlgorithm(algo.pi), 30000)
s = sum(s)
print s

s = util.simulate(
    newThresholdMDP,
    QLearningAlgorithm(newThresholdMDP.actions, newThresholdMDP.discount(),
                       identityFeatureExtractor, 0.2), 30000)
s = sum(s)
print s
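# Several of the snippets above measure Q-learning's reward while it is still exploring
# (explorationProb > 0), which mixes exploration noise into the estimate. A minimal sketch of
# evaluating only the learned policy, mirroring what a few of the solutions already do; the
# function name and its arguments (a trained QLearningAlgorithm and the MDP to evaluate on)
# are ours, not part of the assignment code.
def evaluate_greedy_policy(ql, mdp, numTrials=30000):
    ql.explorationProb = 0.0  # act greedily from now on
    mdp.computeStates()       # enumerate the reachable states
    greedy_pi = {state: ql.getAction(state) for state in mdp.states}
    rewards = util.simulate(mdp, util.FixedRLAlgorithm(greedy_pi), numTrials)
    return sum(rewards) / float(len(rewards))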