Example #1
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE

    ql = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
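    # util.simulate runs 30000 trials of Q-learning, training the learner as it goes and returning each trial's total reward.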
    q_rewards = util.simulate(mdp, ql, 30000)

    avg_reward_q = float(sum(q_rewards)) / len(q_rewards)

    vi = ValueIteration()
    vi.solve(mdp)

    rl = util.FixedRLAlgorithm(vi.pi)
    vi_rewards = util.simulate(mdp, rl, 30000)

    avg_reward_vi = float(sum(vi_rewards)) / len(vi_rewards)

    # With exploration turned off, getAction returns the greedy action learned by Q-learning.
    ql.explorationProb = 0
    diff = 0
    for state in vi.pi:
        if vi.pi[state] != ql.getAction(state):
            diff += 1

    print("difference", diff, "over " + str(len(vi.pi)) + " states")
    print("percentage diff ", float(diff) / len(vi.pi))
    print("avg_reward_diff", avg_reward_q - avg_reward_vi)
Example #2
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()
    vi = ValueIteration()
    vi.solve(original_mdp)

    rl = util.FixedRLAlgorithm(vi.pi.copy())
    rewards = util.simulate(modified_mdp,
                            rl,
                            numTrials=10000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg_fixed = sum(rewards) / float(len(rewards))
    print("average reward, fixed VI policy on modified MDP: " + str(avg_fixed))
    modified_mdp.computeStates()
    rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(),
                            featureExtractor, 0.2)

    rewards = util.simulate(modified_mdp,
                            rl,
                            numTrials=10000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg_q = sum(rewards) / float(len(rewards))
    print("average reward, Q-learning on modified MDP: " + str(avg_q))
Example #3
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE

    val = util.ValueIteration()
    val.solve(original_mdp)
    val_policy = val.pi
    RL1 = util.FixedRLAlgorithm(val_policy)
    result1 = util.simulate(modified_mdp,
                            RL1,
                            numTrials=50000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg1 = sum(result1) / float(len(result1))
    print(avg1)
    RL2 = QLearningAlgorithm(modified_mdp.actions,
                             modified_mdp.discount(),
                             featureExtractor,
                             explorationProb=0.2)
    result2 = util.simulate(modified_mdp,
                            RL2,
                            numTrials=50000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg2 = sum(result2) / float(len(result2))
    print(avg2)
Example #4
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    solver = util.ValueIteration()
    solver.solve(original_mdp)
    optimalVIOriginalMDPPolicy = solver.pi

    fixedRL = util.FixedRLAlgorithm(optimalVIOriginalMDPPolicy)
    rewards = util.simulate(modified_mdp,
                            fixedRL,
                            numTrials=30000,
                            maxIterations=10000)
    print("Sampled average reward for optimal policy from original MDP is {}.".
          format(sum(rewards) / float(len(rewards))))

    # Train Q-learning.
    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    trainingRewards = util.simulate(modified_mdp,
                                    ql,
                                    numTrials=30000,
                                    maxIterations=10000)
    print(
        "Sampled average reward for Q-Learning during training is {}.".format(
            sum(trainingRewards) / float(len(trainingRewards))))
    ql.explorationProb = 0.0
    modified_mdp.computeStates()
    learnedQLPolicy = {
        state: ql.getAction(state)
        for state in modified_mdp.states
    }
    fixedQLRL = util.FixedRLAlgorithm(learnedQLPolicy)
    rewardsQL = util.simulate(modified_mdp,
                              fixedQLRL,
                              numTrials=30000,
                              maxIterations=10000)
    print(
        "Sampled average reward for policy learned directly on new problem with Q-Learning is {}."
        .format(sum(rewardsQL) / float(len(rewardsQL))))
Example #5
File: test_MDP.py  Project: saltyp/mini-AI
def Q4d():
    origMDP = BlackjackMDP(cardValues=[1, 5],
                           multiplicity=2,
                           threshold=10,
                           peekCost=1)
    newThreshMDP = BlackjackMDP(cardValues=[1, 5],
                                multiplicity=2,
                                threshold=9,
                                peekCost=1)

    #run VI on original MDP to obtain policy:
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(origMDP)  #algo applied to the MDP problem
    print " ... VI Vopt(startState) = %s ." % (solver.V[origMDP.startState()])
    pi0 = solver.pi

    # apply this policy to an agent (in simulated mdp) playing the **new** MDP:
    numqtrials = 30000
    rl = util.FixedRLAlgorithm(pi0)

    mdp = origMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print " ... QL: avg PV, stdev using above VI opt policy on same mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs))

    mdp = newThreshMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print "\n ... QL: avg PV, stdev using above VI opt policy on *NEW* mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs))

    # now skip the fixed policy and use QL :
    phi = identityFeatureExtractor  #blackjackFeatureExtractor

    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.5)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState : %s " % Vopt_est
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials,
                           verbose=False)  #reruns simulation
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState re-run (with eps = 0) : %s " % Vopt_est
Example #6
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(original_mdp)
    rl = util.FixedRLAlgorithm(valueIteration.pi)
    rewards = util.simulate(modified_mdp, rl)
    print(sum(rewards) / len(rewards))
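    # Train Q-learning on the original MDP first, then keep learning while evaluating on the modified MDP.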
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor)
    rewards = util.simulate(original_mdp, rl, numTrials=30000)
    rewards = util.simulate(modified_mdp, rl, numTrials=30000)
    print(sum(rewards) / len(rewards))
    # END_YOUR_CODE
Example #7
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    rewards = util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000)
    print "Expected reward on modified mdp using original mdp policy: %.4f" % (
        float(sum(rewards)) / len(rewards))
    rewards_new = util.simulate(
        modified_mdp,
        QLearningAlgorithm(modified_mdp.actions, original_mdp.discount(),
                           featureExtractor, 0.1), 10000)
    print "Expected reward on modified mdp using Q-learning: %.4f" % (
        float(sum(rewards_new)) / len(rewards_new))
Example #8
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIterOriginal = util.ValueIteration()
    valueIterOriginal.solve(original_mdp)
    fixedRL = util.FixedRLAlgorithm(valueIterOriginal.pi)
    rewards = util.simulate(modified_mdp, fixedRL)
    print("Fixed RL")
    for reward in rewards:
        print(reward)
    rewardsFromQ = util.simulate(modified_mdp, QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor))
    print('QLearn')
    for reward in rewardsFromQ:
        print(reward)
Example #9
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    viter = ValueIteration()
    viter.solve(original_mdp)

    fixed_rl = util.FixedRLAlgorithm(viter.pi)
    print "Expected reward value iteration: ", \
        sum(util.simulate(modified_mdp, fixed_rl, numTrials=30000))/30000.0

    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    print "Expected reward q-learn: ", \
            sum(util.simulate(modified_mdp, ql, numTrials=30000))/30000.0
Example #10
def problem4d():
    originalMDP.computeStates()
    newThresholdMDP.computeStates()
    vi = ValueIteration()
    vi.solve(originalMDP)
    fixedVi = util.FixedRLAlgorithm(vi.pi)
    vi_reward = util.simulate(newThresholdMDP, fixedVi, numTrials=30000)

    QL = QLearningAlgorithm(newThresholdMDP.actions,
                            newThresholdMDP.discount(),
                            blackjackFeatureExtractor, 0.2)
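    # Train Q-learning for 30000 trials, then evaluate its greedy policy with exploration turned off.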
    util.simulate(newThresholdMDP, QL, numTrials=30000)
    QL.explorationProb = 0.0
    QLreward = util.simulate(newThresholdMDP, QL, numTrials=1000)
    print('\n 4d now:')
    print('Value Iteration Reward:{}'.format(
        sum(vi_reward) / float(len(vi_reward))))
    print('Q-learn Reward:{}'.format(sum(QLreward) / float(len(QLreward))))
Example #11
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    ValIter = ValueIteration()
    ValIter.solve(original_mdp, .0001)
    pi_val = ValIter.pi
    fix_policy = util.FixedRLAlgorithm(pi_val)
    old_reward = util.simulate(modified_mdp, fix_policy, 30000, 1000, False,
                               False)
    RL = QLearningAlgorithm(modified_mdp.actions,
                            modified_mdp.discount(),
                            featureExtractor)
    new_reward = util.simulate(modified_mdp, RL, 30000, 1000, False, False)
    print("Reward from old policy:", sum(old_reward),
          "\nReward from new QL policy:", sum(new_reward))
Example #12
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    fixedRLAlgorithm = util.FixedRLAlgorithm(vi.pi)
    num_trials = 90
    total_rewards = util.simulate(modified_mdp, fixedRLAlgorithm, num_trials)
    expected_reward_fixed_rl = sum(total_rewards) / len(total_rewards)

    ql = QLearningAlgorithm(actions=modified_mdp.actions, discount=modified_mdp.discount(), featureExtractor=featureExtractor)
    util.simulate(modified_mdp, ql, numTrials=30000, maxIterations=1000)
    ql.explorationProb = 0
    total_rewards = util.simulate(modified_mdp, ql, num_trials)
    expected_reward_ql = sum(total_rewards) / len(total_rewards)
    print("Expected reward fixed rl: " + str(expected_reward_fixed_rl))
    print("Expected reward ql: " + str(expected_reward_ql))
Example #13
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(original_mdp)
    orig_pi = valiter.pi
    print valiter.pi

    vi_rl = util.FixedRLAlgorithm(orig_pi)
    vi_result = util.simulate(modified_mdp, vi_rl, 30000, verbose=False)

    orig_rl = QLearningAlgorithm(modified_mdp.actions,
                                 modified_mdp.discount(),
                                 featureExtractor,
                                 explorationProb=0.2)
    orig_result = util.simulate(modified_mdp, orig_rl, 30000, verbose=False)

    return vi_result, orig_result
Example #14
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    print ("compare_changed_MDP")

    # Solve the original MDP with value iteration
    # pi_original_mdp
    vio = util.ValueIteration()
    vio.solve(original_mdp, .0001)
    pi_original_mdp = vio.pi  # pi computed with value iteration
    print 'original_mdp value at startState, computed via VI:'
    print vio.V[original_mdp.startState()]

    # modified_mdp
    vim = util.ValueIteration()
    vim.solve(modified_mdp, .0001)
    pi_modified_mdp = vim.pi  # pi computed with value iteration
    print 'modified_mdp value at startState, computed via VI:'
    print vim.V[modified_mdp.startState()]

    # Exploit the policy pi computed on original_mdp
    # by applying it, via simulation, to the new MDP modified_mdp
    fixed_rl = util.FixedRLAlgorithm(pi_original_mdp)
    # totalRewards = util.simulate(newThresholdMDP, fixed_rl, 30000)
    totalRewards = util.simulate(modified_mdp, fixed_rl, numTrials=30000, maxIterations=1000, verbose=False, sort=False)
    print('Average reward on the new MDP using the old MDP\'s pi:')
    print sum(totalRewards) / len(totalRewards)

    # Q-learning on original_mdp
    original_mdp.computeStates()
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor, 0.2)
    totalRewards = util.simulate(original_mdp, rl, 30000)
    print('Average reward on the old MDP using RL:')
    print sum(totalRewards) / len(totalRewards)
    totalRewards = util.simulate(modified_mdp, rl, 30000)
    print('Average reward on the new MDP using RL trained on the old one:')
    print sum(totalRewards) / len(totalRewards)
Example #15
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = util.ValueIteration()
    vi.solve(original_mdp)
    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000):
        summ += i
        events += 1
    print(summ * 1.0 / events)
    Qlearning = QLearningAlgorithm(modified_mdp.actions,
                                   modified_mdp.discount(), featureExtractor)
    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, Qlearning, 100):
        summ += i
        events += 1
    print(summ * 1.0 / events)
Example #16
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()  # to get the whole State Space of MDP
    modified_mdp.computeStates()
    algorithm = ValueIteration()
    algorithm.solve(original_mdp, .001)
    # algorithm.solve(modified_mdp, .001)

    frl = util.FixedRLAlgorithm(algorithm.pi)

    random.seed(123)
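    # Fixing the random seed makes this run comparable to the Q-learning run seeded the same way below.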
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30)
    print(
        "*** Expected return for FixedRLAlgorithm (numTrials=30): %.4f \t***" %
        (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30000)
    print(
        "*** Expected return for FixedRLAlgorithm (numTrials=30000): %.4f \t***"
        % (sum(totalRewards) / len(totalRewards)))
    random.seed(123)
    rlQ = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(),
                             featureExtractor)
    totalRewards = util.simulate(
        mdp=modified_mdp, rl=rlQ, numTrials=30
    )  # Model-free: the first 30 Q-learning trials (29970 more follow below, 30000 in total).
    print(
        "*** Expected return for QLearningRLAlgorithm (numTrials=30): %.4f \t ***"
        % (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=rlQ, numTrials=29970)
    print(
        "*** Expected return for QLearningRLAlgorithm (numTrials=30000): %.4f \t ***"
        % (sum(totalRewards) / len(totalRewards)))
Example #17
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    value_iteration = ValueIteration()
    value_iteration.solve(original_mdp)
    fixed_RL = util.FixedRLAlgorithm(value_iteration.pi)
    rewards1 = util.simulate(modified_mdp,
                             fixed_RL,
                             numTrials=50000,
                             verbose=False,
                             sort=False)
    q_learning = QLearningAlgorithm(actions=modified_mdp.actions,
                                    discount=modified_mdp.discount(),
                                    featureExtractor=featureExtractor)
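    # Note: rewards2 is collected while the Q-learner is still exploring (its default explorationProb).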
    rewards2 = util.simulate(modified_mdp,
                             q_learning,
                             numTrials=50000,
                             verbose=False,
                             sort=False)
    print('fixed_RL reward :', sum(rewards1) / len(rewards1))
    print('q_learning reward :', sum(rewards2) / len(rewards2))
Example #18
            features.append((key3, 1))

    return features
    # END_YOUR_CODE


############################################################
# Problem 4d: changing mdp

# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5],
                           multiplicity=2,
                           threshold=10,
                           peekCost=1)
# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5],
                               multiplicity=2,
                               threshold=15,
                               peekCost=1)

originalMDP.computeStates()
algo = ValueIteration()
algo.solve(originalMDP)

algorithm = util.FixedRLAlgorithm(algo.pi)
viRewards = util.simulate(newThresholdMDP, algorithm)
print("average reward, fixed VI policy: " + str(sum(viRewards) / float(len(viRewards))))
algorithm = QLearningAlgorithm(newThresholdMDP.actions,
                               newThresholdMDP.discount(),
                               identityFeatureExtractor, 0.2)
qlRewards = util.simulate(newThresholdMDP, algorithm)
print("average reward, Q-learning: " + str(sum(qlRewards) / float(len(qlRewards))))
Example #19
    feature = [(('totalAndAction', total, action), 1)]
    if counts is not None:
        featureKey = map(lambda x: 1 if x != 0 else 0, counts)
        feature += [(tuple(['cardPresence'] + featureKey + [action]), 1)]
        for i in range(len(counts)):
            featureKey = ('cardAndAction', i, counts[i], action)
            feature += [(featureKey, 1)]
    return feature
    # END_YOUR_CODE

############################################################
# Problem 4d: What happens when the MDP changes underneath you?!

# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)

piVI = simulateVI(originalMDP)
fixedRL = util.FixedRLAlgorithm(piVI)
rewards = util.simulate(newThresholdMDP, fixedRL)
print 'Rewards for value iteration in 10 trials:'
print rewards

newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)
QLAlgorithm = QLearningAlgorithm(newThresholdMDP.actions, newThresholdMDP.discount(), identityFeatureExtractor)
rewards = util.simulate(newThresholdMDP, QLAlgorithm)
print 'Rewards for Q-learning in 10 trials:'
print rewards
Example #20
originalMDP = BlackjackMDP(cardValues=[1, 5],
                           multiplicity=2,
                           threshold=10,
                           peekCost=1)

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5],
                               multiplicity=2,
                               threshold=15,
                               peekCost=1)

ValueIterationSolution = util.ValueIteration()
ValueIterationSolution.solve(originalMDP)
policy = ValueIterationSolution.pi

rl1 = util.FixedRLAlgorithm(policy)
print policy
total = 10000
print float(
    sum(
        util.simulate(newThresholdMDP,
                      rl1,
                      numTrials=total,
                      maxIterations=1000,
                      verbose=False,
                      sort=False))) / total

rl2 = QLearningAlgorithm(originalMDP.actions, newThresholdMDP.discount(),
                         identityFeatureExtractor)
print float(
    sum(
        util.simulate(newThresholdMDP,
                      rl2,
                      numTrials=total,
                      maxIterations=1000,
                      verbose=False,
                      sort=False))) / total
Example #21
import util
import submission

vi = submission.ValueIteration()
vi.solve(submission.originalMDP)
fixedRLA = util.FixedRLAlgorithm(vi.pi)
rewards = util.simulate(submission.newThresholdMDP, fixedRLA, numTrials=30000)
print "average utility (original-MDP policy on newThresholdMDP): " + str(sum(rewards) / float(len(rewards)))
rewards = util.simulate(submission.originalMDP, fixedRLA, numTrials=30000)
print "average utility (original-MDP policy on originalMDP): " + str(sum(rewards) / float(len(rewards)))

mdp2 = submission.newThresholdMDP
learning = submission.QLearningAlgorithm(mdp2.actions, mdp2.discount(),
                                         submission.blackjackFeatureExtractor)
rewards = util.simulate(mdp2, learning, numTrials=30000)
print "average utility (Q-learning on newThresholdMDP): " + str(sum(rewards) / float(len(rewards)))
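# Value iteration on the new-threshold MDP provides the optimal-policy baseline for comparison.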
vi2 = submission.ValueIteration()
vi2.solve(submission.newThresholdMDP)
fixed2 = util.FixedRLAlgorithm(vi2.pi)
rewards = util.simulate(submission.newThresholdMDP, fixed2, numTrials=30000)
print "average utility (optimal VI policy on newThresholdMDP): " + str(sum(rewards) / float(len(rewards)))
Example #22
# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5],
                           multiplicity=2,
                           threshold=10,
                           peekCost=1)

# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5],
                               multiplicity=2,
                               threshold=15,
                               peekCost=1)

if __name__ == '__main__' and args.p_4d:
    # value iteration
    value = ValueIteration()
    value.solve(originalMDP)
    # simulation
    rl = util.FixedRLAlgorithm(value.pi)
    simulated = util.simulate(newThresholdMDP, rl, 30000, verbose=False)
    print sum(simulated)

    rl = QLearningAlgorithm(originalMDP.actions, originalMDP.discount(),
                            identityFeatureExtractor, 0.2)
    simulated = util.simulate(newThresholdMDP, rl, 30000, verbose=False)
    print sum(simulated)
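    # Same Q-learning setup but with explorationProb = 0: the agent acts greedily from the start and never explores randomly.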

    rl = QLearningAlgorithm(originalMDP.actions, originalMDP.discount(),
                            identityFeatureExtractor, 0)
    simulated = util.simulate(newThresholdMDP, rl, 30000, verbose=False)
    print sum(simulated)
Example #23

############################################################
# Problem 4d: What happens when the MDP changes underneath you?!

# Original mdp
originalMDP = BlackjackMDP(cardValues=[1, 5],
                           multiplicity=2,
                           threshold=10,
                           peekCost=1)
random.seed(1)
algo = ValueIteration()
algo.solve(originalMDP)
print "Value iteration policy pi:"
print algo.pi
states = algo.pi.keys()
# New threshold
newThresholdMDP = BlackjackMDP(cardValues=[1, 5],
                               multiplicity=2,
                               threshold=15,
                               peekCost=1)
s = util.simulate(newThresholdMDP, util.FixedRLAlgorithm(algo.pi), 30000)
s = sum(s)
print s
s = util.simulate(
    newThresholdMDP,
    QLearningAlgorithm(newThresholdMDP.actions, newThresholdMDP.discount(),
                       identityFeatureExtractor, 0.2), 30000)
s = sum(s)
print s