Example #1
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(mdp)
    # Simulate with 20% exploration probability, and then set to 0 after simulation
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, explorationProb = 0.2)
    util.simulate(mdp, rl, 30000, verbose = False)
    rl.explorationProb = 0
    # Extract the optimal policies and replicate the dict that comes from valiter.pi
    rl_result = dict()
    same = 0
    different = 0
    for state in valiter.pi.keys():
        rl_action = rl.getAction(state)
        rl_result[state] = rl_action
        print rl_action, valiter.pi[state]
        if rl_action == valiter.pi[state]:
            same = same + 1
        else:
            different = different + 1

    print same, different

    return valiter.pi, rl_result    
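These snippets are alternative solutions to the same assignment functions. As a point of reference, a minimal harness for exercising Example #1 on the two Blackjack MDPs used throughout (their parameters are copied from Examples #24 and #27) might look like the sketch below; the module name submission and the availability of blackjackFeatureExtractor are assumptions, not something the examples themselves define.

# Hypothetical harness; assumes the assignment's functions live in submission.py.
from submission import (BlackjackMDP, simulate_QL_over_MDP,
                        identityFeatureExtractor, blackjackFeatureExtractor)

# MDP parameters as given in Examples #24 and #27.
smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2,
                        threshold=10, peekCost=1)
largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3,
                        threshold=40, peekCost=1)

# Question 4b: compare Q-learning against value iteration on both MDPs.
simulate_QL_over_MDP(smallMDP, identityFeatureExtractor)
simulate_QL_over_MDP(largeMDP, identityFeatureExtractor)
simulate_QL_over_MDP(largeMDP, blackjackFeatureExtractor)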
Example #2
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ql = QLearningAlgorithm(actions=mdp.actions, discount=1, featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=90000, maxIterations=1000)
    print(ql.numIters)
    ql.explorationProb = 0
    print(ql.explorationProb)
    ql.is_test = True
    vi = ValueIteration()
    vi.solve(mdp)
    match = [ql.getAction(state) == action for state, action in vi.pi.items()]
    # ql_action = [ql.getAction(state) for state, action in vi.pi.items()]
    # take_count = [action == 'Take' for state, action in vi.pi.items()]
    # peek_count = [action == 'Peek' for state, action in vi.pi.items()]
    # quit_count = [action == 'Quit' for state, action in vi.pi.items()]
    # print('Take: {}'.format(sum(take_count) / len(take_count)))
    # print('Peek: {}'.format(sum(peek_count) / len(take_count)))
    # print('Quit: {}'.format(sum(quit_count) / len(take_count)))
    percentage_match = sum(match) / len(match)
    # print(ql_action)
    # print(ql.weights)
    return percentage_match 
Example #3
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()
    vi = ValueIteration()
    vi.solve(original_mdp)

    rl = util.FixedRLAlgorithm(vi.pi.copy())
    fixed_rewards = util.simulate(modified_mdp,
                                  rl,
                                  numTrials=10000,
                                  maxIterations=1000,
                                  verbose=False,
                                  sort=False)
    #print(rewards)
    modified_mdp.computeStates()
    rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(),
                            featureExtractor, 0.2)

    ql_rewards = util.simulate(modified_mdp,
                               rl,
                               numTrials=10000,
                               maxIterations=1000,
                               verbose=False,
                               sort=False)
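Example #3 gathers both reward lists but never reports them; a minimal reporting step, assuming util.simulate returns one total reward per trial (which is how every other example treats its return value), could be appended to the function:

    # Hypothetical reporting step, not part of the original snippet.
    avg_fixed = float(sum(fixed_rewards)) / len(fixed_rewards)
    avg_ql = float(sum(ql_rewards)) / len(ql_rewards)
    print("Average reward on the modified MDP, fixed VI policy:", avg_fixed)
    print("Average reward on the modified MDP, Q-learning:", avg_ql)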
Example #4
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE

    ql = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    q_rewards = util.simulate(mdp, ql, 30000)

    avg_reward_q = float(sum(q_rewards)) / len(q_rewards)

    vi = ValueIteration()
    vi.solve(mdp)

    rl = util.FixedRLAlgorithm(vi.pi)
    vi_rewards = util.simulate(mdp, rl, 30000)

    avg_reward_vi = float(sum(vi_rewards)) / len(vi_rewards)

    ql.explorationProb = 0
    ql_pi = {}
    for state, _ in vi.pi.items():
        ql_pi[state] = ql.getAction(state)
    p_vi = vi.pi

    diff = 0
    for state in vi.pi.keys():
        if vi.pi[state] != ql_pi[state]: diff += 1

    print("difference", diff, "over " + str(len(p_vi.keys())) + " states")
    print("percentage diff ", float(diff) / len(p_vi.keys()))
    print("avg_reward_diff", avg_reward_q - avg_reward_vi)
Example #5
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, 30000)
    zero_weight_count = 0
    total_weight_count = 0
    for key in rl.weights:
        weight = rl.weights[key]
        total_weight_count += 1
        if abs(weight - 0.0) <= 0.00001: zero_weight_count += 1
    print "Total Weights: %s, Zero Weights: %s" % (total_weight_count,
                                                   zero_weight_count)
    rl.explorationProb = 0
    vi = ValueIteration()
    vi.solve(mdp)
    count = 0
    expected_result = 0
    for key in vi.pi:
        count += 1
        if vi.pi[key] == rl.getAction(key):
            expected_result += 1
    print "total (state, action) pairs: %s" % (count * 3)
    print "Accuracy of MDP using the featureExtractor: %s" % (
        float(expected_result) / count * 100)
Example #6
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(mdp)
    viQ = vi.pi

    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(),
                            featureExtractor, .2)
    util.simulate(mdp,
                  rl,
                  numTrials=30000,
                  maxIterations=10,
                  verbose=False,
                  sort=False)
    rl.explorationProb = 0
    d = {}
    for state in mdp.states:
        d[state] = rl.getAction(state)

    Diff = 0
    for k in d.keys():
        if d[k] != viQ[k]:
            Diff += 1
    return Diff
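Several of the snippets pass identityFeatureExtractor straight into QLearningAlgorithm (Examples #24 and #27 do so explicitly). For reference, an identity extractor in this kind of assignment is conventionally a single indicator feature on the exact (state, action) pair; a minimal sketch, assuming QLearningAlgorithm expects a list of (featureKey, featureValue) pairs:

def identityFeatureExtractor(state, action):
    # One indicator feature per exact (state, action) pair, so Q-learning
    # reduces to a tabular update with no generalization across states.
    featureKey = (state, action)
    featureValue = 1
    return [(featureKey, featureValue)]

This is also why the identity extractor generalizes poorly: it cannot share information between states, which matters once the state space is large.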
Example #7
def testValueIteration(mdp):
    valueIter = ValueIteration() # implemented in util.py
    valueIter.solve(mdp, .001)
    states = sorted(valueIter.pi, key=lambda x: len(x)) # sorted by state space
    
    print('valueIter.pi:')
    for elem in sorted(valueIter.pi):
        print(elem, '\t:\t', valueIter.pi[elem])
        
    return valueIter
def compareQLandVI(targetMDP, featureExtractor):
    QL = QLearningAlgorithm(targetMDP.actions, 1, featureExtractor)
    VI = ValueIteration()
    
    simulate(targetMDP, QL, numTrials=30000)
    VI.solve(targetMDP)

    diffPolicyStates = []
    QL.explorationProb = 0
    for state in targetMDP.states:
        #print state, QL.getAction(state), VI.pi[state]
        if QL.getAction(state) != VI.pi[state]:
            diffPolicyStates.append(state)
    print "%d/%d = %f%% different states"%(len(diffPolicyStates), len(targetMDP.states), len(diffPolicyStates)/float(len(targetMDP.states)))
Example #9
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(original_mdp)
    rl = util.FixedRLAlgorithm(valueIteration.pi)
    rewards = util.simulate(modified_mdp, rl)
    print(sum(rewards) / len(rewards))
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor)
    rewards = util.simulate(original_mdp, rl, numTrials=30000)
    rewards = util.simulate(modified_mdp, rl, numTrials=30000)
    print(sum(rewards) / len(rewards))
    # END_YOUR_CODE
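The 4d snippets (Examples #3, #9, #11, and others) hand a value-iteration policy to util.FixedRLAlgorithm. The real class lives in the course's util.py, so treat the following only as an illustration of the interface these examples rely on: a wrapper that exposes getAction/incorporateFeedback but never updates anything.

class FixedRLAlgorithm:
    """Follows a fixed policy pi (a dict mapping state -> action) and never learns."""
    def __init__(self, pi):
        self.pi = pi

    def getAction(self, state):
        # Assumes every state reached during simulation is covered by pi.
        return self.pi[state]

    def incorporateFeedback(self, state, action, reward, newState):
        pass  # fixed policy: feedback is ignored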
Example #10
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE

    # for reproducibility
    random.seed(123)
    # initialization
    mdp.computeStates()  # to get the whole State Space of MDP
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, 0.2)
    # Value Iteration part
    algorithm = ValueIteration()
    algorithm.solve(
        mdp, .001)  # algorithm now contains the Value and Policy of the mdp.
    # Q-Learning part
    util.simulate(mdp, rl, 30000)  # model-free: run 30000 trials so the Q-learning weights are learned
    rl.explorationProb = 0  # set ε to 0 so that .getAction(state) acts as the greedy policy π
    Qpi = {}
    comparison = []
    for state in mdp.states:  # get the Q-learning policy and comparison results.
        Qpi[state] = rl.getAction(state)
        comparison.append(int(algorithm.pi[state] == Qpi[state]))
    match_rate = sum(comparison) / len(comparison)
    if featureExtractor == identityFeatureExtractor:
        mdp_name = "smallMDP" if mdp.multiplicity == 2 else "largeMDP"
        extractor_name = "identityFeatureExtractor"
    else:
        mdp_name = "largeMDP"
        extractor_name = "blackjackFeatureExtractor"
    print("The match rate of using %s for %s: %.4f" %
          (extractor_name, mdp_name, match_rate))
    print("Number of different actions: %d  Number of total actions: %d" %
          (len(comparison) - sum(comparison), len(comparison)))
Example #11
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    rewards = util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000)
    print "Expected Reward on modified mdp using original mdp policy: %i" % (
        float(sum(r for r in rewards)) / len(rewards))
    rewards_new = util.simulate(
        modified_mdp,
        QLearningAlgorithm(modified_mdp.actions, original_mdp.discount(),
                           featureExtractor, 0.1), 10000)
    print "Expected Reward on modified mdp using Q Learning: %i" % (
        float(sum(r for r in rewards_new)) / len(rewards_new))
Example #12
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(mdp)
    vi_pi = valueIteration.pi
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, numTrials=30000, verbose=False)
    rl.explorationProb = 0
    diff, total = 0, len(mdp.states)
    for state in mdp.states:
        if vi_pi[state] != rl.getAction(state):
            diff += 1
    print('{:.3f}'.format(100 * diff / total) + '%')
Example #13
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    viter = ValueIteration()
    viter.solve(original_mdp)

    fixed_rl = util.FixedRLAlgorithm(viter.pi)
    print "Expected reward value iteration: ", \
        sum(util.simulate(modified_mdp, fixed_rl, numTrials=30000))/30000.0

    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    print "Expected reward q-learn: ", \
            sum(util.simulate(modified_mdp, ql, numTrials=30000))/30000.0
Example #14
def problem4d():
    originalMDP.computeStates()
    newThresholdMDP.computeStates()
    vi = ValueIteration()
    vi.solve(originalMDP)
    fixedVi = util.FixedRLAlgorithm(vi.pi)
    vi_reward = util.simulate(newThresholdMDP, fixedVi, numTrials=30000)

    QL = QLearningAlgorithm(newThresholdMDP.actions,
                            newThresholdMDP.discount(),
                            blackjackFeatureExtractor, 0.2)
    util.simulate(newThresholdMDP, QL, numTrials=30000)
    QL.explorationProb = 0.0
    QLreward = util.simulate(newThresholdMDP, QL, numTrials=1000)
    print('\n 4d now:')
    print('Value Iteration Reward:{}'.format(
        sum(vi_reward) / float(len(vi_reward))))
    print('Q-learn Reward:{}'.format(sum(QLreward) / float(len(QLreward))))
Example #15
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    ValIter = ValueIteration()
    ValIter.solve(original_mdp, .0001)
    pi_val = ValIter.pi
    fix_policy = util.FixedRLAlgorithm(pi_val)
    old_reward = util.simulate(modified_mdp, fix_policy, 30000, 1000, False,
                               False)
    RL = QLearningAlgorithm(modified_mdp.actions,
                            modified_mdp.discount(),
                            featureExtractor)
    new_reward = util.simulate(modified_mdp, RL, 30000, 1000, False, False)
    print("Reward from old policy:", sum(old_reward),
          "\nReward from new QL policy:", sum(new_reward))
Example #16
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    fixedRLAlgorithm = util.FixedRLAlgorithm(vi.pi)
    num_trials = 90
    total_rewards = util.simulate(modified_mdp, fixedRLAlgorithm, num_trials)
    expected_reward_fixed_rl = sum(total_rewards) / len(total_rewards)

    ql = QLearningAlgorithm(actions=modified_mdp.actions, discount=1, featureExtractor=featureExtractor)
    util.simulate(modified_mdp, ql, numTrials=30000, maxIterations=1000)
    ql.explorationProb = 0
    total_rewards = util.simulate(modified_mdp, ql, num_trials)
    expected_reward_ql = sum(total_rewards) / len(total_rewards)
    print("Expected reward fixed rl: " + str(expected_reward_fixed_rl))
    print("Expected reward ql: " + str(expected_reward_ql))
Example #17
def compare(mdp):
    mdp.computeStates()
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=identityFeatureExtractor)
    util.simulate(mdp, rl, 30000)

    rl.explorationProb = 0.0
    QLearnPolicy = {}
    for state in mdp.states:
        QLearnPolicy[state] = rl.getAction(state)

    vi = ValueIteration()
    vi.solve(mdp)

    matchCount = 0
    for state in mdp.states:
        if QLearnPolicy[state] == vi.pi[state]:
            matchCount += 1
    print('policy match:{}/{}'.format(matchCount, len(mdp.states)))
Example #18
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(original_mdp)
    orig_pi = valiter.pi
    print valiter.pi

    vi_rl = util.FixedRLAlgorithm(orig_pi)
    vi_result = util.simulate(modified_mdp, vi_rl, 30000, verbose=False)

    orig_rl = QLearningAlgorithm(modified_mdp.actions,
                                 modified_mdp.discount(),
                                 featureExtractor,
                                 explorationProb=0.2)
    orig_result = util.simulate(modified_mdp, orig_rl, 30000, verbose=False)

    return vi_result, orig_result
Example #19
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    value_iteration = ValueIteration()
    value_iteration.solve(mdp)
    q_learning = QLearningAlgorithm(actions=mdp.actions,
                                    discount=mdp.discount(),
                                    featureExtractor=featureExtractor)
    util.simulate(mdp, q_learning, numTrials=30000, verbose=False, sort=False)
    q_learning.explorationProb = 0.0  # freeze exploration only after training
    total = 0
    diff = 0
    for state in mdp.states:
        if (value_iteration.pi[state] != q_learning.getAction(state)):
            diff += 1
        total += 1
    print('different-ratio between q learning and value iteration:')
    print(diff / total)
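Most of the 4b snippets repeat this same agreement loop; a small reusable helper, written against the same assumptions (the value-iteration policy is a dict mapping state to action, and rl.getAction is deterministic once explorationProb is 0), could replace it:

def policy_agreement(vi_pi, rl):
    # Fraction of value-iteration states on which Q-learning picks the same action.
    matches = sum(1 for state, action in vi_pi.items()
                  if rl.getAction(state) == action)
    return float(matches) / len(vi_pi)

For instance, Example #19 could end with print(policy_agreement(value_iteration.pi, q_learning)) instead of its explicit loop.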
Example #20
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ValIter = ValueIteration()
    ValIter.solve(mdp, .0001)
    pi_val = ValIter.pi
    RL = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, RL, 30000, 1000, False, False)
    RL.explorationProb = 0
    pi_RL = {}
    diff = 0
    for state in mdp.states:
        pi_RL[state] = RL.getAction(state)
        if pi_RL[state] != pi_val[state]:
            diff += 1
    ratio = diff / len(mdp.states)
    print("Different policies:", diff, "\tRatio:", ratio)
Example #21
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()  # to get the whole State Space of MDP
    modified_mdp.computeStates()
    algorithm = ValueIteration()
    algorithm.solve(original_mdp, .001)
    # algorithm.solve(modified_mdp, .001)

    frl = util.FixedRLAlgorithm(algorithm.pi)

    random.seed(123)
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30)
    print(
        "*** Expected return for FixedRLAlgorithm (numTrials=30): %.4f \t***" %
        (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30000)
    print(
        "*** Expected return for FixedRLAlgorithm (numTrials=30000): %.4f \t***"
        % (sum(totalRewards) / len(totalRewards)))
    random.seed(123)
    rlQ = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(),
                             featureExtractor)
    totalRewards = util.simulate(
        mdp=modified_mdp, rl=rlQ, numTrials=30
    )  # model-free; 30 trials here plus 29970 below give a 30000-trial total
    print(
        "*** Expected return for QLearningRLAlgorithm (numTrials=30): %.4f \t ***"
        % (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=rlQ, numTrials=29970)
    print(
        "*** Expected return for QLearningRLAlgorithm (numTrials=30000): %.4f \t ***"
        % (sum(totalRewards) / len(totalRewards)))
Example #22
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=featureExtractor)
    util.simulate(mdp, rl, numTrials=30000)
    viter = ValueIteration()
    viter.solve(mdp, .001)

    rl.explorationProb = 0

    neq = 0
    for state in viter.pi:
        if viter.pi[state] != rl.getAction(state):
            # print state, viter.pi[state], rl.getAction(state)
            neq += 1

    print "Total:", len(viter.pi), ", neq:", neq, ", frac_neq:", neq / float(len(viter.pi))
Example #23
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    value_iteration = ValueIteration()
    value_iteration.solve(original_mdp)
    fixed_RL = util.FixedRLAlgorithm(value_iteration.pi)
    rewards1 = util.simulate(modified_mdp,
                             fixed_RL,
                             numTrials=50000,
                             verbose=False,
                             sort=False)
    q_learning = QLearningAlgorithm(actions=modified_mdp.actions,
                                    discount=modified_mdp.discount(),
                                    featureExtractor=featureExtractor)
    rewards2 = util.simulate(modified_mdp,
                             q_learning,
                             numTrials=50000,
                             verbose=False,
                             sort=False)
    print('fixed_RL reward :', sum(rewards1) / len(rewards1))
    print('q_learning reward :', sum(rewards2) / len(rewards2))
Example #24
# Large test case
largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10],
                        multiplicity=3,
                        threshold=40,
                        peekCost=1)
largeMDP.computeStates()

if __name__ == '__main__' and args.p_4b == 'small':
    rl = QLearningAlgorithm(smallMDP.actions, smallMDP.discount(),
                            identityFeatureExtractor, 0.2)
    simulated = util.simulate(smallMDP, rl, 30000, verbose=False)
    rl.explorationProb = 0

    # value iteration
    value = ValueIteration()
    value.solve(smallMDP)

    for key in value.pi.keys():
        print 'state:', key
        print 'valit:', value.pi[key]
        print 'RLalg:', rl.getAction(key)
        print '-------------------------'

if __name__ == '__main__' and args.p_4b == 'large':
    rl = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(),
                            identityFeatureExtractor, 0.2)
    simulated = util.simulate(largeMDP, rl, 30000, verbose=False)
    rl.explorationProb = 0

    # value iteration
    value = ValueIteration()
    value.solve(largeMDP)

    for key in value.pi.keys():
        print 'state:', key
        print 'valit:', value.pi[key]
        print 'RLalg:', rl.getAction(key)
        print '-------------------------'
Example #25
        for takenClass, _ in state[0]:
            if takenClass in action:
                return []

        succs = []
        classesDict = {}  # will contain each class in |action| with a grade associated
        recurse(state[0], 1, action, succs, classesDict)
        return succs

    def discount(self):
        return 1


bulletin = json.loads(open('cartadata.json').read())
#Run  Value Iteration"
startState = simpleEnroll(bulletin)
# startState = ((("MATH 19", "A"), ("MATH 20", "A"), ("CS 106B","B+"),("MATH 21", "A"),("ECON 1","A"),("MATH 51", "A-"),("CS 107", "A-"),
#                 ("CS 106A", "A"), ("CS 109", "A")),
#                  "Aut", 2, ())

courseMDP = CourseMDP(startState, bulletin)
vi = ValueIteration()
vi.solve(courseMDP)
pi_vi = vi.pi
value = vi.V
bestAction = pi_vi[startState]

#Print our recommended schedule!
print("Best Action: ", bestAction)
Example #26
def simulateVI(mdp):
    VIAlgorithm = ValueIteration()
    VIAlgorithm.solve(mdp)
    return VIAlgorithm.pi
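Example #26 covers only the value-iteration side; a companion helper for the Q-learning side, following the same train-then-freeze pattern the other examples use (the numTrials default and the dict comprehension are choices made here, not taken from the original), might look like:

def simulateQL(mdp, featureExtractor, numTrials=30000):
    # Train with the default epsilon-greedy exploration, then freeze
    # exploration and read off the greedy action for every state.
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, numTrials)
    rl.explorationProb = 0
    mdp.computeStates()
    return {state: rl.getAction(state) for state in mdp.states}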
Example #27
# Small test case
mdp1 = BlackjackMDP(cardValues=[1, 5],
                    multiplicity=2,
                    threshold=10,
                    peekCost=1)
rl = QLearningAlgorithm(mdp1.actions, mdp1.discount(),
                        identityFeatureExtractor, 0.2)

random.seed(1)
startState = mdp1.startState()
algo = ValueIteration()
algo.solve(mdp1)
print "pi of Value iteration is:"
#print algo.pi
states = algo.pi.keys()

util.simulate(mdp1, rl, 30000)
rl.explorationProb = 0
pi_rl = {}
for state in states:
    pi_rl[state] = rl.getAction(state)

print "small test case"
#print "pi of reinforcement learning is:"
#print pi_rl