Example 1
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, 30000)
    zero_weight_count = 0
    total_weight_count = 0
    for key in rl.weights:
        weight = rl.weights[key]
        total_weight_count += 1
        if abs(weight) <= 1e-5: zero_weight_count += 1
    print "Total Weights: %s, Zero Weights: %s" % (total_weight_count,
                                                   zero_weight_count)
    rl.explorationProb = 0
    vi = ValueIteration()
    vi.solve(mdp)
    count = 0
    expected_result = 0
    for key in vi.pi:
        count += 1
        if vi.pi[key] == rl.getAction(key):
            expected_result += 1
    print "total (state, action) pairs: %s" % (count * 3)
    print "Accuracy of MDP using the featureExtractor: %s" % (
        float(expected_result) / count * 100)
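# The agreement loop above recurs in several of these examples. A minimal,
# self-contained sketch of the same idea (helper name and toy policies are
# illustrative, not part of the assignment code):
def policy_agreement(reference_pi, get_action):
    """Fraction of states where get_action matches the reference policy."""
    matches = sum(1 for state, action in reference_pi.items()
                  if get_action(state) == action)
    return float(matches) / len(reference_pi)

# Toy usage with hand-written policies:
reference = {'s1': 'Take', 's2': 'Quit'}
learned = {'s1': 'Take', 's2': 'Peek'}
print(policy_agreement(reference, learned.get))  # 0.5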
Example 2
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()
    vi = ValueIteration()
    vi.solve(original_mdp)

    rl = util.FixedRLAlgorithm(vi.pi.copy())
    rewards = util.simulate(modified_mdp,
                            rl,
                            numTrials=10000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    rl.explorationProb = 0.0
    #print(rewards)
    modified_mdp.computeStates()
    rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(),
                            featureExtractor, 0.2)

    rewards = util.simulate(modified_mdp,
                            rl,
                            numTrials=10000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
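# compare_changed_MDP above collects two reward lists but never reports them.
# A small hedged helper for summarizing them (names and numbers below are
# purely illustrative):
def mean_reward(rewards):
    """Average total reward over a list of simulation trials."""
    return sum(rewards) / float(len(rewards))

# Toy usage with made-up reward lists:
fixed_policy_rewards = [4, 5, 3, 6]
qlearning_rewards = [6, 7, 5, 8]
print("fixed VI policy mean reward: %s" % mean_reward(fixed_policy_rewards))
print("Q-learning mean reward:      %s" % mean_reward(qlearning_rewards))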
Example 3
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ql = QLearningAlgorithm(actions=mdp.actions, discount=1, featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=90000, maxIterations=1000)
    print(ql.numIters)
    ql.explorationProb = 0
    print(ql.explorationProb)
    ql.is_test = True
    vi = ValueIteration()
    vi.solve(mdp)
    match = [ql.getAction(state) == action for state, action in vi.pi.items()]
    # ql_action = [ql.getAction(state) for state, action in vi.pi.items()]
    # take_count = [action == 'Take' for state, action in vi.pi.items()]
    # peek_count = [action == 'Peek' for state, action in vi.pi.items()]
    # quit_count = [action == 'Quit' for state, action in vi.pi.items()]
    # print('Take: {}'.format(sum(take_count) / len(take_count)))
    # print('Peek: {}'.format(sum(peek_count) / len(take_count)))
    # print('Quit: {}'.format(sum(quit_count) / len(take_count)))
    percentage_match = sum(match) / len(match)
    # print(ql_action)
    # print(ql.weights)
    return percentage_match 
Example 4
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(mdp)
    viQ = vi.pi

    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(),
                            identityFeatureExtractor, .2)
    util.simulate(mdp,
                  rl,
                  numTrials=30000,
                  maxIterations=10,
                  verbose=False,
                  sort=False)
    rl.explorationProb = 0
    d = {}
    for state in mdp.states:
        d[state] = rl.getAction(state)

    Diff = 0
    for k in d.keys():
        if d[k] != viQ[k]:
            Diff += 1
    return Diff
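# This example ignores its featureExtractor argument and hard-codes
# identityFeatureExtractor. In the course scaffolding that extractor usually
# emits a single indicator feature per (state, action) pair; a hedged sketch
# of that shape (not necessarily the exact assignment code):
def identityFeatureExtractor(state, action):
    """One indicator feature keyed on the exact (state, action) pair."""
    featureKey = (state, action)
    featureValue = 1
    return [(featureKey, featureValue)]

# Toy usage:
print(identityFeatureExtractor((3, None, (1, 1)), 'Take'))
# -> [(((3, None, (1, 1)), 'Take'), 1)]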
Example 5
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(mdp)
    # Simulate with 20% exploration probability, and then set to 0 after simulation
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, explorationProb = 0.2)
    util.simulate(mdp, rl, 30000, verbose = False)
    rl.explorationProb = 0
    # Extract the optimal policies and replicate the dict that comes from valiter.pi
    rl_result = dict()
    same = 0
    different = 0
    for state in valiter.pi.keys():
        rl_result[state] = rl.getAction(state)
        print rl.getAction(state), valiter.pi[state]
        if rl.getAction(state) == valiter.pi[state]:
            same = same + 1
        else: 
            different = different + 1

    print same, different

    return valiter.pi, rl_result    
Example 6
def simulaMDP(mdp, extractor, explorationProb):
    value_iterator = ValueIteration()
    value_iterator.solve(mdp)
    policyVi = value_iterator.pi
    mdp.computeStates()

    qLearning = QLearningAlgorithm(mdp.actions, mdp.discount(), extractor, explorationProb)
    util.simulate(mdp,qLearning, 30000, 10, False, False)
    qLearning.explorationProb = 0
    actionsQ = {}

    for state in mdp.states:
        actionsQ[state] = qLearning.getAction(state)

    differentActions = 0
    for state in actionsQ.keys():
        if actionsQ[state] != policyVi[state]:
            differentActions += 1
    return differentActions


# SIMULATIONS (uncomment to test)
# 1 - MDP1 with expProb 0.2
# print("Difference in MDP1:", simulaMDP(MDP1, identityFeatureExtractor, 0.2))

# 2 - largeMDP with expProb 0
# print("Difference in largeMDP:", simulaMDP(largeMDP, identityFeatureExtractor, 0))

# 3 - largeMDP with expProb 0 and blackjackFeatureExtractor
# print("Difference in largeMDP:", simulaMDP(largeMDP, blackjackFeatureExtractor, 0))

# 4 - largeMDP with expProb 0.2 and blackjackFeatureExtractor
# print("Difference in largeMDP:", simulaMDP(largeMDP, blackjackFeatureExtractor, 0.2))
Example 7
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor) #actions discount feature extractor
    util.simulate(mdp, rl, numTrials=30000)
    rl.explorationProb = 0
    valueIter = util.ValueIteration()
    valueIter.solve(mdp)


    # Open a log file for the comparison output (the file name is illustrative).
    file = open('ql_vs_vi_comparison.txt', 'w')
    numberOfStates = 0
    numberOfDifferentStates = 0
    for state in mdp.states:
        if state not in valueIter.pi:
            file.write('Pi does not contain state {}\n'.format(state))
        else:
            if valueIter.pi[state] != rl.getAction(state) and state[2] is not None:
                numberOfDifferentStates += 1
                file.write('In state {} Pi gives action {}, but RL gives action {}\n'.format(state, valueIter.pi[state], rl.getAction(state)))
        numberOfStates += 1
    file.write('\n % of different actions = {}%\n'.format(100.0 * numberOfDifferentStates / numberOfStates))
    for weight in rl.weights:
        file.write('weight ({}) =  {} \n'.format(weight, rl.weights[weight]))
    file.close()
Example 8
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE

    val = util.ValueIteration()
    val.solve(original_mdp)
    val_policy = val.pi
    RL1 = util.FixedRLAlgorithm(val_policy)
    result1 = util.simulate(modified_mdp,
                            RL1,
                            numTrials=50000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg1 = sum(result1) / float(len(result1))
    print(avg1)
    RL2 = QLearningAlgorithm(modified_mdp.actions,
                             modified_mdp.discount(),
                             featureExtractor,
                             explorationProb=0.2)
    result2 = util.simulate(modified_mdp,
                            RL2,
                            numTrials=50000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg2 = sum(result2) / float(len(result2))
    print(avg2)
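# Comparing only the two averages can be misleading when trial rewards are
# noisy. A hedged follow-up using the standard library's statistics module
# (labels and reward lists below are illustrative):
import statistics

def summarize_rewards(rewards, label):
    """Print mean and sample standard deviation of a reward list."""
    mean = statistics.mean(rewards)
    spread = statistics.stdev(rewards) if len(rewards) > 1 else 0.0
    print("%s: mean=%.3f stdev=%.3f (n=%d)" % (label, mean, spread, len(rewards)))

# Toy usage with made-up reward lists:
summarize_rewards([4, 5, 3, 6], "fixed VI policy on modified MDP")
summarize_rewards([6, 7, 5, 8], "Q-learning on modified MDP")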
Example 9
def simulate_QL_over_MDP(MDP, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    # pass
    RL = QLearningAlgorithm(MDP.actions,
                            MDP.discount(),
                            featureExtractor,
                            explorationProb=0)
    util.simulate(MDP,
                  RL,
                  numTrials=30000,
                  maxIterations=1000,
                  verbose=False,
                  sort=False)
    MDP.computeStates()
    RL_policy = {}
    for state in MDP.states:
        RL_policy[state] = RL.getAction(state)
    val = util.ValueIteration()
    val.solve(MDP)
    val_policy = val.pi
    sum_ = []
    for key in RL_policy:
        if RL_policy[key] == val_policy[key]:
            sum_.append(1)
        else:
            sum_.append(0)
    print(float(sum(sum_)) / len(RL_policy))
    return RL_policy, val_policy
Example 10
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE

    ql = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    q_rewards = util.simulate(mdp, ql, 30000)

    avg_reward_q = float(sum(q_rewards)) / len(q_rewards)

    vi = ValueIteration()
    vi.solve(mdp)

    rl = util.FixedRLAlgorithm(vi.pi)
    vi_rewards = util.simulate(mdp, rl, 30000)

    avg_reward_vi = float(sum(vi_rewards)) / len(vi_rewards)

    ql.explorationProb = 0
    ql_pi = {}
    for state, _ in vi.pi.items():
        ql_pi[state] = ql.getAction(state)
    p_vi = vi.pi

    diff = 0
    for state in vi.pi.keys():
        if vi.pi[state] != ql_pi[state]: diff += 1

    print("difference", diff, "over " + str(len(p_vi.keys())) + " states")
    print("percentage diff ", float(diff) / len(p_vi.keys()))
    print("avg_reward_diff", avg_reward_q - avg_reward_vi)
Example 11
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    mdp.computeStates()
    allStates = mdp.states
    # Run value iteration.
    solver = util.ValueIteration()
    solver.solve(mdp)
    optimalVIPolicy = solver.pi

    # Run Q-Learning algorithm and compute its optimal policy.
    ql = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=30000, maxIterations=10000)
    ql.explorationProb = 0.0
    optimalQLPolicy = {state: ql.getAction(state) for state in allStates}

    # Compute some statistics
    numDifferent = sum(1 for state in allStates
                       if optimalQLPolicy[state] != optimalVIPolicy[state])
    print("{} out of {} states have different actions".format(
        numDifferent, len(allStates)))
Example 12
def test4aHidden():
    smallMDP = submission.BlackjackMDP(cardValues=[1,5], multiplicity=2, threshold=10, peekCost=1)
    mdp = smallMDP
    mdp.computeStates()
    rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(),
                                   submission.identityFeatureExtractor,
                                   0.2)
    util.simulate(mdp, rl, 30000)
Example 13
def Q4c():
    # s = (3, None, (3,4,0))
    # fv = blackjackFeatureExtractor(s,'Take')
    # print "for state %s , action 'Take' ... \n ... feature vector returned: %s" %(s,fv)

    print "Comparing value iteration ag simulated Q-learning as in 4b but using better featureExtractor:"
    phi = blackjackFeatureExtractor
    mdp = smallMDP  #smallMDP #TOGGLE THIS
    numqtrials = 100  #CHANGE THIS : eg 10, 10000, 300000
    print "...comparison for %s x %s MDP; Q-learning numtrials : %s" % (
        mdp.cardValues, mdp.multiplicity, numqtrials)

    # value iteration:
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(mdp)  #algo applied to the MDP problem

    # q-learning simulate :
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.2)
    totPVs = util.simulate(
        mdp, rl, numTrials=numqtrials,
        verbose=False)  #returns list of totRewards for each trial
    print " ........ # non-zero weights = %s" % sum(
        [1 for k, v in rl.weights.items() if v])

    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print "\n...Comparison of Vopt : "
    print " ... value iteration = expected optimal PV :: optimal utility of startState, stdev: ( %s, 0 )" % (
        solver.V[mdp.startState()])
    print " ... q-learning: avg PV :: utility, stdev over all trials: ( %s, %s ) (see note * below)" % (
        statistics.mean(totPVs), statistics.stdev(totPVs))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    print "\n...Comparison of policies (rerun with explorationProb = 0) : "
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials,
                           verbose=False)  #reruns simulation
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est

    diffs = 0  #counts number of differences in policy btw VI and QL
    for s, p in solver.pi.items():  # using value-iteration policy as starting point
        rlp = max((dotProduct(rl.weights, dict(phi(s, a))), a)
                  for a in rl.actions(s))[1]
        if rlp != p:
            diffs += 1
            print "rlp : %s does not equal VIp : %s for state %s" % (rlp, p, s)
    print "number of different policies btw VI and QL , out of total : %s / %s = %4.2f" % (
        diffs, len(solver.pi), diffs / (1.0 * len(solver.pi)))
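# Q4c above relies on a dotProduct helper over sparse feature dictionaries
# that is not shown in this snippet. A sketch of what such a helper typically
# looks like (an assumption, not the original util code):
def dotProduct(d1, d2):
    """Sparse dot product of two dicts mapping feature keys to values."""
    if len(d1) > len(d2):
        d1, d2 = d2, d1  # iterate over the smaller dict
    return sum(value * d2.get(key, 0) for key, value in d1.items())

# Toy usage:
weights = {('Take', 7): 0.5, ('Quit', 7): -1.0}
features = {('Take', 7): 1}
print(dotProduct(weights, features))  # 0.5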
Example 14
def simulateQL(mdp):
    mdp.computeStates()
    QLAlgorithm = QLearningAlgorithm(mdp.actions, mdp.discount(), identityFeatureExtractor)
    util.simulate(mdp, QLAlgorithm, 30000)
    QLAlgorithm.explorationProb = 0
    stateAndAction = {}
    for state in mdp.states:
        stateAndAction[state] = QLAlgorithm.getAction(state)
    return stateAndAction
Example 15
 def test_hidden(self):
   """4a-hidden:  Hidden test for incorporateFeedback(). Run QLearningAlgorithm on smallMDP, then ensure that getQ returns reasonable value."""
   smallMDP = self.run_with_solution_if_possible(submission,
                                                 lambda sub_or_sol: sub_or_sol.BlackjackMDP(cardValues=[1,5], multiplicity=2, threshold=10, peekCost=1))
   smallMDP.computeStates()
   rl = submission.QLearningAlgorithm(smallMDP.actions, smallMDP.discount(),
                                  submission.identityFeatureExtractor,
                                  0.2)
   util.simulate(smallMDP, rl, 30000)
Example 16
def simulate_QL_over_MDP(mdp, featureExtractor, verbose=False):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.

    # 4b: with identityFeatureExtractor, RL performs poorly because the chosen
    # feature function Phi generalizes particularly badly (an indicator of (s, a))

    # BEGIN_YOUR_CODE
    print ("simulate_QL_over_MDP")

    # Résolution via Value Iteration
    vi = util.ValueIteration()
    vi.solve(mdp, .0001)
    pi_vi = vi.pi  # pi computed with value iteration

    if verbose:
        print('len pi_vi :  {}'.format(len(pi_vi)))

    # Solve via Q-Learning
    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor,
                            0.05)  # better than an exploration rate of 0.2
    util.simulate(mdp, rl, 30000)
    # util.simulate(mdp, rl, numTrials=30000, maxIterations=1000)

    # The full set of possible states of the mdp is available via mdp.states (an attribute of mdp);
    # that attribute is filled in by the call to computeStates above.
    pi_rl = rl.get_pi_opt(mdp.states)  # pi computed with Q-learning (RL)

    if verbose:
        print('len pi_rl :  {}'.format(len(pi_rl)))

    if verbose:
        print('pi : ')
        print('Value Iteration')
        print('Reinforcement Learning')
        print('---')
        for state in mdp.states:
            print('{} : {}'.format(state, pi_vi[state]))
            print('{} : {}'.format(state, pi_rl[state]))
            print('---')

    print('Stats')
    print('Number of possible states : ', len(mdp.states))
    equal = 0.
    for state in mdp.states:  # keys of pi_rl (a subset of pi_vi, since pi_vi is exhaustive)
        if pi_vi[state] == pi_rl[state]:
            equal += 1

    print('Matches : {0:.2f} %'.format(equal / len(mdp.states) * 100))
    print('---')
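# rl.get_pi_opt is not part of the QLearningAlgorithm interface used in the
# other examples. A hedged, self-contained sketch of the same greedy-policy
# extraction (the _ToyQ stub is purely illustrative):
def greedy_policy(rl, states):
    """Greedy policy from a trained Q-learner: best action per state."""
    return {state: max(rl.actions(state), key=lambda a: rl.getQ(state, a))
            for state in states}

class _ToyQ(object):
    """Minimal stand-in exposing the getQ/actions interface assumed above."""
    def actions(self, state):
        return ['Take', 'Quit']
    def getQ(self, state, action):
        return 1.0 if action == 'Take' else 0.0

print(greedy_policy(_ToyQ(), ['s1', 's2']))  # both states map to 'Take'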
Example 17
def Q4d():
    origMDP = BlackjackMDP(cardValues=[1, 5],
                           multiplicity=2,
                           threshold=10,
                           peekCost=1)
    newThreshMDP = BlackjackMDP(cardValues=[1, 5],
                                multiplicity=2,
                                threshold=9,
                                peekCost=1)

    #run VI on original MDP to obtain policy:
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(origMDP)  #algo applied to the MDP problem
    print " ... VI Vopt(startState) = %s ." % (solver.V[origMDP.startState()])
    pi0 = solver.pi

    # apply this policy to an agent (in simulated mdp) playing the **new** MDP:
    numqtrials = 30000
    rl = util.FixedRLAlgorithm(pi0)

    mdp = origMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print " ... QL: avg PV, stdev using above VI opt policy on same mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs))

    mdp = newThreshMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print "\n ... QL: avg PV, stdev using above VI opt policy on *NEW* mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs))

    # now skip the fixed policy and use QL :
    phi = identityFeatureExtractor  #blackjackFeatureExtractor

    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.5)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState : %s " % Vopt_est
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials,
                           verbose=False)  #reruns simulation
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState re-run (with eps = 0) : %s " % Vopt_est
Example 18
def compareQLandVI(targetMDP, featureExtractor):
    QL = QLearningAlgorithm(targetMDP.actions, 1, featureExtractor)
    VI = ValueIteration()
    
    util.simulate(targetMDP, QL, numTrials=30000)
    VI.solve(targetMDP)

    diffPolicyStates = []
    QL.explorationProb = 0
    for state in targetMDP.states:
        #print state, QL.getAction(state), VI.pi[state]
        if QL.getAction(state) != VI.pi[state]:
            diffPolicyStates.append(state)
    print("%d/%d = %f%% different states"%(len(diffPolicyStates), len(targetMDP.states), len(diffPolicyStates)/float(len(targetMDP.states))))
Example 19
def problem4c():
    print('\n4c now')
    largeMDP.computeStates()

    QL_1 = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(),
                              identityFeatureExtractor, 0.2)
    QL_2 = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(),
                              blackjackFeatureExtractor, 0.2)
    QLReward_1 = util.simulate(largeMDP, QL_1, numTrials=30000)
    QLReward_2 = util.simulate(largeMDP, QL_2, numTrials=30000)

    print('QL reward using identityFeatureExtractor: {}'.format(
        sum(QLReward_1) / float(len(QLReward_1))))
    print('QL reward using blackjackFeatureExtractor: {}'.format(
        sum(QLReward_2) / float(len(QLReward_2))))
Example 20
def weight_averages():
    mdp = model.DisasterMDP()
    random.seed(42)
    print('=' * 6, 'initialization', '=' * 6)
    qLearningSolver = util.QLearningAlgorithm(
        mdp.actions, 1, model.joint_bucket_max_feature_extractor)
    print('=' * 6, 'simulating', '=' * 6)
    totalQLRewards, _, _, _ = util.simulate(mdp,
                                            qLearningSolver,
                                            numTrials=num_trials)
    print('Avg QL Reward:', sum(totalQLRewards) / len(totalQLRewards))
    weights = qLearningSolver.weights
    labels = ['resources', 'severities', 'max', 'joint']
    counter = [0, 0, 0, 0]
    sums = [0, 0, 0, 0]
    for w, val in weights.items():
        if 'resource' in w:
            if 'severity' in w:
                counter[3] += 1
                sums[3] += abs(val)
            else:
                counter[0] += 1
                sums[0] += abs(val)
        elif 'severity' in w:
            counter[1] += 1
            sums[1] += abs(val)
        elif 'max_severity' in w:
            counter[2] += 1
            sums[2] += abs(val)
    for i in range(len(sums)):
        sums[i] /= counter[i]
    return labels, sums
Example 21
def main():

    with open('true__op.json') as f:     op = json.load(f)          # read parameters
    with open(op['optics']) as f:      opt_op = json.load(f)
    ctf_op = opt_op['ctf']


    v = TIF.pickle_load(op['maps file'])[op['pid']]              # load a 3D density map
    if not op['intensity_positive']:    v = -v

    v = GR.rotate_pad_zero(v, angle=op['rotate_angle'], loc_r=op['translation'])       # rotate image

    p = N.squeeze(v.sum(axis=2))            # make a projection image along z-axis

    ctf = IOC.create(size=p.shape, Dz=ctf_op['Dz'], pix_size=ctf_op['pix_size'], voltage=ctf_op['voltage'], Cs=ctf_op['Cs'], sigma=ctf_op['sigma'])['ctf']

    p_var = p.var()
    n_var = p_var / opt_op['snr']


    # simulate a number of images
    imgs = []
    for i in range(op['image_num']):
        print '\r', i, '                   ',
        sys.stdout.flush()
        imgs.append(util.simulate(p=p, ctf=ctf, noise_total_var=n_var))


    with open(op['images_out'], 'wb') as f:     pickle.dump(imgs, f, protocol=-1)
Example 22
File: ep3.py Project: pepedrog/BCC
def print_algorithms_compare(mdp, ql, episodes):
    vi = ValueIteration()
    vi.solve(mdp)
    rewards_vi = sum(simulateVI(mdp, vi, episodes))
    rewards_ql = sum(util.simulate(mdp, ql, episodes))
    print("VI | %.4f" % (rewards_vi / episodes))
    print("QL | %.4f" % (rewards_ql / episodes))
Example 23
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(original_mdp)
    rl = util.FixedRLAlgorithm(valueIteration.pi)
    rewards = util.simulate(modified_mdp, rl)
    print(sum(rewards) / len(rewards))
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor)
    rewards = util.simulate(original_mdp, rl, numTrials=30000)
    rewards = util.simulate(modified_mdp, rl, numTrials=30000)
    print(sum(rewards) / len(rewards))
    # END_YOUR_CODE
Example 24
def find_weights_distribution():
    mdp = model.DisasterMDP()
    random.seed(42)
    print('=' * 6, 'initialization', '=' * 6)
    qLearningSolver = util.QLearningAlgorithm(
        mdp.actions, 1, model.joint_bucket_max_feature_extractor)
    print('=' * 6, 'simulating', '=' * 6)
    totalQLRewards, _, _, _ = util.simulate(mdp,
                                            qLearningSolver,
                                            numTrials=num_trials)
    print('Avg QL Reward:', sum(totalQLRewards) / len(totalQLRewards))
    weights = qLearningSolver.weights
    sorted_weights = sorted(weights.items(), key=lambda kv: abs(kv[1]))
    print('Here are the top 10% of weights by absolute value')
    num_keep = 100
    highest_weights = sorted_weights[-1 * num_keep:]
    labels = ['resources', 'severities', 'max', 'joint']
    counter = [0, 0, 0, 0]
    for w, _ in highest_weights:
        if 'resource' in w:
            if 'severity' in w:
                counter[3] += 1
            else:
                counter[0] += 1
        elif 'severity' in w:
            counter[1] += 1
        elif 'max_severity' in w:
            counter[2] += 1
    return labels, counter
Example 25
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE

    # for reproducibility
    random.seed(123)
    # initialization
    mdp.computeStates()  # to get the whole State Space of MDP
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, 0.2)
    # Value Iteration part
    algorithm = ValueIteration()
    algorithm.solve(
        mdp, .001)  # algorithm now contains the Value and Policy of the mdp.
    # Q-Learning part
    util.simulate(
        mdp, rl, 30000
    )  # Model-Free, Simulate 30000 times. After this Q-learning has been learned.
    rl.explorationProb = 0  # set ε to 0 and then .getAction(state) works as a policy Π.
    Qpi = {}
    comparison = []
    for state in mdp.states:  # get the Q-learning policy and comparison results.
        Qpi[state] = rl.getAction(state)
        comparison.append(int(algorithm.pi[state] == Qpi[state]))
    if featureExtractor == identityFeatureExtractor:
        if mdp.multiplicity == 2:
            print(
                "The match rate of using identityFeatureExtractor for smallMDP: %.4f"
                % (sum(comparison) / len(comparison)))
            print(
                "Number of different actions: %d  Number of total actions: %d"
                % (len(comparison) - sum(comparison), len(comparison)))
        else:
            print(
                "The match rate of using identityFeatureExtractor for largeMDP: %.4f"
                % (sum(comparison) / len(comparison)))
            print(
                "Number of different actions: %d  Number of total actions: %d"
                % (len(comparison) - sum(comparison), len(comparison)))
    else:
        print(
            "The match rate of using blackjackFeatureExtractor for largeMDP: %.4f"
            % (sum(comparison) / len(comparison)))
        print("Number of different actions: %d  Number of total actions: %d" %
              (len(comparison) - sum(comparison), len(comparison)))
Example 26
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    listSmall = util.simulate(mdp, rl, 30000)
    print listSmall
Example 27
def main():
    # smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2,
    #                        threshold = 15, peekCost = 1)
    # mdp_1 = QLearningAlgorithm(
    #    MDP1.actions, MDP1.discount(), identityFeatureExtractor)
    # mdp_2 = QLearningAlgorithm(
    #   MDP2.actions, MDP1.discount(), identityFeatureExtractor)
    vi = ValueIteration()
    vi.solve(largeMDP)
    for _, val in vi.pi.items():
        print(val)
    l_mdp_identity = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(),
                                        identityFeatureExtractor)
    l_mdp_blackjack = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(),
                                         blackjackFeatureExtractor)
    print(util.simulate(largeMDP, l_mdp_identity, numTrials=30000, maxIterations=10, verbose=False))
    print(util.simulate(largeMDP, l_mdp_blackjack, numTrials=30000, maxIterations=10, verbose=False))
Example 28
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(mdp)
    vi_pi = valueIteration.pi
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, numTrials=30000, verbose=False)
    rl.explorationProb = 0
    diff, total = 0, len(mdp.states)
    for state in mdp.states:
        if vi_pi[state] != rl.getAction(state):
            diff += 1
    print('{:.3f}'.format(100 * diff / total) + '%')
Example 29
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    rewards = util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000)
    print "Expected Reward on modified mdp using original mdp policy: %i" % (
        float(sum(r for r in rewards)) / len(rewards))
    rewards_new = util.simulate(
        modified_mdp,
        QLearningAlgorithm(modified_mdp.actions, original_mdp.discount(),
                           featureExtractor, 0.1), 10000)
    print "Expected Reward on modified mdp using Q Learning: %i" % (
        float(sum(r for r in rewards_new)) / len(rewards_new))
Example 30
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIterOriginal = util.ValueIteration()
    valueIterOriginal.solve(original_mdp)
    fixedRL = util.FixedRLAlgorithm(valueIterOriginal.pi)
    rewards = util.simulate(modified_mdp, fixedRL)
    print("Fixed RL")
    for reward in rewards:
        print(reward)
    rewardsFromQ = util.simulate(modified_mdp, QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor))
    print('QLearn')
    for reward in rewardsFromQ:
        print(reward)
Example 31
def simulation(mdp1, feature=submission.identityFeatureExtractor):
    learning = submission.QLearningAlgorithm(mdp1.actions, 1, feature)
    rewards = util.simulate(mdp1, learning, numTrials=30000) 

    learning.explorationProb = 0
#states = mdp1.computeStates()
    vi = submission.ValueIteration()
    vi.solve(mdp1)

    total = 0
    same = 0
    for state in mdp1.states:
        print state, vi.pi[state],learning.getAction(state)
        if ( vi.pi[state] == learning.getAction(state) ):
            same += 1
        total += 1
    print "utility %.2f same action percentage is %.2f" % ( sum(rewards) / float(len(rewards)), same / float(total))
Example 32
import util
import submission

vi = submission.ValueIteration()
vi.solve(submission.originalMDP)
fixedRLA = util.FixedRLAlgorithm(vi.pi)
rewards = util.simulate(submission.newThresholdMDP, fixedRLA, numTrials=30000) 
print "average utility " + str(sum(rewards)/float(len(rewards)))
rewards = util.simulate(submission.originalMDP, fixedRLA, numTrials=30000) 
print "average utility " + str(sum(rewards)/float(len(rewards)))

mdp2 = submission.newThresholdMDP
learning = submission.QLearningAlgorithm(mdp2.actions, 1, submission.blackjackFeatureExtractor)
rewards = util.simulate(mdp2, learning, numTrials=30000) 
print "average utility " + str(sum(rewards)/float(len(rewards)))
vi2 = submission.ValueIteration()
vi2.solve(submission.newThresholdMDP)
fixed2 = util.FixedRLAlgorithm(vi2.pi)
rewards = util.simulate(submission.newThresholdMDP, fixed2, numTrials=30000) 
print "average utility " + str(sum(rewards)/float(len(rewards)))
Example 33
for player in allPlayers:
	last_name, num, team = player.split("-", 2)
	allPlayers[player].stats = all_player_features[num + "-" + team]

'''
for p in allPlayers.keys():
	print "------------"
	print allPlayers[p].name
	print allPlayers[p].team
	print allPlayers[p].position
	print allPlayers[p].price
	print allPlayers[p].stats
	print "------------"
'''


budget = 100.0
mdp = ComputeRosterMDP(players, budget, allTeams, allPlayers)
rl = util.QLearningAlgorithm(mdp.actions, mdp.discount(), util.fantasyFeatureExtractor)
print "Finished in %s iterations" % rl.numIters
bestSequence, qRewards = util.simulate(mdp, rl, numTrials=1, maxIterations=100,verbose=True)
print "qRewards: %s" % (sum(qRewards) / len(qRewards))
bestSequenceNames = [p.name for p in bestSequence]
print "best set of players is", bestSequenceNames
# print "best set of players", rl.bestSequence

#mdp.computeStates()
		


Example 34
def testQL():
  deck = poker.Deck()
  deck.shuffle()
  mdp = None
  QL = None
  human = False
  oppType = None
  humanActions = []

    #function to load weight from file.
    #Text in file should be of the format {(feature1): value1, (feature2):value2}
  def loadWeight(fileName):
      with open(fileName,'r') as inf:
          dict_from_file = eval(inf.read())
      return collections.Counter(dict_from_file)


  userInput = raw_input('Type S to simulate QLearning, hit Enter otherwise: ')
  if(userInput == 'S' or userInput == 's'):
    
    print 'What type of opponent would you like to simulate?'
    print '0. Tight-Aggressive'
    print '1. Loose-Aggressive'
    print '2. Tight-Passive'
    print '3. Loose-Passive'
    print '4. Random'
    
    userInput = int(raw_input('Opponent Type: '))
    oppType = ''
    if userInput == 0:
      oppType = 'TAG'
    elif userInput == 1:
      oppType = 'LAG'
    elif userInput == 2:
      oppType = 'TPA'
    elif userInput == 3:
      oppType = 'LPA'
    elif userInput == 4:
      oppType = 'RANDOM'
    
    print 'How many Q-Learning trials do you wish to run?'
    print 'WARNING: We strongly recommend using 1000 trials or less'
    print 'In our experience, 1000 gets done in about 10 minutes most cases'
    print 'Anything over that can take hours'
    
    userTrial = int(raw_input('Number of trials: '))
    print 'How many tests do you want to run on the generated weight vector?'
    numIter = int(raw_input('Number of tests: '))
    mdp = util.pokerMDP(deck, oppType)
    QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2)
    
    print util.simulate(mdp, QL, numTrials=userTrial, maxIterations=10000)
    print QL.weights
    print 'Weight length: %d' %len(QL.weights)
  else:
    human = True
    print 'What type of opponent weight-vector do you wish to start with?'
    print '0. Tight-Aggressive'
    print '1. Loose-Aggressive'
    print '2. Tight-Passive'
    print '3. Loose-Passive'
    print '4. Random'
    
    userInput = int(raw_input('Opponent Type: '))
    oppType = ''
    if userInput == 0:
      
      oppType = 'TAG'
      mdp = util.pokerMDP(deck, oppType)
      QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2)
      QL.weights = loadWeight('w_tag_5k.txt')
    
    elif userInput == 1:
      
      oppType = 'LAG'
      mdp = util.pokerMDP(deck, oppType)
      QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2)
      QL.weights = loadWeight('w_lag_5k.txt')
    
    elif userInput == 2:
      
      oppType = 'TPA'
      mdp = util.pokerMDP(deck, oppType)
      QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2)
      QL.weights = loadWeight('w_tpa_5k.txt')
    
    elif userInput == 3:
      
      oppType = 'LPA'
      mdp = util.pokerMDP(deck, oppType)
      QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2)
      QL.weights = loadWeight('w_lpa_5k.txt')
    
    elif userInput == 4:
      
      oppType = 'RANDOM'
      mdp = util.pokerMDP(deck, oppType)
      QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2)
      QL.weights = loadWeight('w_random_5k.txt')
    
    print 'How many games do you wish to play?'
    print 'Choose more than 10 to enable opponent recognition'
    
    numIter = int(raw_input())
      
    

  def opponentRecognition(l):
    # Assume opponent plays 10 games
    # Features: total money bet, number of folds, number of checks, number of raises
    
    totalBet = 0
    folds = 0
    checks = 0
    raises = 0
    for action in l:
      if action[0] == 'Fold':
        folds += 1
      elif action[1] == 0:
        checks += 1 
      else :
        raises += 1
      totalBet += action[1]
    betPerRaise = (1.0 * totalBet) / raises if raises > 0 else 0  # guard against no raises
    if folds > 2:
      # tight
      if betPerRaise > 6 or 3*raises > checks:
        return 'TAG'
      else :
        return 'TPA'
    else :
      # loose
      if betPerRaise > 6 or 3*raises > checks:
        return 'LAG'
      else :
        return 'LPA'
  
  def playGame(QL, deck, table, agent, opp, human):

      def oppPlay(i,agentAction):
          oppState = (agent.hand, table.tableCards, table.bettingPot, agentAction, i)
          if not human:
              oppAction = opp.determinePolicy(oppState)
              table.incrementOppBet(oppAction[1])
              table.appendAction(oppAction)
              return oppAction
          else:
              actions = mdp.actions(oppState)
              index = input('Type action index:' + str(actions))
              print 'Your Action: ' + str(actions[index])
              table.incrementOppBet(actions[index][1])
              table.appendAction(actions[index])
              humanActions.append(actions[index])
              return actions[index]
      
      def agentPlay(i, oppAction):
          agentState = (agent.hand, table.tableCards, table.bettingPot, oppAction, i)
          agentAction = QL.getAction(agentState)
          table.incrementAgentBet(agentAction[1])
          table.appendAction(agentAction)
          if human:
            print 'Agent Action: ' + str(agentAction)
          return agentAction

      def determineFullGameWinner(deck, table, agent, opp):
        # Deal any remaining community cards, then compare hands.
        cardsNeeded = 5 - len(table.tableCards)
        for i in range(cardsNeeded):
          table.flipCard(deck)

        agentVal = agent.assessHand(table.tableCards)
        oppVal = opp.assessHand(table.tableCards)

        agentVal = (agentVal[0], sorted(agentVal[1], reverse=True))
        oppVal = (oppVal[0], sorted(oppVal[1], reverse=True))

        if agentVal[0] > oppVal[0]:
          return "Agent"
        elif agentVal[0] == oppVal[0]:
          if agentVal[1] > oppVal[1]:
            return "Agent"
          elif agentVal[1] < oppVal[1]:
            return "Opp"
          return "Tie"
        return "Opp"

      # shuffle deck
      deck.shuffle()
      # deal players
      #table.dealPlayers(agent,opp,deck)
      if human:
        print 'Your cards are: ' + str(opp.hand)

      oppAction = oppPlay(0, (None,0))
                        
      if oppAction[0] == 'Fold':
          agentUtility = table.getOppBet()
          return ('OppLeft', agentUtility)

      agentAction = agentPlay(1,oppAction)                     
      if agentAction[0] == 'Fold':
          if human:
            print 'Agent\'s hand revealed: ' + str(agent.hand)
            print 'You win: %d' %table.bettingPot
          agentUtility = -(table.getAgentBet())
          couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
          if couldHaveWon == "Agent" or couldHaveWon == "Tie":
            return ('GoodFold', agentUtility)
          return ('BadFold', agentUtility)

      # in case agent raises
      if agentAction[1] > oppAction[1]:

          oppAction = oppPlay(2, agentAction)
          if oppAction[0] == 'Fold':
            agentUtility = table.getOppBet()
            return ('OppLeft', agentUtility)

          agentAction = agentPlay(3, oppAction)
          if agentAction[0] == 'Fold':
            if human:
              print 'Agent\'s hand revealed: ' + str(agent.hand)
              print 'You win: %d' %table.bettingPot
            agentUtility = -(table.getAgentBet())
            couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
            if couldHaveWon == "Agent" or couldHaveWon == "Tie":
              return ('GoodFold', agentUtility)
            return ('BadFold', agentUtility)
    
      # deal table - flop
      table.flipCard(deck)
      table.flipCard(deck)
      table.flipCard(deck)

      if human:
          print 'Flop: ' + str(table.tableCards)
          print 'Your cards: ' + str(opp.hand)
          print 'Pot: ' + str(table.bettingPot)
                        
      # asses hand
      oppAction = oppPlay(0, (None,0))
      if oppAction[0] == 'Fold':
          agentUtility = table.getOppBet()
          return ('OppLeft', agentUtility)

      agentAction = agentPlay(1,oppAction)
      if agentAction[0] == 'Fold':
          if human:
              print 'Agent\'s hand revealed: ' + str(agent.hand)
              print 'You win: %d' %table.bettingPot
          agentUtility = -(table.getAgentBet())
          couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
          if couldHaveWon == "Agent" or couldHaveWon == "Tie":
            return ('GoodFold', agentUtility)
          return ('BadFold', agentUtility)

      # in case agent raises
      if agentAction[1] > oppAction[1]:

          oppAction = oppPlay(2, agentAction)
          if oppAction[0] == 'Fold':
            agentUtility = table.getOppBet()
            return ('OppLeft', agentUtility)

          agentAction = agentPlay(3, oppAction)
          if agentAction[0] == 'Fold':
            if human:
              print 'Agent\'s hand revealed: ' + str(agent.hand)
              print 'You win: %d' %table.bettingPot
            agentUtility = -(table.getAgentBet())
            couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
            if couldHaveWon == "Agent" or couldHaveWon == "Tie":
              return ('GoodFold', agentUtility)
            return ('BadFold', agentUtility)
    
      # deal table - turn
      table.flipCard(deck)

      if human:
          print 'Turn: ' + str(table.tableCards)
          print 'Your cards: ' + str(opp.hand)
          print 'Pot: ' + str(table.bettingPot)
                        
      # asses hand
      oppAction = oppPlay(0, (None,0))
      if oppAction[0] == 'Fold':
          agentUtility = table.getOppBet()
          return ('OppLeft', agentUtility)

      agentAction = agentPlay(1,oppAction)
      if agentAction[0] == 'Fold':
          if human:
              print 'Agent\'s hand revealed: ' + str(agent.hand)
              print 'You win: %d' %table.bettingPot
          agentUtility = -(table.getAgentBet())
          couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
          if couldHaveWon == "Agent" or couldHaveWon == "Tie":
            return ('GoodFold', agentUtility)
          return ('BadFold', agentUtility)

      # in case agent raises
      if agentAction[1] > oppAction[1]:

          oppAction = oppPlay(2, agentAction)
          if oppAction[0] == 'Fold':
            agentUtility = table.getOppBet()
            return ('OppLeft', agentUtility)

          agentAction = agentPlay(3, oppAction)
          if agentAction[0] == 'Fold':
            if human:
              print 'Agent\'s hand revealed: ' + str(agent.hand)
              print 'You win: %d' %table.bettingPot
            agentUtility = -(table.getAgentBet())
            couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
            if couldHaveWon == "Agent" or couldHaveWon == "Tie":
              return ('GoodFold', agentUtility)
            return ('BadFold', agentUtility)

      # deal table - river
      table.flipCard(deck)
      if human:
          print 'River: ' + str(table.tableCards)
          print 'Your cards: ' + str(opp.hand)
          print 'Pot: ' + str(table.bettingPot)
                        
      # asses hand
      oppAction = oppPlay(0, (None,0))
      if oppAction[0] == 'Fold':
          agentUtility = table.getOppBet()
          return ('OppLeft', agentUtility)

      agentAction = agentPlay(1,oppAction)
      if agentAction[0] == 'Fold':
          if human:
              print 'Agent\'s hand revealed: ' + str(agent.hand)
              print 'You win: %d' %table.bettingPot
          agentUtility = -(table.getAgentBet())
          couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
          if couldHaveWon == "Agent" or couldHaveWon == "Tie":
            return ('GoodFold', agentUtility)
          return ('BadFold', agentUtility)

      # in case agent raises
      if agentAction[1] > oppAction[1]:

          oppAction = oppPlay(2, agentAction)
          if oppAction[0] == 'Fold':
            agentUtility = table.getOppBet()
            return ('OppLeft', agentUtility)

          agentAction = agentPlay(3, oppAction)
          if agentAction[0] == 'Fold':
            if human:
              print 'Agent\'s hand revealed: ' + str(agent.hand)
              print 'You win: %d' %table.bettingPot
            agentUtility = -(table.getAgentBet())
            couldHaveWon = determineFullGameWinner(deck, table, agent, opp)
            if couldHaveWon == "Agent" or couldHaveWon == "Tie":
              return ('GoodFold', agentUtility)
            return ('BadFold', agentUtility)

      agentVal = agent.assessHand(table.tableCards)
      oppVal = opp.assessHand(table.tableCards)

      agentVal = (agentVal[0], sorted(agentVal[1], reverse=True))
      oppVal = (oppVal[0], sorted(oppVal[1], reverse=True))

      if human:
        print 'Agent\'s hand revealed: ' + str(agent.hand)
                        
      if agentVal[0] > oppVal[0]:
        if human:
          print 'You lose ' + str(table.bettingPot)
        return ('Win', table.getOppBet())
      elif agentVal[0] == oppVal[0]:
          if agentVal[1] > oppVal[1]:
            if human:
              print 'You lose ' + str(table.bettingPot)
            return ('Win', table.getOppBet())
          elif agentVal[1] < oppVal[1]:
              if human:
                  print 'You win ' + str(table.bettingPot)
              return ('Lose', -(table.getAgentBet()))
          return ('Win', 0) #Count ties as a win for simplicity
      if human:
          print 'You win ' + str(table.bettingPot)
      return ('Lose', -(table.getAgentBet()))


  agent = poker.Agent()
  opp = poker.Opponent(oppType)
  table = poker.Table(mdp.deck)
  stateHistory = {}
  utilityHistory = {}
  QL.explorationProb = 0 #No more exploration
  humanMultigameHistory = []
  
  for gameNum in range(numIter):
    mdp.startState()
    if human:
      mdp.table.bettingPot = 0 #Make sure starting pot is zero
    result = playGame(QL, mdp.deck, mdp.table, mdp.agent, mdp.opponent, human)
    state, utility = result[0], result[1]
    if state in stateHistory:
      stateHistory[state] += 1
    else:
      stateHistory[state] = 1
    if utility in utilityHistory:
      utilityHistory[utility] += 1
    else:
      utilityHistory[utility] = 1
    if human:
      humanMultigameHistory.append(humanActions)
      if len(humanMultigameHistory) > 10:
        humanMultigameHistory = humanMultigameHistory[1:]
      if len(humanMultigameHistory) == 10:
        humanActs = []
        for i in range(len(humanMultigameHistory)):
          for j in range(len(humanMultigameHistory[i])):
            humanActs.append(humanMultigameHistory[i][j])
        newOppType = opponentRecognition(humanActs)
        if newOppType != oppType:
          print 'You\'re playing more like a %s player' %newOppType
          print 'Loading %s weight vector' %newOppType
          oppType = newOppType
          mdp = util.pokerMDP(deck, oppType)
          QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2)
          if newOppType == 'TAG':
            QL.weights = loadWeight('w_tag_5k.txt')
          elif newOppType == 'LAG':
            QL.weights = loadWeight('w_lag_5k.txt')
          elif newOppType == 'TPA':
            QL.weights = loadWeight('w_tpa_5k.txt')
          else: #'LPA'
            QL.weights = loadWeight('w_lpa_5k.txt')
    deck.reset()
    agent = poker.Agent() #Easy way to reset agent, opp, table
    opp = poker.Opponent(oppType)
    table = poker.Table(mdp.deck)
  return stateHistory, utilityHistory
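# The loadWeight helper above eval()s the weight file, which executes
# arbitrary code. Assuming the file really contains a plain Python dict
# literal with tuple keys, a safer hedged alternative is ast.literal_eval:
import ast
import collections

def loadWeightSafe(fileName):
    """Like loadWeight above, but parses the dict literal without eval()."""
    with open(fileName, 'r') as inf:
        weights = ast.literal_eval(inf.read())
    return collections.Counter(weights)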
Example 35
    # END_YOUR_CODE

###########################################################
# Problem 4b: convergence of Q-learning

# Small test case
smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

# Large test case
largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)

vi = ValueIteration()
vi.solve(largeMDP)

ql = QLearningAlgorithm(largeMDP.actions, 1, blackjackFeatureExtractor, explorationProb=0.2)
util.simulate(largeMDP, ql, 30000, 1000, False, False)
ifPrint = False
c = 0.0
for state in largeMDP.states:
    QLpi=max((ql.getQ(state, action), action) for action in largeMDP.actions(state))[1] 
    if vi.pi[state] != QLpi:
        c += 1
        ifPrint = True
        print state, 'VI: ',vi.pi[state],'vs  ', 'QL: ', QLpi 
print c / len(largeMDP.states)
if not ifPrint: 
    print 'All policies are same!'
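# The loop above queries ql.getQ directly. In the linear function-approximation
# Q-learner used throughout these examples, getQ is usually a dot product of
# the weight vector with the extracted features; a hedged standalone sketch
# (names and toy values are illustrative):
def linear_q_value(weights, featureExtractor, state, action):
    """Q(state, action) under a linear model: sum of weight * feature value."""
    return sum(weights.get(key, 0.0) * value
               for key, value in featureExtractor(state, action))

# Toy usage with an indicator feature extractor and a hand-set weight:
toy_weights = {(('s1',), 'Take'): 2.0}
toy_extractor = lambda s, a: [((s, a), 1)]
print(linear_q_value(toy_weights, toy_extractor, ('s1',), 'Take'))  # 2.0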
############################################################
# Problem 4d: What happens when the MDP changes underneath you?!

# Original mdp
Example 36
def runQLearning():
    global model

    days = list(range(len(X)))
    print len(days)

    randomRewards = 0.0
    testRewards = 0.0

    # Test separately on 100-day periods
    period = 100
    numSets = len(days)/period
    testSets = [n * period for n in range(numSets)]
    for n in testSets:

        print 'Testing on days %d - %d:' % (n, n+period)

        # Test on [n, n + period] examples
        testDays = days[n:n+period]

        # Train on all remaining examples
        trainDays = [d for d in days if d not in testDays]

        # Make train & test MDPs
        trainMDP = GoldMDP(trainDays)
        trainMDP.computeStates()
        testMDP = GoldMDP(testDays)
        testMDP.computeStates()

        # Train linear prediction model on train set
        model = linear_model.LinearRegression()
        model.fit(X[trainDays], Y[trainDays])

        # Measure classification accuracy on test set
        Y_pred = model.predict(X[testDays])
        Y_actual = Y[testDays]
        correct = 0
        for i in range(len(Y_pred)):
            if Y_pred[i] *  Y_actual[i] >= 0: correct = correct + 1
        print "Accuracy = %.2f" % (float(correct)/len(Y_pred))

        # Learn (reinforcement Q-learning) on trainMDP, choosing all random actions
        rl = QLearningAlgorithm(trainMDP.actions, trainMDP.discount(), predictFeatureExtractor, 1.0)
        rewards = util.simulate(trainMDP, rl, 1)
        print rl.weights
        print rewards
        randomRewards = randomRewards + rewards[0]

        # Run with trained RL algorithm on testMDP, choosing all max actions
        rl.explorationProb = 0.0
        rewards = util.simulate(testMDP, rl, 1)
        print rl.weights
        print rewards
        testRewards = testRewards + rewards[0]
        print "Average rewards per day =", rewards[0]/period

    # Output total profit & average daily profit
    print 'Random rewards = %.2f' % randomRewards
    print 'Test rewards = %.2f' % testRewards

    numDays = (len(days)/period) * period
    print 'Avg random = %.2f' % (randomRewards/numDays)
    print 'Avg test = %.2f' % (testRewards/numDays)
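# The accuracy loop above counts a prediction as correct when it has the same
# sign as the actual value. A compact, hedged restatement of that check as a
# standalone helper (toy numbers are illustrative):
def sign_accuracy(predicted, actual):
    """Fraction of predictions whose sign matches the actual value."""
    correct = sum(1 for p, a in zip(predicted, actual) if p * a >= 0)
    return float(correct) / len(predicted)

# Toy usage:
print(sign_accuracy([0.3, -0.2, 0.1], [1.0, -0.5, -0.4]))  # ~0.667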