Example #1
File: modelfree.py Project: inachen/CS181
def get_target(score):

    actions = darts.get_actions()
    
    a = q_values[score].index(max(q_values[score]))

    return actions[a]
Example #2
def Q_learning(gamma, numRounds, alpha):
  states = darts.get_states()
  actions = darts.get_actions()
  currentRound = 0
  Q = {}
  for s in states:
    Q[s] = [0] * len(actions)

  for i in range(numRounds):
    s = throw.START_SCORE
    numiterations = 0
    while s > 0:
      randAction = random.randint(0, len(actions) - 1)
      maxAction = Q[s].index(max(Q[s]))

      #a = ex_strategy_one(Q, randAction, maxAction)
      a = ex_strategy_two(numRounds, currentRound, Q, len(actions), s)
      action = actions[a]

      s_prime = s - throw.location_to_score(action)
      if s_prime < 0:
        s_prime = s
      maxQ = 0.0
      for a_prime in range(len(actions)):
        if Q[s_prime][a_prime] > maxQ:
          maxQ = Q[s_prime][a_prime]
      Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
      s = s_prime
      currentRound += 1
Example #3
File: mdp.py Project: acutkosky/cs181
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s

  #so let's iterate over the possible places on the board we will hit and add up the ones that give the right score reduction

  if(s_prime>s):
    return 0.0

  if(s == 0 and s_prime == 0):
    return 1.0

  regions = {CENTER:0, INNER_RING:1, FIRST_PATCH:2, MIDDLE_RING:3, SECOND_PATCH:4,OUTER_RING:5,MISS:6}


  actions = darts.get_actions()

  score_diff = s-s_prime

  prob = 0.0

  wedge = throw.angles[a.wedge]
  ring = a.ring
  for wdel in range(-2,3):
    for rdel in range(-2,3):
      wedge_p = throw.wedges[(wdel+wedge)%NUM_WEDGES]
      ring_p = abs(ring+rdel)
      dscore = throw.location_to_score(throw.location(ring_p,wedge_p))
      if(dscore == score_diff):
        prob += 0.4/(2**abs(wdel))*0.4/(2**abs(rdel))
  return prob
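Since T(a, s, s_prime) is meant to be a probability distribution over next states for a fixed action and score, a quick way to sanity-check it is to sum it over all states for each action (essentially what count_stuff in the later examples does). A minimal sketch; the helper name check_transition_totals is not part of the assignment code, and it assumes the T, darts, and throw definitions above:

def check_transition_totals(s):
  # for each aiming point, add up the probability mass T assigns to every next state
  for a in darts.get_actions():
    total = 0.0
    for s_prime in darts.get_states():
      total += T(a, s, s_prime)
    # each total should be close to 1.0 whenever no neighbouring region can overshoot s,
    # because the wedge and ring offsets each carry weight 0.4*(1 + 2*(1/2) + 2*(1/4)) = 1
    print a, total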
Example #4
def Q_learning(gamma, numRounds, alpha):
  states = darts.get_states()
  actions = darts.get_actions()
  Q = {}
  for s in states:
    Q[s] = [0] * len(actions)

  totaliter = 0
  for i in range(numRounds):
    s = throw.START_SCORE
    numiterations = 0
    while s > 0:
      randAction = random.randint(0, len(actions) - 1)
      maxAction = Q[s].index(max(Q[s]))

      a = ex_strategy_one(numRounds, i, randAction, maxAction)
      #a = ex_strategy_two(numRounds, i, Q, len(actions), s)
      action = actions[a]

      s_prime = s - throw.location_to_score(action)
      if s_prime < 0:
        s_prime = s
      maxQ = 0.0
      for a_prime in range(len(actions)):
        if Q[s_prime][a_prime] > maxQ:
          maxQ = Q[s_prime][a_prime]
      Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
      s = s_prime
      numiterations += 1
    totaliter += numiterations

  print "Average number of throws: " + str(float(totaliter) / numRounds)
Example #5
def Q_learning(gamma, numRounds, alpha):
  states = darts.get_states()
  actions = darts.get_actions()

  Q = {}
  for s in states:
    Q[s] = [0] * len(actions)

  for i in range(numRounds):

    s = throw.START_SCORE

    numiterations = 0

    while s > 0:
      randAction = random.randint(0, len(actions) - 1)
      maxAction = Q[s].index(max(Q[s]))

      #a = ex_strategy_one(Q, randAction, maxAction)
      a = ex_strategy_two(Q, randAction, maxAction)
      action = actions[a]

      s_prime = s - throw.location_to_score(action)
      if s_prime < 0:
        s_prime = s

      maxQ = 0.0
      for a_prime in range(len(actions)):
        if Q[s_prime][a_prime] > maxQ:
          maxQ = Q[s_prime][a_prime]

      Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])

      s = s_prime
Example #6
File: mdp.py Project: nealwu/CS181-HW
def infiniteValueIteration(gamma):
  # takes a discount factor gamma and convergence cutoff epsilon
  # returns

  V = {}
  Q = {}
  V_prime = {}
  
  states = darts.get_states()
  actions = darts.get_actions()

  notConverged = True

  # initialize value of each state to 0
  for s in states:
    V[s] = 0
    Q[s] = {}

  # until convergence is reached
  while notConverged:

    # store values from previous iteration
    for s in states:
      V_prime[s] = V[s]

    # update Q, pi, and V
    for s in states:
      for a in actions:

        # given current state and action, sum product of T and V over all states
        summand = 0
        for s_prime in states:
          summand += T(a, s, s_prime)*V_prime[s_prime]

        # update Q
        Q[s][a] = darts.R(s, a) + gamma*summand

      # given current state, store the action that maximizes V in pi and the corresponding value in V
      PI[s] = actions[0]
      V[s] = Q[s][PI[s]]
      for a in actions:
        if V[s] <= Q[s][a]:
          V[s] = Q[s][a]
          PI[s] = a

    notConverged = False
    for s in states:
      if abs(V[s] - V_prime[s]) > EPSILON:
        notConverged = True

  # Print table of optimal policy, for writeup purposes
  def print_table():
    for s in [9, 8, 7, 6, 5, 4, 3, 2, 1]:
      EPs = map(lambda a : EPoints(a, s), actions)
      print "  " + str(s) + "     & (" + str(PI[s].wedge) + ", " + str(PI[s].ring) + ")            & " + str(EPoints(PI[s], s)) + "      & [" + str(min(EPs)) + ", " + str(max(EPs)) + "]  & " + str(s - EPoints(PI[s], s)) + "              \\\\"
    print V

  print_table()
Example #7
File: modelfree.py Project: inachen/CS181
def start_game(gamma, learning_rate, num_games):

    actions = darts.get_actions()
    states = darts.get_states()

    Q_learning(gamma, learning_rate, num_games)

    a = q_values[throw.START_SCORE].index(max(q_values[throw.START_SCORE]))

    return actions[a]
Example #8
def start_game():
    global states, actions, Q
    states = darts.get_states()
    actions = darts.get_actions()

    for s in states:
        Q[s] = {}
        for a in range(len(actions)):
            Q[s][a] = 0

    return throw.location(throw.INNER_RING, throw.NUM_WEDGES)
Example #9
File: mdp.py Project: inachen/CS181
def infiniteValueIteration(gamma):
  # takes a discount factor gamma and convergence cutoff epsilon
  # returns

  V = {}
  Q = {}
  V_prime = {}
  
  states = darts.get_states()
  actions = darts.get_actions()

  notConverged = True

  # initialize value of each state to 0
  for s in states:
    V[s] = 0
    Q[s] = {}

  # until convergence is reached
  while notConverged:

    # store values from previous iteration
    for s in states:
      V_prime[s] = V[s]

    # update Q, pi, and V
    for s in states:
      for a in actions:

        # given current state and action, sum product of T and V over all states
        summand = 0
        for s_prime in states:
          summand += T(a, s, s_prime)*V_prime[s_prime]

        # update Q
        Q[s][a] = darts.R(s, a) + gamma*summand

      # given current state, store the action that maximizes V in pi and the corresponding value in V
      PI[s] = actions[0]
      V[s] = Q[s][PI[s]]
      for a in actions:
        if V[s] <= Q[s][a]:
          V[s] = Q[s][a]
          PI[s] = a

    notConverged = False
    for s in states:
      if abs(V[s] - V_prime[s]) > EPSILON:
        notConverged = True
      
# test_score = 9
# test_action = throw.location(throw.OUTER_RING,4)

# print T(test_action,test_score,5)
Example #10
def modelbased_value_iteration(gamma, T_matrix, pi_star, V_n={}):
  V = {}
  V[0] = {}
  V[1] = {}
  converging = 0
  num_iterations = 0
  Q = {}

  # Get all possible actions
  actions = darts.get_actions()

  states = darts.get_states()

  # initialize v
  if len(V_n) == 0:
    for s in states:
      V[0][s] = 0
      V[1][s] = 0
  else:
    for s in states:
      V[0][s] = V_n[s]
      V[1][s] = V_n[s]

  # iterate until all state values (v[s]) converge 
  while not(converging):
    num_iterations += 1
    for s in states:
      for a in range(len(actions)):

        # find the value of each action, given state s 
        Q[a] = darts.R(s, actions[a])
        for s_prime in states:

          Q[a] += gamma * T_matrix[s][s_prime][a] * V[0][s_prime]

        # find the action that maximizes Q and the maximum value of Q
        if a == 0 or (Q[a] >= V[1][s]):
          pi_star[s] = a
          V[1][s] = Q[a]

                  
    # values of v for iteration k become the values of v for iteration k-1
    converging = True
    for s in states:
      # check for one component that does not converge
      if EPSILON_VI < abs(V[0][s] - V[1][s]):
        converging = False

      V[0][s] = V[1][s]

  return T_matrix, pi_star, Q, V[1]

  
Example #11
def infiniteValueIteration(gamma):
    # takes a discount factor gamma and convergence cutoff epsilon
    # returns

    V = {}
    Q = {}
    V_prime = {}

    states = darts.get_states()
    actions = darts.get_actions()

    notConverged = True

    # initialize value of each state to 0
    for s in states:
        V[s] = 0
        Q[s] = {}

    # until convergence is reached
    while notConverged:

        # store values from previous iteration
        for s in states:
            V_prime[s] = V[s]

        # update Q, pi, and V
        for s in states:
            for a in actions:

                # given current state and action, sum product of T and V over all states
                summand = 0
                for s_prime in states:
                    summand += T(a, s, s_prime) * V_prime[s_prime]

                # update Q
                Q[s][a] = darts.R(s, a) + gamma * summand

            # given current state, store the action that maximizes V in pi and the corresponding value in V
            PI[s] = actions[0]

            # bug fix from piazza post 283
            V[s] = Q[s][PI[s]]

            for a in actions:
                if V[s] <= Q[s][a]:
                    V[s] = Q[s][a]
                    PI[s] = a

        notConverged = False
        for s in states:
            if abs(V[s] - V_prime[s]) > EPSILON:
                notConverged = True
Example #12
def start_game(gamma):
  global GAMMA
  global Q
  global actions
  GAMMA = gamma
  states = darts.get_states()
  actions = darts.get_actions()
  for s in states:
    Q[s] = {}
    for a in actions:
      Q[s][a]=100


  return choice(actions)#(throw.location(throw.INNER_RING, throw.NUM_WEDGES)) 
Example #13
def start_game():
    global actions, s_old, a_old
    if actions == None:
        print "GAMMA: ", darts.GAMMA
        print "LEARNING_RATE: ", LEARNING_RATE
        print "strategy: ", darts.strategy
        actions = darts.get_actions()
        for s in darts.get_states():
            Q[s] = {}
            for a in actions:
                Q[s][a] = 0.
    s_old = throw.START_SCORE
    a_old = actions[-15]
    return a_old
Example #14
def modelfree(alpha, gamma, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_iterations = 0
    Q = [[]] * len(states)

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        Q[s] = [0] * len(actions)

    # play num_games games
    for g in range(1, num_games + 1):
        #print str(g) + "/" + str(num_games)

        # run a single game
        s = throw.START_SCORE
        while s > 0:

            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            a = ex_strategy_one(num_iterations, actions, pi_star, s)
            #a = ex_strategy_two(num_iterations, Q, actions, s, pi_star)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = int(s - throw.location_to_score(loc))
            if s_prime < 0:
                s_prime = s
            
            max_Q = max(Q[s_prime])
            Q[s][a] += alpha * (darts.R(s, actions[a]) + gamma * max_Q - Q[s][a])
            pi_star[s] = Q[s].index(max(Q[s]))

            # Next state becomes current state
            s = s_prime

    print "Average turns = ", float(num_iterations)/float(num_games)
Example #15
File: modelfree.py Project: slymnefe/cs181
def start_game():
    global last_score, last_action, actions

    if last_score is None:
        actions = darts.get_actions()
        for s in darts.get_states():
            Q[s] = {}
            for a in actions:
                Q[s][a] = 0.

    last_score = throw.START_SCORE

    print >> sys.stderr, 'start'

    last_action = throw.location(throw.INNER_RING, throw.NUM_WEDGES)
    print >> sys.stderr, last_action
    return last_action
Example #16
File: modelfree.py Project: vlsi1217/cs181
def start_game():
  global last_score, last_action, actions

  if last_score is None:
    actions = darts.get_actions()
    for s in darts.get_states():
      Q[s] =  {}
      for a in actions:
        Q[s][a] = 0.

  last_score = throw.START_SCORE

  print >>sys.stderr, 'start'

  last_action = throw.location(throw.INNER_RING, throw.NUM_WEDGES)
  print >>sys.stderr, last_action
  return last_action
Example #17
def modelbased_value_iteration(gamma, T_matrix, pi_star):
    V = {}
    V[0] = {}
    V[1] = {}
    converging = 0
    num_iterations = 0
    Q = {}

    # Get all possible actions
    actions = darts.get_actions()

    states = darts.get_states()

    # initialize v
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # iterate until all state values (v[s]) converge
    while not (converging):
        num_iterations += 1
        for s in states:
            for a in range(len(actions)):

                # find the value of each action, given state s
                Q[a] = darts.R(s, actions[a])
                for s_prime in states:

                    Q[a] += gamma * T_matrix[s][s_prime][a] * V[0][s_prime]

                # find the action that maximizes Q and the maximum value of Q
                if a == 0 or (Q[a] >= V[1][s]):
                    pi_star[s] = a
                    V[1][s] = Q[a]

        # values of v for iteration k become the values of v for iteration k-1
        converging = True
        for s in states:
            # check for one component that does not converge
            if EPSILON_VI < abs(V[0][s] - V[1][s]):
                converging = False

            V[0][s] = V[1][s]

    return T_matrix, pi_star
Example #18
File: modelfree.py Project: inachen/CS181
def Q_learning(gamma, learning_rate, num_games):


    g = 0
    num_iterations = 0

    # store all actions (targets on dartboard) in actions array
    # actions = darts.get_actions()
    # states = darts.get_states()


    actions = darts.get_actions()

    states = darts.get_states()
    
    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        list_a = []
        for a in range(len(actions)):
            list_a.append(0)
        q_values.append(list_a)

    for g in range(1, num_games + 1):

        # run a single game
        s = throw.START_SCORE
        while s > 0:

            num_iterations += 1

            # which strategy to use
            #to_explore = ex_strategy_one(num_iterations)
            to_explore = ex_strategy_two(num_iterations)

            if to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
            else:
                # exploit
                a = q_values[s].index(max(q_values[s]))
                action = actions[a]

            # assumed completion (not shown in the original snippet): throw the dart,
            # update the global q_values table, and advance to the next state,
            # mirroring the Q-learning updates in the other examples on this page
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s
            q_values[s][a] += learning_rate * (darts.R(s, action) + gamma * max(q_values[s_prime]) - q_values[s][a])
            s = s_prime
Example #19
def start_game():
  global Q, states, actions, cur_s, last_a, throws

  cur_s = throw.START_SCORE
  throws = 1
  
  # only initialize states, actions, and Q once
  if states == None:
    states = darts.get_states()
  if actions == None:
    actions = darts.get_actions()

  if len(Q) == 0:
    for s in states:
      Q[s] = {}
      for a in actions:
        Q[s][a] = 0.0
  
  # start by returning uniform random action
  last_a = choice(actions)
  return last_a
Example #20
def start_game():
    global Q, states, actions, cur_s, last_a, throws

    cur_s = throw.START_SCORE
    throws = 1

    # only initialize states, actions, and Q once
    if states == None:
        states = darts.get_states()
    if actions == None:
        actions = darts.get_actions()

    if len(Q) == 0:
        for s in states:
            Q[s] = {}
            for a in actions:
                Q[s][a] = 0.0

    # start by returning uniform random action
    last_a = choice(actions)
    return last_a
Example #21
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0
    
    
    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        
        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0


    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
    
        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:

            num_iterations += 1
            throws += 1
                
            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
                        
            if(strategy_idx==1):
                to_explore = ex_strategy_one(s,g)
            else:
                to_explore = ex_strategy_two(s,g)
                
            if to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1
    
            
            #print "a", a, "action",action
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action) 
            delta =  throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

                
            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            this_lr = 1.0 / num_actions[s][a]  # float division: learning rate decays as 1/N(s,a)
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state 
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws, 
            # using infinite-horizon value iteration. 
                
        #print "Game",g,"took",throws,"throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g,throws,"%1.4f" % (float(explores)/(explores+exploits))
    avg = float(num_iterations)/float(num_games)
    return avg
Example #22
import random
import throw
import darts
 
# The default player aims for the maximum score, unless the
# current score is less than the number of wedges, in which
# case it aims for the exact score it needs. 
#  
# You may use the following functions as a basis for 
# implementing the Q learning algorithm or define your own 
# functions.

ACTIVE_STRATEGY = 1

actions = darts.get_actions()
states = darts.get_states()

gamma = .5
learning_rate = .1
num_games = 10

#def start_game():
#    num_throws_this = 1
#    last_action = throw.location(throw.INNER_RING, throw.NUM_WEDGES)
#    return(last_action)

#def update_counts(score):
#    last_delta = score - last_state
#    update_T(last_state, last_action, last_delta)

#def get_target(score):
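The header comment above describes the default player: aim for the maximum score unless the current score is below the number of wedges, in which case aim for exactly the score that is needed. A minimal sketch of that policy; the name default_target is ours, and it relies only on darts.get_actions() and throw.location_to_score() as used elsewhere on this page:

def default_target(score):
    actions = darts.get_actions()
    if score < throw.NUM_WEDGES:
        # aim at a target worth exactly the remaining score, if one exists
        for action in actions:
            if throw.location_to_score(action) == score:
                return action
    # otherwise aim at the highest-scoring target
    return max(actions, key=throw.location_to_score)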
Example #23
import random
import throw
import darts

# The default player aims for the maximum score, unless the
# current score is less than the number of wedges, in which
# case it aims for the exact score it needs.
#
# You may use the following functions as a basis for
# implementing the Q learning algorithm or define your own
# functions.

ACTIVE_STRATEGY = 1

actions = darts.get_actions()
states = darts.get_states()

gamma = .5
learning_rate = .1
num_games = 10

#def start_game():
#    num_throws_this = 1
#    last_action = throw.location(throw.INNER_RING, throw.NUM_WEDGES)
#    return(last_action)

#def update_counts(score):
#    last_delta = score - last_state
#    update_T(last_state, last_action, last_delta)

#def get_target(score):
Example #24
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):

        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:

            num_iterations += 1
            throws += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            if (strategy_idx == 1):
                to_explore = ex_strategy_one(s, g)
            else:
                to_explore = ex_strategy_two(s, g)

            if to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1

            #print "a", a, "action",action
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            delta = throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            this_lr = 1.0 / num_actions[s][a]  # float division: learning rate decays as 1/N(s,a)
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

        #print "Game",g,"took",throws,"throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g, throws, "%1.4f" % (float(explores) / (explores + exploits))
    avg = float(num_iterations) / float(num_games)
    return avg
Example #25
File: mdp_2.py Project: vlsi1217/cs181
def count_stuff(s,s_prime):
  total = 0.0
  for a in darts.get_actions():
    total = total + T(a, s, s_prime)
  return total
Example #26
def count_stuff(s, s_prime):
    total = 0.0
    for a in darts.get_actions():
        total = total + T(a, s, s_prime)
    return total
Example #27
def modelbased(gamma, epoch_size, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0

    # initialize v
    V = {}
    V[0] = {}
    V[1] = {}
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        iterations_this_game = 0
        Q = {}

        # run a single game
        s = throw.START_SCORE
        while s > 0:
            iterations_this_game += 1
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            a = ex_strategy_one(actions, pi_star, s, iterations_this_game)
            # a = ex_strategy_two(actions, Q, s, iterations_this_game)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

            if num_iterations % epoch_size == 0:

                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(
                                    num_transitions[i][j][k]) / float(
                                        num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q, V = modelbased_value_iteration(
                    gamma, T_matrix, pi_star, actions, states, V)

    avg_turns = float(num_iterations) / float(num_games)
    print "Average turns = ", avg_turns
    return avg_turns
Example #28
def modelbased(gamma, epoch_size, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()
    
    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0
    Q = {}
    
    
    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        Q[s] = {}
        
        for a in range(len(actions)):
            num_actions[s][a] = 0
            Q[s][a] = 0
            
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0


    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
    
        # run a single game
        s = throw.START_SCORE
        while s > 0:

            num_iterations += 1
    
            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            #to_explore = ex_strategy_one(s, num_iterations)
            # Second strategy
            to_explore = 2
            newindex, newaction = ex_strategy_two(s, num_iterations, Q, actions)
    
            if to_explore == 2:
                a = newindex
                action = newaction
            elif to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
            else:
                # exploit
                a = pi_star[s]
                action = actions[a]
            
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s
                
            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

            if num_iterations % epoch_size == 0:

                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q = modelbased_value_iteration(gamma, T_matrix, pi_star)
    
    print "Average turns = ", float(num_iterations)/float(num_games)
Example #29
File: modelfree.py Project: rlucioni/cs181
def Q_learning(gamma, alpha, num_games):
  
  # set these to values that make sense!
  #alpha = .5
  #gamma = .3

  Q = {}
  states = darts.get_states()
  actions = darts.get_actions()
  
  num_iterations = 0
  
  num_total_iterations = 1
  # Initialize all the Q values to zero
  for s in states:
    Q[s]= {}
    for a in actions:
        Q[s][a] = 0
   
  for g in range(1, num_games + 1):
    #print "Average turns = ", float(num_iterations)/float(g)
    #print "GAME {}".format(g)
    # run a single game
    s = throw.START_SCORE
    gamethrows = 0
    while s > 0:
      num_total_iterations += 1  
      gamethrows += 1
      # The following two statements implement two exploration-exploitation
      # strategies. Comment out the strategy that you wish not to use.
 	  
      #to_explore = ex_strategy_one(num_iterations)
      to_explore = ex_strategy_two(num_total_iterations)
      #to_explore = ex_strategy_three(g, num_games)
      
      action = 0 
      
      if to_explore:
        #explore
        #print "explore\n"
        a = random.randint(0, len(actions)-1)
        
        action = actions[a]
      #  print "action {}".format(action)
      else:
        # exploit
        num_iterations += 1
        #print "exploit\n"
        action = lookup_max_a(Q,s, actions)
        #print "action {}".format(action)
        #action = a # actions[a]


      # Get result of throw from dart thrower; update score if necessary
      loc = throw.throw(action) 
      #print "score {}".format(s)
      #print "throw value:{}".format(throw.location_to_score(loc))
      #should reward be based on action of loc?
      reward = darts.R(s,action) 
      #print "reward {}".format(reward)
      s_prime = s - throw.location_to_score(loc)
      if s_prime < 0:
        s_prime = s
                
      # now we update the q score table
      #oldQ = copy.deepcopy(Q[s][a])
      oldQ = (Q[s][action])
      #print "oldQ {}".format(oldQ)
      nextQaction = lookup_max_a(Q, s_prime, actions)
      #print "nextQaction {}".format(nextQaction)
      newQ = oldQ + alpha*(reward + gamma*(Q[s_prime][nextQaction]) - oldQ)
      #print "newQ {}".format(newQ)
      Q[s][action] = newQ
      #print "Q[s][a] {}".format(Q[s][a])
      #print "in game {},score {}, throw value {}, oldQ {}, newQ{}".format(g,s,throw.location_to_score(loc),oldQ,newQ)

      s = s_prime
    #print gamethrows
  print "Average turns = ", float(num_iterations)/float(num_games/2)