Example #1
def T(a, s, s_prime):
  total_prob = 0.0
  for w in range(-2, 3):
    wedgefactor = 0.0
    if abs(w)==0: 
      wedgefactor = 0.4
    if abs(w)==1:
      wedgefactor = 0.2
    if abs(w)==2:
      wedgefactor = 0.1
    
    wedge = (a.wedge + w) % throw.NUM_WEDGES
    # loop over the five rings the dart could land on
    for r in range(-2, 3):
      ringfactor = 0.0
      if abs(r)==0: 
        ringfactor = 0.4
      if abs(r)==1:
        ringfactor = 0.2
      if abs(r)==2:
        ringfactor = 0.1

      ring = abs(a.ring + r)
      if throw.location_to_score(throw.location(ring,wedge))==(s-s_prime):
        total_prob += ringfactor * wedgefactor
    
  return total_prob
Example #2
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s

  p_transition = 0.0
  probabilities = [0.4, 0.2, 0.2, 0.1, 0.1]

  # trick to allow wrap around
  wedge_list = throw.wedges*3

  # calculate all 5 wedges you could end up in when aiming for a.wedge
  wedge_index = len(throw.wedges) + throw.wedges.index(a.wedge)
  candidate_wedges = [wedge_list[wedge_index],
                      wedge_list[wedge_index+1], wedge_list[wedge_index-1],
                      wedge_list[wedge_index+2], wedge_list[wedge_index-2]]

  # calculate all 5 regions/rings (some may be the same) you could end up in when aiming for a.ring, with prob array
  if a.ring == throw.CENTER:
    candidate_rings = [a.ring, throw.INNER_RING, throw.INNER_RING, throw.FIRST_PATCH, throw.FIRST_PATCH]
  elif a.ring == throw.INNER_RING:
    candidate_rings = [a.ring, throw.FIRST_PATCH, throw.CENTER, throw.MIDDLE_RING, throw.INNER_RING]
  else:
    candidate_rings = [a.ring, a.ring+1, a.ring-1, a.ring+2, a.ring-2]

  # for each (ring, wedge) pair, calculate point value, and check if it gets you from s to s_prime
  for w in range(len(candidate_wedges)):
    for r in range(len(candidate_rings)):
      # instantiation of location class
      real_location = throw.location(candidate_rings[r],candidate_wedges[w])
      if s - throw.location_to_score(real_location) == s_prime:
        p_transition += probabilities[r]*probabilities[w]

  return p_transition
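A quick way to sanity-check any of these T implementations is a hypothetical helper (not part of the original code) that, for a fixed action and state, sums the transition probability over every candidate next state and confirms the total never exceeds 1; it can fall below 1 when some of the 25 possible outcomes would bust. This assumes the board has been initialized with throw.init_board() and that states are the integers 0..throw.START_SCORE, as the other examples do.

def check_transition_mass(T, a, s):
    # sum T(a, s, s') over every candidate next state 0..s
    total = sum(T(a, s, s_prime) for s_prime in range(0, s + 1))
    assert total <= 1.0 + 1e-9, "transition mass exceeds 1 for this state/action"
    return total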
Example #3
def T(a, s, s_prime):
#CENTER, INNER_RING, FIRST_PATCH, MIDDLE_RING, SECOND_PATCH, OUTER_RING, MISS = range(7)
  delta = s - s_prime
  p = 0.0
  probs = [.1, .2, .4, .2, .1]
  
  throw.init_board()
  
  if delta > 3*throw.NUM_WEDGES or delta < 0:
    return 0
  
  for ri in range(5):
    for wi in range(5):
      wedge_num = throw.wedges[(throw.angles[a.wedge] - 2 + wi) %
                               throw.NUM_WEDGES]
      ring_num = a.ring - 2 + ri
      if ring_num > 6:
        ring_num = 6
      if ring_num < 0:
        ring_num = ring_num*(-1)
      
      points = throw.location_to_score(throw.location(ring_num, wedge_num))
      if points == delta:
        p += probs[ri]*probs[wi]
  return p
Example #4
def R(s,a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    r = s - throw.location_to_score(a)
    if r == 0:
        return 1
    return 0
Example #5
def play(method):
    score = throw.START_SCORE
    turns = 0
    
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
        
    targets = []
    results = []
    while(True):
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            pass  # bust: the throw exceeded the remaining score, so the score is unchanged
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
            
   # print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
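These play() variants each return the number of turns a single game took; a small driver along the following lines (hypothetical, not part of the original harness) could average that over several games:

def average_turns(method, num_games=100):
    # play num_games games with the given method and report the mean turn count
    total_turns = 0
    for _ in range(num_games):
        total_turns += play(method)
    return float(total_turns) / num_games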
Example #6
def Q_learning(gamma, numRounds, alpha):
  states = darts.get_states()
  actions = darts.get_actions()
  currentRound = 0
  Q = {}
  for s in states:
    Q[s] = [0] * len(actions)

  for i in range(numRounds):
    s = throw.START_SCORE
    numiterations = 0
    while s > 0:
      randAction = random.randint(0, len(actions) - 1)
      maxAction = Q[s].index(max(Q[s]))

      #a = ex_strategy_one(Q, randAction, maxAction)
      a = ex_strategy_two(numRounds, currentRound, Q, len(actions), s)
      action = actions[a]

      s_prime = s - throw.location_to_score(action)
      if s_prime < 0:
        s_prime = s
      maxQ = 0.0
      for a_prime in range(len(actions)):
        if Q[s_prime][a_prime] > maxQ:
          maxQ = Q[s_prime][a_prime]
      Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
      s = s_prime
      currentRound += 1
Example #7
def Q_learning(gamma, numRounds, alpha):
  states = darts.get_states()
  actions = darts.get_actions()

  Q = {}
  for s in states:
    Q[s] = [0] * len(actions)

  for i in range(numRounds):
    s = throw.START_SCORE
    numiterations = 0

    while s > 0:
      randAction = random.randint(0, len(actions) - 1)
      maxAction = Q[s].index(max(Q[s]))

      #a = ex_strategy_one(Q, randAction, maxAction)
      a = ex_strategy_two(Q, randAction, maxAction)
      action = actions[a]

      s_prime = s - throw.location_to_score(action)
      if s_prime < 0:
        s_prime = s

      maxQ = 0.0
      for a_prime in range(len(actions)):
        if Q[s_prime][a_prime] > maxQ:
          maxQ = Q[s_prime][a_prime]

      Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
      s = s_prime
Example #8
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s

  #so let's iterate over the possible places on the board we will hit and add up the ones that give the right score reduction

  if(s_prime>s):
    return 0.0

  if(s == 0 and s_prime == 0):
    return 1.0

  regions = {throw.CENTER: 0, throw.INNER_RING: 1, throw.FIRST_PATCH: 2, throw.MIDDLE_RING: 3, throw.SECOND_PATCH: 4, throw.OUTER_RING: 5, throw.MISS: 6}


  actions = darts.get_actions()

  score_diff = s-s_prime

  prob = 0.0

  wedge = throw.angles[a.wedge]
  ring = a.ring
  for wdel in range(-2,3):
    for rdel in range(-2,3):
      wedge_p = throw.wedges[(wdel+wedge) % throw.NUM_WEDGES]
      ring_p = abs(ring+rdel)
      dscore = throw.location_to_score(throw.location(ring_p,wedge_p))
      if(dscore == score_diff):
        prob += 0.4/(2**abs(wdel))*0.4/(2**abs(rdel))
  return prob
Example #9
def R_simple(s, a):
    # takes a state s and action a
    # returns the reward for completing action a in state s
    points = throw.location_to_score(a)
    if points <= s:
        return points
    return 0
Example #10
def play(method):
    score = throw.START_SCORE
    turns = 0

    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()

    targets = []
    results = []
    while (True):
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            pass  # bust: the throw exceeded the remaining score, so the score is unchanged
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)

# print "WOOHOO!  It only took", turns, " turns"
#end_game(turns)
    return turns
Example #11
def R_simple(s,a):
  # takes a state s and action a
  # returns the reward for completing action a in state s
  points = throw.location_to_score(a)
  if points <= s:
    return points
  return 0
Example #12
def get_target(s_):
    global s_old, a_old, num_iterations
    Q_learning(s_old, s_, a_old)

    to_explore = 0
    if darts.strategy == 1:
        to_explore = ex_strategy_one()
    else:
        num_iterations += 1
        to_explore = ex_strategy_two()

    if to_explore:
        a_old = choice(actions)
    else:
        choices = [(value,a) for (a,value) in Q[s_].iteritems()]
        """ If first time at state, shoot for 24 (the max) if score >= max
            Else pick random action that does not exceed score
        """
        if max(choices)[0] == 0:
            if s_ < 24:
                a_old = choice(actions)
                while (throw.location_to_score(a_old) > s_):
                    a_old = choice(actions)
            else:
                a_old = actions[-15]
            return a_old
        # Else pick action with max Q
        a_old = max(choices)[1]
    s_old = s_
    return a_old
Example #13
def T(a, s, s_prime):
    #CENTER, INNER_RING, FIRST_PATCH, MIDDLE_RING, SECOND_PATCH, OUTER_RING, MISS = range(7)
    delta = s - s_prime
    p = 0.0
    probs = [.1, .2, .4, .2, .1]

    throw.init_board()

    if delta > 3 * throw.NUM_WEDGES or delta < 0:
        return 0

    for ri in range(5):
        for wi in range(5):
            wedge_num = throw.wedges[(throw.angles[a.wedge] - 2 + wi) %
                                     throw.NUM_WEDGES]
            ring_num = a.ring - 2 + ri
            if ring_num > 6:
                ring_num = 6
            if ring_num < 0:
                ring_num = ring_num * (-1)

            points = throw.location_to_score(
                throw.location(ring_num, wedge_num))
            if points == delta:
                p += probs[ri] * probs[wi]
    return p
Example #14
def T(a, s, s_prime):
    total_prob = 0.0
    for w in range(-2, 3):
        wedgefactor = 0.0
        if abs(w) == 0:
            wedgefactor = 0.4
        if abs(w) == 1:
            wedgefactor = 0.2
        if abs(w) == 2:
            wedgefactor = 0.1

        wedge = (a.wedge + w) % throw.NUM_WEDGES
        # loop over the five rings the dart could land on
        for r in range(-2, 3):
            ringfactor = 0.0
            if abs(r) == 0:
                ringfactor = 0.4
            if abs(r) == 1:
                ringfactor = 0.2
            if abs(r) == 2:
                ringfactor = 0.1

            ring = abs(a.ring + r)
            if throw.location_to_score(throw.location(ring,
                                                      wedge)) == (s - s_prime):
                total_prob += ringfactor * wedgefactor

    return total_prob
Example #15
def Q_learning(gamma, numRounds, alpha):
  states = darts.get_states()
  actions = darts.get_actions()
  Q = {}
  for s in states:
    Q[s] = [0] * len(actions)

  totaliter = 0
  for i in range(numRounds):
    s = throw.START_SCORE
    numiterations = 0
    while s > 0:
      randAction = random.randint(0, len(actions) - 1)
      maxAction = Q[s].index(max(Q[s]))

      a = ex_strategy_one(numRounds, i, randAction, maxAction)
      #a = ex_strategy_two(numRounds, i, Q, len(actions), s)
      action = actions[a]

      s_prime = s - throw.location_to_score(action)
      if s_prime < 0:
        s_prime = s
      maxQ = 0.0
      for a_prime in range(len(actions)):
        if Q[s_prime][a_prime] > maxQ:
          maxQ = Q[s_prime][a_prime]
      Q[s][a] = Q[s][a] + alpha * (darts.R(s, actions[a]) + (gamma * maxQ) - Q[s][a])
      s = s_prime
      numiterations += 1
    totaliter += numiterations

  print "Average number of throws: " + str(float(totaliter) / numRounds)
Example #16
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s
  if (a, s, s_prime) in T_CACHE:
    return T_CACHE[(a, s, s_prime)]

  def prob(i):
    if i == 0:
      return .4
    if abs(i) == 1:
      return .2
    if abs(i) == 2:
      return .1

  # Useful local variables
  diff = s - s_prime
  wedge_index = throw.wedges.index(a.wedge)

  # Sum the probability of every (ring, wedge) outcome whose score equals diff
  ret = 0.
  # Set ring
  for r in [-2,-1,0,1,2]:
    ring = abs(a.ring+r)
    if ring > 7:
      ring = 7
    # Set wedge
    for w in [-2,-1,0,1,2]:
      wedge = throw.wedges[(wedge_index+w) % len(throw.wedges)]
      # Get score
      score = throw.location_to_score(
        throw.location(ring, wedge))
      if score == diff:
        ret += prob(r) * prob(w)
  T_CACHE[(a,s,s_prime)] = ret
  return ret
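This version memoizes results in a module-level T_CACHE dictionary that the snippet does not show. All it needs is an empty dict defined before the first call, assuming the (action, s, s_prime) key tuple is hashable:

T_CACHE = {}  # maps (action, s, s_prime) tuples to previously computed probabilities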
Example #17
def R(s,a):
  # takes a state s and action a
  # returns the reward for completing action a in state s
  points = throw.location_to_score(a)
  if points > s: 
    return BAD_THROW_PENALTY
  else:
    return points
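BAD_THROW_PENALTY is a module-level constant that is not shown; it is presumably a non-positive number that discourages aiming for more points than remain, for example:

BAD_THROW_PENALTY = -1  # hypothetical value; any non-positive penalty fits the usage above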
Example #18
def R(s,a):
  # takes a state s and action a
  # returns the reward for completing action a in state s
  # utility function
  points = throw.location_to_score(a)
  if points <= s:
    return points
  else:
    return 0
Example #19
def R(s,a):
  # takes a state s and action a
  # returns the reward for completing action a in state s
  if s == 0:
    return 0.
  points = throw.location_to_score(a)
  if points > s:
    return -1
  return points-1.
Example #20
def R(s,a):
  # takes a state s and action a
  # returns the reward for completing action a in state s
  if(s == 0):
    return 10.0
  penalty = 0
  if(throw.location_to_score(a)>s):
    penalty = -1
  return penalty  # alternative: -(throw.START_SCORE + 1 - s) + penalty
Example #21
 def test_T(self):
     def act(r, w):
         return throw.location(r, w)

     self.assertEqual(mdp.T(act(throw.CENTER, 1), 100, 110), 0.0)
     self.assertEqual(mdp.T(act(throw.CENTER, 1), 100, 80), mdp.T(act(throw.CENTER, 1), 90, 70))
     bullseye = throw.location_to_score(throw.location(throw.CENTER, 1))
     self.assertEqual(mdp.T(act(throw.FIRST_PATCH, 1), 100, 100 - bullseye), 0.1)
     self.assertAlmostEqual(mdp.T(act(throw.INNER_RING, 1), 100, 95), 0.5)
Example #22
 def test_T(self):
     def act(r, w):
         return throw.location(r, w)

     self.assertAlmostEqual(mdp.T(act(throw.CENTER, 1), 100, 110), 0.0)
     self.assertAlmostEqual(mdp.T(act(throw.CENTER, 1), 100, 80), mdp.T(act(throw.CENTER, 1), 90, 70))
     bullseye = throw.location_to_score(throw.location(throw.CENTER, 1))
     self.assertAlmostEqual(mdp.T(act(throw.FIRST_PATCH, 1), 100, 100 - bullseye), 0.1)
     self.assertAlmostEqual(mdp.T(act(throw.INNER_RING, 1), 100, 95), 0.5)
Example #23
def T(a, s, s_prime):
    global T_cached

    if (a, s, s_prime) in T_cached:
        return T_cached[(a, s, s_prime)]

    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    target = s - s_prime
    target_locations = []
    p = 0.0

    # find all wedge/ring combos that would lead to s -> s' transition
    for i in range(-2, 3):
        current_wedge = get_adj_wedge(a.wedge, i)

        # iterate through all possible rings
        for j in range(-2, 3):
            ring = a.ring + j

            # off dart board
            if ring >= throw.MISS:
                continue

            # allow for ring "wrap around", e.g. the ring inside and outside the center
            # ring is the inner ring
            if ring < 0:
                ring = abs(ring)

            new_location = throw.location(ring, current_wedge)

            # hitting target would go from s -> s'!
            if target == throw.location_to_score(new_location):
                # calculate probability of hitting target
                if i == 0:
                    w_p = 0.4
                elif abs(i) == 1:
                    w_p = 0.2
                elif abs(i) == 2:
                    w_p = 0.1
                else:
                    assert False, "Impossible wedge"

                if j == 0:
                    r_p = 0.4
                elif abs(j) == 1:
                    r_p = 0.2
                elif abs(j) == 2:
                    r_p = 0.1
                else:
                    assert False, "Impossible ring"

                p += (w_p * r_p)

    T_cached[(a, s, s_prime)] = p
    return p
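get_adj_wedge is an external helper that this snippet does not include. Based on how the other examples step through neighboring wedges, a plausible sketch is:

def get_adj_wedge(wedge, offset):
    # wedge value that sits `offset` positions away from `wedge` on the board
    idx = throw.wedges.index(wedge)
    return throw.wedges[(idx + offset) % len(throw.wedges)]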
Example #24
def T(a, s, s_prime):
    global T_cached

    if (a, s, s_prime) in T_cached:
        return T_cached[(a, s, s_prime)]

    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    target = s - s_prime
    target_locations = []
    p = 0.0

    # find all wedge/ring combos that would lead to s -> s' transition
    for i in range(-2, 3):
        current_wedge = get_adj_wedge(a.wedge, i)

        # iterate through all possible rings
        for j in range(-2, 3):
            ring = a.ring + j

            # off dart board
            if ring >= throw.MISS:
                continue

            # allow for ring "wrap around", e.g. the ring inside and outside the center
            # ring is the inner ring
            if ring < 0:
                ring = abs(ring)

            new_location = throw.location(ring, current_wedge)

            # hitting target would go from s -> s'!
            if target == throw.location_to_score(new_location):
                # calculate probability of hitting target
                if i == 0:
                    w_p = 0.4
                elif abs(i) == 1:
                    w_p = 0.2
                elif abs(i) == 2:
                    w_p = 0.1
                else:
                    assert False, "Impossible wedge"

                if j == 0:
                    r_p = 0.4
                elif abs(j) == 1:
                    r_p = 0.2
                elif abs(j) == 2:
                    r_p = 0.1
                else:
                    assert False, "Impossible ring"

                p += w_p * r_p

    T_cached[(a, s, s_prime)] = p
    return p
Example #25
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    # figure out where would give you that many points
    # figure out the probability of landing there
    prob = 0
    points = s - s_prime
    if points < 0:
        return 0
    #print points
    
    # Loop through to define transition function
    for i in range(-2, 3):
        wedge_curr = (throw.wedges.index(a.wedge) + i)
        # Mod by number of wedges to wrap around if needed
        if wedge_curr >= throw.NUM_WEDGES:
            wedge_curr = wedge_curr%throw.NUM_WEDGES
        prob_wedge = 0.4/(pow(2, abs(i)))
        
        for j in range(-2, 3):
            ring_curr = (a.ring + j)
            if ring_curr < 0:
                ring_curr = ring_curr % 7
            prob_ring = 0.4/(pow(2, abs(j)))
            
            '''if (a.ring == 0 and j < 0):
                ring_curr = 7 - ring_curr
            if (a.ring == 1 and j < -1):
                ring_curr = 7 - ring_curr'''
                
            if a.ring == 0:
                ring_curr = 7 - ring_curr
                if ring_curr == 0:
                    prob_ring = 0.4
                if ring_curr == 1:
                    prob_ring = 0.4
                if ring_curr == 2:
                    prob_ring = 0.2
            if a.ring == 1:
                ring_curr = 7 - ring_curr
                if ring_curr == 0:
                    prob_ring = 0.2
                if ring_curr == 1:
                    prob_ring = 0.5
                if ring_curr == 2:
                    prob_ring = 0.2
                if ring_curr == 3:
                    prob_ring = 0.1
            
            #print a.wedge, a.ring, j, i
            if(throw.location_to_score(throw.location(ring_curr, wedge_curr)) == points):
                prob += prob_wedge*prob_ring
                #print a.ring, j, i
    return prob
Example #26
def play(method, d=None):
    score = throw.START_SCORE
    turns = 0

    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()

    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if d:
            if d[score] == None:
                d[score] = throw.location_to_score(target)
            else:
                assert d[score] == throw.location_to_score(target)
        # print "Target: wedge", target.wedge,", ring", target.ring
        # print "Result: wedge", result.wedge,", ring", result.ring
        # print "Raw Score:", raw_score
        # print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        # else:
        #     print
        #     print "TOO HIGH!"
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    # print "WOOHOO!  It only took", turns, " turns"
    # end_game(turns)
    return turns
Example #27
def EPoints(a, s):
  probs = [0.4, 0.2, 0.1, 0.1, 0.2]  # indexed by offset; negative offsets wrap to the end of the list
  total = 0.

  for r_off in [-2, -1, 0, 1, 2]:
    for w_off in [-2, -1, 0, 1, 2]:
      r2 = min(throw.MISS, abs(a.ring + r_off))
      w2 = throw.wedges[(throw.wedges.index(a.wedge) + w_off) % len(throw.wedges)]
      score = throw.location_to_score(throw.location(r2, w2))
      if score > s:
        score = 0.
      total += probs[r_off] * probs[w_off] * score

  return total
Example #28
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    probabilities = [0 for i in range(throw.START_SCORE + 1)]

    for i in range(-2, 3):
        index = throw.wedges.index(a.wedge)+i
        if index >= throw.NUM_WEDGES:
            index = index % throw.NUM_WEDGES
        new_wedge = throw.wedges[index]

        prob_wedge = .4 / (pow(2,abs(i)))

        for j in range(-2, 3):
            prob_ring = .4 / (pow(2,abs(j)))
            if a.ring == 0:
                if j == 0:
                    prob_ring = .4
                if j == 1 or j == -1:
                    prob_ring = .4
                if j == 2 or j == -2:
                    prob_ring = .2
            elif a.ring == 1:
                if j == 0 or j == -2: 
                    prob_ring = .5
                if j == -1: 
                    prob_ring = .2
                if j == 1:
                    prob_ring = .2
                if j == 2:
                    prob_ring = .1

            new_ring = a.ring + j
            if new_ring < 0:
                new_ring = new_ring % 7

            loc = throw.location(new_ring, new_wedge)
            score = int(throw.location_to_score(loc))

            new_score = s - score
            if new_score < 0:
                continue  # this outcome busts, so it cannot lead to s_prime

            prob = prob_wedge * prob_ring
            probabilities[new_score] = probabilities[new_score] + prob

    return probabilities[s_prime]
Example #29
def modelfree(alpha, gamma, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_iterations = 0
    Q = [[]] * len(states)

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        Q[s] = [0] * len(actions)

    # play num_games games
    for g in range(1, num_games + 1):
        #print str(g) + "/" + str(num_games)

        # run a single game
        s = throw.START_SCORE
        while s > 0:

            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            a = ex_strategy_one(num_iterations, actions, pi_star, s)
            #a = ex_strategy_two(num_iterations, Q, actions, s, pi_star)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = int(s - throw.location_to_score(loc))
            if s_prime < 0:
                s_prime = s
            
            max_Q = max(Q[s_prime])
            Q[s][a] += alpha * (darts.R(s, actions[a]) + gamma * max(Q[s_prime]) - Q[s][a])
            pi_star[s] = Q[s].index(max(Q[s]))

            # Next state becomes current state
            s = s_prime

    print "Average turns = ", float(num_iterations)/float(num_games)
Example #30
def play(method):
    global actions
    actions = get_actions()

    score = throw.START_SCORE
    turns = 0
    
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
        
    targets = []
    results = []
    while(True):
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        print "Target: wedge", target.wedge,", ring", target.ring
        print "Result: wedge", result.wedge,", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        prior = score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        modelfree.q_learning(prior, score, get_index(actions, target))                        
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
            
    print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
Example #31
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s
  possible_rings = []
  ring_prob = []
  if (a.ring == throw.CENTER):
    possible_rings = [throw.CENTER,throw.INNER_RING,throw.FIRST_PATCH]
    ring_prob = [PROBRING,2*PROBR1,2*PROBR2]
  elif (a.ring == throw.INNER_RING):
    possible_rings = [throw.CENTER,throw.INNER_RING,throw.FIRST_PATCH,throw.MIDDLE_RING]
    ring_prob = [PROBR1,PROBRING+PROBR2,PROBR1,PROBR2]  # the -2 offset reflects back onto the inner ring
  elif (a.ring == throw.FIRST_PATCH):
    possible_rings = [throw.CENTER,throw.INNER_RING,throw.FIRST_PATCH,throw.MIDDLE_RING,throw.SECOND_PATCH]
    ring_prob = [PROBR2,PROBR1,PROBRING,PROBR1,PROBR2]
  elif (a.ring == throw.MIDDLE_RING):
    possible_rings = [throw.INNER_RING,throw.FIRST_PATCH,throw.MIDDLE_RING,throw.SECOND_PATCH,throw.OUTER_RING]
    ring_prob = [PROBR2,PROBR1,PROBRING,PROBR1,PROBR2]
  elif (a.ring == throw.SECOND_PATCH):
    possible_rings = [throw.FIRST_PATCH,throw.MIDDLE_RING,throw.SECOND_PATCH,throw.OUTER_RING,throw.MISS]
    ring_prob = [PROBR2,PROBR1,PROBRING,PROBR1,PROBR2]
  elif (a.ring == throw.OUTER_RING):
    possible_rings = [throw.MIDDLE_RING,throw.SECOND_PATCH,throw.OUTER_RING,throw.MISS]
    ring_prob = [PROBR2,PROBR1,PROBRING,PROBR1+PROBR2]
  elif (a.ring == throw.MISS):
    possible_rings = [throw.SECOND_PATCH,throw.OUTER_RING,throw.MISS]
    ring_prob = [PROBR2,PROBR1,PROBRING+PROBR1+PROBR2]

  w_index = throw.wedges.index(a.wedge)
  possible_wedges = [a.wedge,
                     throw.wedges[(w_index+1)%throw.NUM_WEDGES],
                     throw.wedges[(w_index-1)%throw.NUM_WEDGES],
                     throw.wedges[(w_index+2)%throw.NUM_WEDGES],
                     throw.wedges[(w_index-2)%throw.NUM_WEDGES]]
  wedge_prob = [PROBWEDGE,PROBW1,PROBW1,PROBW2,PROBW2]

  final_prob = 0

  for i in range(len(possible_rings)):
    for j in range(len(possible_wedges)):
      myloc = throw.location(possible_rings[i],possible_wedges[j])
      if (s - (throw.location_to_score(myloc))) == s_prime:
          final_prob = final_prob + (ring_prob[i]*wedge_prob[j])
  return final_prob
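The PROB* constants used above are defined elsewhere in the module. Given the 0.4 / 0.2 / 0.1 accuracy spread used throughout the other T implementations, they are presumably along these lines:

PROBRING = 0.4   # probability of hitting the intended ring
PROBR1 = 0.2     # ring one step off target
PROBR2 = 0.1     # ring two steps off target
PROBWEDGE = 0.4  # probability of hitting the intended wedge
PROBW1 = 0.2     # wedge one step off target
PROBW2 = 0.1     # wedge two steps off target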
Example #32
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s
  if s_prime > s:
    return 0.

  probs = [0.4, 0.2, 0.1, 0.1, 0.2]  # indexed by offset; negative offsets wrap to the end of the list
  total = 0.

  for r_off in [-2, -1, 0, 1, 2]:
    for w_off in [-2, -1, 0, 1, 2]:
      r2 = min(throw.MISS, abs(a.ring + r_off))
      w2 = throw.wedges[(throw.wedges.index(a.wedge) + w_off) % len(throw.wedges)]
      score = throw.location_to_score(throw.location(r2, w2))
      if score > s:
        score = 0.  # a bust leaves the score unchanged, so treat it as scoring zero
      if score == s - s_prime:
        total += probs[r_off] * probs[w_off]

  return total
Example #33
def play(method):
    score = throw.START_SCORE
    turns = 0
    
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
        
    targets = []
    results = []
    while(True):
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        #if raw_score > score:
            # update Q[s][a]
        #else:
            #modelfree.Q_learning(score,target,raw_score)
        print "Target: wedge", target.wedge,", ring", target.ring
        print "Result: wedge", result.wedge,", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
            
    print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
Example #34
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s
  
  probability = 0.0

  # -2 -1 0 1 2
  for w in range(-2, 3):
    # hit the wedge (0)
    if abs(w) == 0:
      p_wedge = 0.4
    # hit region outside the wedge (-1 or 1)
    elif abs(w) == 1:
      p_wedge = 0.2
    # hit region outside of that (-2 or 2)
    else:
      p_wedge = 0.1

    # get the wedge and do % to loop around in case of going around circle
    wedge = (a.wedge + w) % throw.NUM_WEDGES

    # same thing, but now for the ring
    for r in range(-2, 3):
      # hit the ring
      if abs(r) == 0:
        p_ring = 0.4
      # hit region outside the ring
      elif abs(r) == 1:
        p_ring = 0.2
      # hit region outside of that
      else:
        p_ring = 0.1

      # reflect negative ring offsets back onto the board with abs()
      ring = abs(a.ring + r)

      score = throw.location_to_score(throw.location(ring, wedge))
      if score == s - s_prime:
        probability += p_wedge * p_ring

  return probability
Example #35
def T(a, s, s_prime):
    # takes an action a, current state s, and next state s_prime
    # returns the probability of transitioning to s_prime when taking action a in state s
    aRing = a.ring
    aWedge = a.wedge
    target = s - s_prime

    probs = [0.4, 0.2, 0.1]

    probability = 0
    for i in range(-2, 3):
        w = (throw.wedges.index(a.wedge) + i) % len(throw.wedges)
        wedge = throw.wedges[w]
        for j in range(-2, 3):
            ring = min(abs(aRing + j), 6)
            loc = throw.location(ring, wedge)
            score = throw.location_to_score(loc)
            if target == score:
                probability += probs[abs(i)] * probs[abs(j)]

    return probability
Example #36
def T(a, s, s_prime):
  # takes an action a, current state s, and next state s_prime
  # returns the probability of transitioning to s_prime when taking action a in state s
  aRing = a.ring
  aWedge = a.wedge
  target = s - s_prime

  probs = [0.4, 0.2, 0.1]

  probability = 0
  for i in range(-2, 3):
    w = (throw.wedges.index(a.wedge) + i) % len(throw.wedges)
    wedge = throw.wedges[w]
    for j in range(-2, 3):
      ring = min(abs(aRing + j), 6)
      loc = throw.location(ring, wedge)
      score = throw.location_to_score(loc)
      if target == score:
        probability += probs[abs(i)] * probs[abs(j)]

  return probability
Example #37
def play(method):
    score = throw.START_SCORE
    turns = 0

    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()

    targets = []
    results = []
    while (True):
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        print "Target: wedge", target.wedge, ", ring", target.ring
        print "Result: wedge", result.wedge, ", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)

    print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
Example #38
def modelbased(gamma, epoch_size, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()
    
    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0
    Q = {}
    
    
    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        Q[s] = {}
        
        for a in range(len(actions)):
            num_actions[s][a] = 0
            Q[s][a] = 0
            
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0


    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
    
     # run a single game
        s = throw.START_SCORE
        while s > 0:

            num_iterations += 1
    
            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            #to_explore = ex_strategy_one(s, num_iterations)
            # Second strategy
            to_explore = 2
            newindex, newaction = ex_strategy_two(s, num_iterations, Q, actions)
    
            if to_explore == 2:
                a = newindex
                action = newaction
            elif to_explore:
             # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
            else:
             # exploit
                a = pi_star[s]
                action = actions[a]
            
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s
                
            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

            if num_iterations % epoch_size == 0:

                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q = modelbased_value_iteration(gamma, T_matrix, pi_star)
    
    print "Average turns = ", float(num_iterations)/float(num_games)
Example #39
def modelbased(gamma, epoch_size, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0

    # initialize v
    V = {}
    V[0] = {}
    V[1] = {}
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        iterations_this_game = 0
        Q = {}

        # run a single game
        s = throw.START_SCORE
        while s > 0:
            iterations_this_game += 1
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            a = ex_strategy_one(actions, pi_star, s, iterations_this_game)
            # a = ex_strategy_two(actions, Q, s, iterations_this_game)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

            if num_iterations % epoch_size == 0:

                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(
                                    num_transitions[i][j][k]) / float(
                                        num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q, V = modelbased_value_iteration(
                    gamma, T_matrix, pi_star, actions, states, V)

    avg_turns = float(num_iterations) / float(num_games)
    print "Average turns = ", avg_turns
    return avg_turns
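modelbased_value_iteration is referenced but not shown. A sketch that is signature-compatible with the call above, performing a single Bellman backup over the learned transition matrix (the original presumably sweeps until the values converge), could look like the following; it assumes rewards come from darts.R and that V[0] holds the current value estimates while V[1] receives the updated ones:

def modelbased_value_iteration(gamma, T_matrix, pi_star, actions, states, V):
    # one backup of infinite-horizon value iteration over the learned model
    Q = {}
    for s in states:
        Q[s] = {}
        for a in range(len(actions)):
            Q[s][a] = darts.R(s, actions[a])
            for s_prime in states:
                Q[s][a] += gamma * T_matrix[s][s_prime][a] * V[0][s_prime]
        best_a = max(Q[s], key=lambda x: Q[s][x])
        pi_star[s] = best_a
        V[1][s] = Q[s][best_a]
    # the updated values become the current estimates for the next epoch
    for s in states:
        V[0][s] = V[1][s]
    return T_matrix, pi_star, Q, V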
Example #40
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0
    
    
    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        
        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0


    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
    
        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:

            num_iterations += 1
            throws += 1
                
            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
                        
            if(strategy_idx==1):
                to_explore = ex_strategy_one(s,g)
            else:
                to_explore = ex_strategy_two(s,g)
                
            if to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1
    
            
            #print "a", a, "action",action
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action) 
            delta =  throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

                
            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            this_lr = 1.0 / num_actions[s][a]
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state 
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws, 
            # using infinite-horizon value iteration. 
                
        #print "Game",g,"took",throws,"throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g,throws,"%1.4f" % (float(explores)/(explores+exploits))
    avg = float(num_iterations)/float(num_games)
    return avg
Example #41
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):

        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:

            num_iterations += 1
            throws += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            if (strategy_idx == 1):
                to_explore = ex_strategy_one(s, g)
            else:
                to_explore = ex_strategy_two(s, g)

            if to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1

            #print "a", a, "action",action
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            delta = throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            this_lr = 1.0 / num_actions[s][a]
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

        #print "Game",g,"took",throws,"throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g, throws, "%1.4f" % (float(explores) / (explores + exploits))
    avg = float(num_iterations) / float(num_games)
    return avg
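bestAction and newQ are helpers that this snippet does not include. Hedged sketches consistent with how they are called above, and assuming the reward comes from darts.R over darts.get_actions(), might be:

def bestAction(Q, s):
    # index of the action with the highest Q-value in state s
    return max(Q[s], key=lambda a: Q[s][a])

def newQ(Q, s, a, s_prime, gamma, learning_rate):
    # one-step Q-learning update toward the sampled Bellman target
    reward = darts.R(s, darts.get_actions()[a])
    max_next = max(Q[s_prime].values())
    return Q[s][a] + learning_rate * (reward + gamma * max_next - Q[s][a])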