Example No. 1
def newThrow(id, player, totPlay):
    if id == -1:
        # print(throw(0, 1, ran(), ran(), ran(), ran(), ran(), ran() ))
        return throw(0, 1, ran(), ran(), ran(), ran(), ran(), ran() )
    else:
        # print(throw(id+1, ((player+1)%totPlay)+1 , ran(), ran(), ran(), ran(), ran(), ran() ))
        return throw(id+1, (player+1)%totPlay , ran(), ran(), ran(), ran(), ran(), ran() )
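The example above relies on a ran() helper that is not shown; presumably it draws a random throw parameter. A minimal placeholder sketch, assuming a uniform draw in [0, 1) (the real range is not known from this example):

import random

def ran():
    # Hypothetical helper assumed by the example above:
    # a uniform random value in [0, 1) for each throw parameter.
    return random.random()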
Example No. 2
def play(method):
    score = throw.START_SCORE
    turns = 0

    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()

    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            cc = 1  # bust: throw exceeds the remaining score, so the score is unchanged
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)

# print "WOOHOO!  It only took", turns, " turns"
#end_game(turns)
    return turns
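Since play() returns the number of turns a single game took, a caller can estimate a strategy's average performance by repeating it. A minimal usage sketch (the helper name and game count are illustrative, not part of the original code):

def average_turns(method, num_games=100):
    # Play full games repeatedly and average the turn counts.
    total = 0
    for _ in range(num_games):
        total += play(method)
    return float(total) / num_games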
Example No. 3
def play(method):
    score = throw.START_SCORE
    turns = 0
    
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
        
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            cc = 1  # bust: throw exceeds the remaining score, so the score is unchanged
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
            
   # print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
Example No. 4
def modelfree(alpha, gamma, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_iterations = 0
    Q = [[]] * len(states)

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        Q[s] = [0] * len(actions)

    # play num_games games
    for g in range(1, num_games + 1):
        #print str(g) + "/" + str(num_games)

        # run a single game
        s = throw.START_SCORE
        while s > 0:

            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            a = ex_strategy_one(num_iterations, actions, pi_star, s)
            #a = ex_strategy_two(num_iterations, Q, actions, s, pi_star)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = int(s - throw.location_to_score(loc))
            if s_prime < 0:
                s_prime = s
            
            # One-step Q-learning update toward the best Q value in the next state
            max_Q = max(Q[s_prime])
            Q[s][a] += alpha * (darts.R(s, actions[a]) + gamma * max_Q - Q[s][a])
            pi_star[s] = Q[s].index(max(Q[s]))

            # Next state becomes current state
            s = s_prime

    print "Average turns = ", float(num_iterations)/float(num_games)
Example No. 5
def play(method):
    global actions
    actions = get_actions()

    score = throw.START_SCORE
    turns = 0
    
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
        
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        print "Target: wedge", target.wedge,", ring", target.ring
        print "Result: wedge", result.wedge,", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        prior = score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        modelfree.q_learning(prior, score, get_index(actions, target))                        
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
            
    print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
Example No. 6
def play(method, d=None):
    score = throw.START_SCORE
    turns = 0

    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()

    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        if d:
            # Record the target chosen for this score and check that the policy
            # is deterministic (the same target is always chosen for a given score).
            if d[score] is None:
                d[score] = throw.location_to_score(target)
            else:
                assert d[score] == throw.location_to_score(target)
        # print "Target: wedge", target.wedge,", ring", target.ring
        # print "Result: wedge", result.wedge,", ring", result.ring
        # print "Raw Score:", raw_score
        # print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        # else:
        #     print
        #     print "TOO HIGH!"
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
    # print "WOOHOO!  It only took", turns, " turns"
    # end_game(turns)
    return turns
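The optional d argument above is only exercised if the caller supplies it. A minimal usage sketch (hypothetical caller, not part of the original code), with one None-initialized slot per possible remaining score so the assert can verify the policy is deterministic across games:

d = dict((s, None) for s in range(1, throw.START_SCORE + 1))
for _ in range(50):
    play("mdp", d)   # any disagreement across games trips the assert above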
Example No. 7
def play(method):
    score = throw.START_SCORE
    turns = 0
    
    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()
        
    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        #if raw_score > score:
            # update Q[s][a]
        #else:
            #modelfree.Q_learning(score,target,raw_score)
        print "Target: wedge", target.wedge,", ring", target.ring
        print "Result: wedge", result.wedge,", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)
            
    print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
Example No. 8
def play(method):
    score = throw.START_SCORE
    turns = 0

    if method == "mdp":
        target = mdp.start_game(GAMMA)
    else:
        target = modelfree.start_game()

    targets = []
    results = []
    while True:
        turns = turns + 1
        result = throw.throw(target)
        targets.append(target)
        results.append(result)
        raw_score = throw.location_to_score(result)
        print "Target: wedge", target.wedge, ", ring", target.ring
        print "Result: wedge", result.wedge, ", ring", result.ring
        print "Raw Score:", raw_score
        print "Score:", score
        if raw_score <= score:
            score = int(score - raw_score)
        else:
            print
            print "TOO HIGH!"
        if score == 0:
            break

        if method == "mdp":
            target = mdp.get_target(score)
        else:
            target = modelfree.get_target(score)

    print "WOOHOO!  It only took", turns, " turns"
    #end_game(turns)
    return turns
Example No. 9
def modelbased(gamma, epoch_size, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()
    
    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0
    Q = {}
    
    
    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        Q[s] = {}
        
        for a in range(len(actions)):
            num_actions[s][a] = 0
            Q[s][a] = 0
            
        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0


    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
    
        # run a single game
        s = throw.START_SCORE
        while s > 0:

            num_iterations += 1
    
            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            #to_explore = ex_strategy_one(s, num_iterations)
            # Second strategy
            to_explore = 2
            newindex, newaction = ex_strategy_two(s, num_iterations, Q, actions)
    
            if to_explore == 2:
                a = newindex
                action = newaction
            elif to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
            else:
                # exploit
                a = pi_star[s]
                action = actions[a]
            
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s
                
            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

            if num_iterations % epoch_size == 0:

                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(num_transitions[i][j][k]) / float(num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q = modelbased_value_iteration(gamma, T_matrix, pi_star)
    
    print "Average turns = ", float(num_iterations)/float(num_games)
Example No. 10
def modelbased(gamma, epoch_size, num_games):

    # store all actions (targets on dartboard) in actions array
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    num_iterations = 0

    # initialize v
    V = {}
    V[0] = {}
    V[1] = {}
    for s in states:
        V[0][s] = 0
        V[1][s] = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
        iterations_this_game = 0
        Q = {}

        # run a single game
        s = throw.START_SCORE
        while s > 0:
            iterations_this_game += 1
            num_iterations += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            a = ex_strategy_one(actions, pi_star, s, iterations_this_game)
            # a = ex_strategy_two(actions, Q, s, iterations_this_game)
            action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = s - throw.location_to_score(loc)
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

            if num_iterations % epoch_size == 0:

                # Update transition probabilities
                for i in states:
                    for j in states:
                        for k in range(len(actions)):
                            if num_actions[i][k] != 0:
                                T_matrix[i][j][k] = float(
                                    num_transitions[i][j][k]) / float(
                                        num_actions[i][k])

                # Update strategy (stored in pi) based on newly updated reward function and transition
                # probabilities
                T_matrix, pi_star, Q, V = modelbased_value_iteration(
                    gamma, T_matrix, pi_star, actions, states, V)

    avg_turns = float(num_iterations) / float(num_games)
    print "Average turns = ", avg_turns
    return avg_turns
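The modelbased_value_iteration routine called above is not included in this listing. Under the assumptions that V[0] and V[1] hold the current and next value functions and that darts.R(s, action) is the per-throw reward as in the other examples, one sweep of value iteration over the learned model might look like the following sketch (not the original implementation):

def modelbased_value_iteration(gamma, T_matrix, pi_star, actions, states, V):
    # One sweep of value iteration over the learned transition model:
    # Q(s,a) = R(s,a) + gamma * sum over s' of T(s'|s,a) * V(s'),
    # then make the policy and value function greedy with respect to Q.
    Q = {}
    for s in states:
        Q[s] = {}
        for a in range(len(actions)):
            expected_future = 0.0
            for s_prime in states:
                expected_future += T_matrix[s][s_prime][a] * V[0][s_prime]
            Q[s][a] = darts.R(s, actions[a]) + gamma * expected_future
        best_a = max(Q[s], key=lambda act: Q[s][act])
        pi_star[s] = best_a
        V[1][s] = Q[s][best_a]
    # Promote the freshly computed values for the next sweep.
    for s in states:
        V[0][s] = V[1][s]
    return T_matrix, pi_star, Q, V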
Example No. 11
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0
    
    
    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions)-1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}
        
        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0


    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):
    
        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:

            num_iterations += 1
            throws += 1
                
            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.
                        
            if strategy_idx == 1:
                to_explore = ex_strategy_one(s, g)
            else:
                to_explore = ex_strategy_two(s, g)
                
            if to_explore:
                # explore
                a = random.randint(0, len(actions)-1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1
    
            
            #print "a", a, "action",action
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action) 
            delta =  throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

                
            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Decaying learning rate 1 / N(s, a); float division avoids truncating
            # to zero after the first visit under Python 2.
            this_lr = 1.0 / num_actions[s][a]
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state 
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws, 
            # using infinite-horizon value iteration. 
                
        #print "Game",g,"took",throws,"throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g,throws,"%1.4f" % (float(explores)/(explores+exploits))
    avg = float(num_iterations)/float(num_games)
    return avg
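The newQ helper used above is not shown. A one-step Q-learning update consistent with how it is called, assuming the reward function darts.R(s, action) used in the other examples, might look like this sketch (not the original implementation):

def newQ(Q, s, a, s_prime, gamma, learning_rate):
    # One-step Q-learning update:
    # Q(s,a) <- Q(s,a) + lr * (R(s,a) + gamma * max over a' of Q(s',a') - Q(s,a))
    reward = darts.R(s, darts.get_actions()[a])   # same reward signature as the other examples
    best_next = max(Q[s_prime].values())
    return Q[s][a] + learning_rate * (reward + gamma * best_next - Q[s][a])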
Example No. 12
            # (fragment: the enclosing Q-learning function definition was not captured in this example)

            #         #to_explore = ex_strategy_two(num_iterations)

            #         if to_explore:
            #             # explore
            #             a = random.randint(0, len(actions)-1)
            #             action = actions[a]
            #         else:
            #             # exploit
            #             a = pi_star[s]
            #             action = actions[a]

            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            s_prime = int(s - throw.location_to_score(loc))
            if s_prime < 0:
                s_prime = s

            # Greedy action in the next state
            a_prime = q_values[s_prime].index(max(q_values[s_prime]))
            action_prime = actions[a_prime]
            # Update q value for the action we just performed
            q_values[s][a] = q_values[s][a] + learning_rate * (darts.R(s, actions[a]) + gamma * q_values[s_prime][a_prime] - q_values[s][a])

            # Next state becomes current state
            s = s_prime

    return
Example No. 13
def modelfree(gamma, learning_rate, num_games, strategy_idx):
    actions = darts.get_actions()
    states = darts.get_states()

    pi_star = {}
    g = 0
    num_actions = {}
    num_transitions = {}
    T_matrix = {}
    Q = {}
    num_iterations = 0

    # Initialize all arrays to 0 except the policy, which should be assigned a random action for each state.
    for s in states:
        pi_star[s] = random.randint(0, len(actions) - 1)
        num_actions[s] = {}
        Q[s] = {}
        num_transitions[s] = {}
        T_matrix[s] = {}

        for a in range(len(actions)):
            Q[s][a] = 1.0
            num_actions[s][a] = 0

        for s_prime in states:
            num_transitions[s][s_prime] = {}
            T_matrix[s][s_prime] = {}
            for a in range(len(actions)):
                num_transitions[s][s_prime][a] = 0
                T_matrix[s][s_prime][a] = 0

    # play num_games games, updating policy after every EPOCH_SIZE number of throws
    for g in range(1, num_games + 1):

        # run a single game
        s = throw.START_SCORE
        throws = 0
        explores = 0
        exploits = 0
        while s > 0:

            num_iterations += 1
            throws += 1

            # The following two statements implement two exploration-exploitation
            # strategies. Comment out the strategy that you wish not to use.

            if (strategy_idx == 1):
                to_explore = ex_strategy_one(s, g)
            else:
                to_explore = ex_strategy_two(s, g)

            if to_explore:
                # explore
                a = random.randint(0, len(actions) - 1)
                action = actions[a]
                explores += 1
            else:
                # exploit
                a = bestAction(Q, s)
                action = actions[a]
                exploits += 1

            #print "a", a, "action",action
            # Get result of throw from dart thrower; update score if necessary
            loc = throw.throw(action)
            delta = throw.location_to_score(loc)
            s_prime = s - delta
            if s_prime < 0:
                s_prime = s

            # Update experience:
            # increment number of times this action was taken in this state;
            # increment number of times we moved from this state to next state on this action.

            num_actions[s][a] += 1
            num_transitions[s][s_prime][a] += 1

            # Decaying learning rate 1 / N(s, a); float division avoids truncating
            # to zero after the first visit under Python 2.
            this_lr = 1.0 / num_actions[s][a]
            Q[s][a] = newQ(Q, s, a, s_prime, gamma, this_lr)

            # Next state becomes current state
            s = s_prime

            # Update our learned MDP and optimal policy after every EPOCH_SIZE throws,
            # using infinite-horizon value iteration.

        #print "Game",g,"took",throws,"throws (explore ratio %1.4f)" % (float(explores)/(explores+exploits))
        print g, throws, "%1.4f" % (float(explores) / (explores + exploits))
    avg = float(num_iterations) / float(num_games)
    return avg
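The bestAction helper used for the exploit branch above is not shown. Given how it is called, it presumably returns the index of the highest-valued action in the current state; a minimal sketch (hypothetical, not the original implementation):

def bestAction(Q, s):
    # Index of the action with the highest Q value in state s
    # (ties broken by the lowest index).
    best = 0
    for a in Q[s]:
        if Q[s][a] > Q[s][best]:
            best = a
    return best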
Example No. 14
def Q_learning(gamma, alpha, num_games):
  
  # set these to values that make sense!
  #alpha = .5
  #gamma = .3

  Q = {}
  states = darts.get_states()
  actions = darts.get_actions()
  
  num_iterations = 0
  
  num_total_iterations = 1
  # Initialize all the Q values to zero
  for s in states:
    Q[s]= {}
    for a in actions:
        Q[s][a] = 0
   
  for g in range(1, num_games + 1):
    #print "Average turns = ", float(num_iterations)/float(g)
    #print "GAME {}".format(g)
    # run a single game
    s = throw.START_SCORE
    gamethrows = 0
    while s > 0:
      num_total_iterations += 1  
      gamethrows += 1
      # The following two statements implement two exploration-exploitation
      # strategies. Comment out the strategy that you wish not to use.
 	  
      #to_explore = ex_strategy_one(num_iterations)
      to_explore = ex_strategy_two(num_total_iterations)
      #to_explore = ex_strategy_three(g, num_games)
      
      action = 0

      if to_explore:
        # explore
        #print "explore\n"
        a = random.randint(0, len(actions)-1)
        action = actions[a]
        #print "action {}".format(action)
      else:
        # exploit
        num_iterations += 1
        #print "exploit\n"
        action = lookup_max_a(Q, s, actions)
        #print "action {}".format(action)
        #action = a # actions[a]


      # Get result of throw from dart thrower; update score if necessary
      loc = throw.throw(action) 
      #print "score {}".format(s)
      #print "throw value:{}".format(throw.location_to_score(loc))
      #should reward be based on action of loc?
      reward = darts.R(s,action) 
      #print "reward {}".format(reward)
      s_prime = s - throw.location_to_score(loc)
      if s_prime < 0:
        s_prime = s
                
      # now we update the q score table
      #oldQ = copy.deepcopy(Q[s][a])
      oldQ = (Q[s][action])
      #print "oldQ {}".format(oldQ)
      nextQaction = lookup_max_a(Q, s_prime, actions)
      #print "nextQaction {}".format(nextQaction)
      newQ = oldQ + alpha*(reward + gamma*(Q[s_prime][nextQaction]) - oldQ)
      #print "newQ {}".format(newQ)
      Q[s][action] = newQ
      #print "Q[s][a] {}".format(Q[s][a])
      #print "in game {},score {}, throw value {}, oldQ {}, newQ{}".format(g,s,throw.location_to_score(loc),oldQ,newQ)

      s = s_prime
    #print gamethrows
  print "Average turns = ", float(num_iterations)/float(num_games/2)