def OnPolicyMCControl(self):
        """ On-policy MC control following Sutton Barto 5.4
        """
        t = time.time()
        while True:
            while time.time() - t < 10:
                num = State().get_num()
                history = [num]
                while not State(from_base10=num).is_terminal():
                    num = self.policy_1.move(num)
                    history.append(num)
                # with no discounting and only a terminal reward, the return g is the same for every state in the episode
                g = State(from_base10=num).get_reward()
                for num in history:
                    if num in self.returns:
                        self.returns[num].append(g)
                    else:
                        self.returns[num] = [g]
                    self.policy_1.v_dict[num] = np.average(self.returns[num])
                if self.policy_1.be_greedy(history):
                    self.policy_stable = False
                self.i_epoch += 1

            t = time.time()
            pickle.dump((self.policy_1, self.i_epoch, self.returns),
                        open(self.path, "wb"))
            print("Trained %i epochs so far." % self.i_epoch)
 def MCPrediction(self, n_epoch):
     """ MC prediction following Sutton Barto 5.1
         Against rush opponent
     Input:
          n_epoch: the number of episodes to be trained
     """
     self.policy_2 = TabularPolicy()
     returns = dict()
     for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
         returns[num] = []
     for _ in range(n_epoch):
         # generate an episode with policy_1 and policy_2 alternating moves
         s = State().get_num()
         history = [s]
         while not State(from_base10=s).is_terminal():
             s = self.policy_1.move_dict[s]
             history.append(s)
             if State(from_base10=s).is_terminal():
                 break
             s = self.policy_2.move_dict[s]
             history.append(s)
         # with no discounting and only a terminal reward, the return g is the same for every state in the episode
         g = State(from_base10=s).get_reward()
         for i, s in enumerate(history):
             returns[s].append(g)
             if i % 2 == 0:
                 self.policy_1.v_dict[s] = np.average(returns[s])
             else:
                 self.policy_2.v_dict[s] = np.average(returns[s])
     for num in range(int('2' + '0' * 9, 3), int('2' * 10, 3) + 1):
         self.policy_1.v_dict[num] = self.policy_2.v_dict[num]
     self.i_epoch += 1
     pickle.dump((self.policy_1, self.i_epoch), open(self.path, "wb"))
     print('MC prediction finished.')
 def OffPolicyMCControl(self, trajectory, role_behavior_policy):
     """ Incremental implementation of off-policy MC prediction
     Input:
         trajectory
         role_behavior_policy: 1 or 2, denoting which player the behavior policy acted as in this trajectory
     """
     # with no discounting and only a terminal reward, the return g is the same for every state in the episode
     g = State(from_base10=trajectory[-1]).get_reward()
     w = 1.
     for i, state in reversed(list(enumerate(trajectory))):
         if i == len(trajectory) - 1:
             # ignore the very last state, which is not a beforestate
             continue
         if (i % 2 + 1) != role_behavior_policy:
             # i denotes the number of pieces on the board. i%2+1 is 1 if
             # this is player 1's before state, and is 2 if this is player
             # 2's before state.
             continue
         afterstate = trajectory[i+1]
         if afterstate in self.c:
             self.c[afterstate] += w
         else:
             self.c[afterstate] = w
         self.target_policy.v_dict[afterstate] += w / \
             self.c[afterstate] * \
             (g - self.target_policy.v_dict[afterstate])
         self.target_policy.be_greedy([state])
         if self.target_policy.move_dict[trajectory[i]] != afterstate:
             break
         else:
             w = w * \
                 len(State(from_base10=trajectory[i]).legal_afterstates())
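
The loop above is the incremental weighted-importance-sampling update from Sutton and Barto: C(s) accumulates the weights, V(s) moves toward the return G by a step of W/C(s), the weight W is multiplied by the number of legal afterstates (consistent with a uniformly random behavior policy and a deterministic target policy), and the loop breaks as soon as the behavior action differs from the greedy target action, i.e. the target probability of the taken action is zero. A self-contained numeric sketch of just the update rule, with invented states, returns and weights:

def wis_update(v, c, state, g, w):
    # Incremental weighted importance sampling:
    # C(s) += W, then V(s) += (W / C(s)) * (G - V(s)).
    c[state] = c.get(state, 0.0) + w
    v[state] = v.get(state, 0.0) + (w / c[state]) * (g - v.get(state, 0.0))

v, c = {}, {}
wis_update(v, c, "s", g=1.0, w=1.0)   # first weighted return
wis_update(v, c, "s", g=-1.0, w=3.0)  # a later return carrying weight 3
print(v["s"])  # (1*1.0 + 3*(-1.0)) / (1 + 3) = -0.5
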
Example #4
    def MCES(self):
        """ MC exploring start following Sutton Barto 5.3
            Against rush opponent
        """
        t = time.time()
        # No need to keep a list of returns: with deterministic policies, each start state always produces the same return
        for s in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
            history = [s]
            while not State(from_base10=s).is_terminal():
                s = self.policy_1.move_dict[s]
                history.append(s)
                if State(from_base10=s).is_terminal():
                    break
                s = self.policy_1.move_dict[s]
                history.append(s)
            g = State(from_base10=s).get_reward()
            for s in history:
                self.policy_1.v_dict[s] = g
            if self.policy_1.be_greedy(history):
                self.policy_stable = False
            self.i_epoch += 1
            if time.time() - t > 10:
                t = time.time()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.path, "wb"))
                print("Trained %i epochs so far." % self.i_epoch)

        pickle.dump((self.policy_1, self.i_epoch), open(self.path, "wb"))
        print('MC exploring start finished.')
 def GetInitialState(self):
     """ Return an initial state. 50% chance an empty board (turn = 1), 50% chance a board with a randomly placed X (turn = 2).
     """
     if np.random.rand() < 0.5:
         return State()
     else:
         choices = State().legal_afterstates()
         num = random.choice(choices)
         return State(from_base10=num)
Example #6
def test_initialize_state_from_base10():
    """ Legitimate number
    """
    num = int('1012012000', 3)
    state = State(from_base10=num)
    assert state.board == [[0, 1, 2], [0, 1, 2], [0, 0, 0]]
    assert state.turn == 1
    """ Illegitimate number
    """
    num = int('120120120', 3)
    with pytest.raises(ValueError):
        state = State(from_base10=num)
 def GetOneTrajectory(self, policy_1, policy_2):
     """ 
     Returns: list of state nums of a trajectory
     """
     num = State().get_num()
     trajectory = [num]
     while not State(from_base10=num).is_terminal():
         num = policy_1.move(num)
         trajectory.append(num)
         if not State(from_base10=num).is_terminal():
             num = policy_2.move(num)
             trajectory.append(num)
         else:
             break
     return trajectory
Example #8
def test_be_greedy():
    policy = TabularPolicy()
    best = State(board=[[0, 0, 0], [1, 0, 0], [0, 0, 0]], turn=2)
    policy.v_dict[best.get_num()] = 1
    assert policy.be_greedy()
    state = State()
    assert policy.move_dict[state.get_num()] == best.get_num()
    assert not policy.be_greedy()  # No more change when run the second time
    def ValueIteration(self, theta=0.01):
        """ Value iteration following Sutton Barto 4.4
            Against rush opponent, with afterstates
        """
        t = time.time()
        while True:
            delta = 0
            for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
                v = self.policy_1.v_dict[num]
                state = State(from_base10=num)
                if state.is_terminal():
                    self.policy_1.v_dict[num] = state.get_reward()
                else:
                    opponent_afterstate = State(
                        from_base10=self.policy_2.move_dict[num])
                    if opponent_afterstate.is_terminal():
                        self.policy_1.v_dict[
                            num] = opponent_afterstate.get_reward()
                    else:
                        s_prime_choices = opponent_afterstate.legal_afterstates(
                        )
                        if state.turn == 2:
                            vi_update = max([
                                self.policy_1.v_dict[x]
                                for x in s_prime_choices
                            ])
                        else:
                            vi_update = min([
                                self.policy_1.v_dict[x]
                                for x in s_prime_choices
                            ])
                        self.policy_1.v_dict[num] = vi_update
                delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))

            self.i_epoch += 1

            if delta < theta:
                print('Value function has converged!')
                print("Trained %i epochs so far." % self.i_epoch)
                self.policy_ever_changed = self.policy_1.be_greedy()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
                break

            if time.time() - t > 10:
                t = time.time()
                print("Trained %i epochs so far." % self.i_epoch)
                self.policy_ever_changed = self.policy_1.be_greedy()
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
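
The backup above is standard value iteration specialised to afterstates, with player 1 maximising and player 2 minimising over the successor afterstate values. For reference, here is the textbook sweep V(s) ← max_a [r + V(s')] on a tiny hand-made deterministic MDP; the states, actions and rewards below are invented purely for illustration:

# Value-iteration sweeps on an invented two-state deterministic MDP with a
# terminal state "T"; sweeps continue until the largest change is below theta.
transitions = {
    "A": {"left": ("B", 0.0), "right": ("T", 1.0)},
    "B": {"left": ("A", 0.0), "right": ("T", -1.0)},
}
v = {"A": 0.0, "B": 0.0, "T": 0.0}
theta = 1e-6
while True:
    delta = 0.0
    for s, actions in transitions.items():
        old = v[s]
        v[s] = max(r + v[s2] for s2, r in actions.values())
        delta = max(delta, abs(old - v[s]))
    if delta < theta:
        break
print(v)  # {'A': 1.0, 'B': 1.0, 'T': 0.0}
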
Example #10
def test_get_trajectory():
    trainer = Train(path='foo', read_first=False)
    trainer.epsilon = 0
    trajectory = trainer.GetOneTrajectory(TabularPolicy(), TabularPolicy())
    num1 = State(board=[[0, 0, 0], [0, 0, 0], [0, 0, 0]]).get_num()
    num2 = State(board=[[1, 0, 0], [0, 0, 0], [0, 0, 0]], turn=2).get_num()
    num3 = State(board=[[1, 2, 0], [0, 0, 0], [0, 0, 0]]).get_num()
    num4 = State(board=[[1, 2, 1], [0, 0, 0], [0, 0, 0]], turn=2).get_num()
    num5 = State(board=[[1, 2, 1], [2, 0, 0], [0, 0, 0]]).get_num()
    num6 = State(board=[[1, 2, 1], [2, 1, 0], [0, 0, 0]], turn=2).get_num()
    num7 = State(board=[[1, 2, 1], [2, 1, 2], [0, 0, 0]]).get_num()
    num8 = State(board=[[1, 2, 1], [2, 1, 2], [1, 0, 0]], turn=2).get_num()
    assert trajectory == [num1, num2, num3, num4, num5, num6, num7, num8]
 def PolicyImprovement(self):
     """ Policy Improvement following Sutton Barto 4.3
         Against rush opponent, with afterstates
     """
     self.policy_stable = True
     for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
         state = State(from_base10=num)
         if not state.is_terminal():
             old_action_num = self.policy_1.move_dict[num]
             # get the best afterstates
             afterstate_nums = state.legal_afterstates()
             afterstate_values = [
                 self.policy_1.v_dict[x] for x in afterstate_nums
             ]
             best = np.argmax(
                 afterstate_values) if state.turn == 1 else np.argmin(
                     afterstate_values)
             self.policy_1.move_dict[num] = afterstate_nums[best]
             if old_action_num != self.policy_1.move_dict[num]:
                 self.policy_stable = False
                 self.policy_ever_changed = True
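
The greedy step above simply replaces each state's stored move by the legal afterstate with the best value, maximising for player 1 and minimising for player 2. A tiny self-contained illustration with invented afterstate numbers and values:

import numpy as np

# Greedy improvement over afterstate values (invented numbers):
# player 1 picks the argmax, player 2 the argmin.
afterstate_nums = [101, 102, 103]
v_dict = {101: -0.2, 102: 0.7, 103: 0.1}
afterstate_values = [v_dict[x] for x in afterstate_nums]
turn = 1
best = np.argmax(afterstate_values) if turn == 1 else np.argmin(afterstate_values)
print(afterstate_nums[best])  # 102
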
Example #12
def test_get_num_from_state():
    state = State(board=[[0, 1, 2], [0, 1, 2], [0, 0, 0]], turn=2)
    num = state.get_num()
    assert num == int('2012012000', 3)
    state = State(board=[[1, 2, 1], [2, 1, 2], [1, 2, 2]])
    num = state.get_num()
    assert num == int('1121212122', 3)
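
These assertions pin down the state numbering: ten base-3 digits, the first being the player to move and the remaining nine the board cells in row-major order (0 empty, 1 X, 2 O). A round-trip sketch of that encoding; the helper names board_to_num and num_to_board are made up here and are not part of the ttt_play API:

def board_to_num(board, turn):
    # leading digit = player to move, then the nine cells row by row, read as base 3
    digits = str(turn) + ''.join(str(cell) for row in board for cell in row)
    return int(digits, 3)

def num_to_board(num):
    # invert the encoding: peel off ten base-3 digits, the first is the turn
    digits = []
    for _ in range(10):
        digits.append(num % 3)
        num //= 3
    digits.reverse()
    turn, cells = digits[0], digits[1:]
    return [cells[0:3], cells[3:6], cells[6:9]], turn

num = board_to_num([[0, 1, 2], [0, 1, 2], [0, 0, 0]], turn=2)
assert num == int('2012012000', 3)
assert num_to_board(num) == ([[0, 1, 2], [0, 1, 2], [0, 0, 0]], 2)
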
    def PolicyEvaluation(self):
        """Policy Evaluation following Sutton Barto 4.3
           Against rush opponent, with afterstates
        """
        theta = 0.01
        t = time.time()
        while True:
            delta = 0
            for num in range(int('1' + '0' * 9, 3), int('2' * 10, 3) + 1):
                v = self.policy_1.v_dict[num]
                state = State(from_base10=num)  # state here is an afterstate

                # terminal state, v function equals game result (no reward for transition)
                if state.is_terminal():
                    self.policy_1.v_dict[num] = state.get_reward()
                else:
                    # non-terminal afterstates
                    opponent_afterstate = State(
                        from_base10=self.policy_2.move_dict[num])
                    if opponent_afterstate.is_terminal():
                        self.policy_1.v_dict[
                            num] = opponent_afterstate.get_reward()
                    else:
                        s_prime_num = self.policy_1.move_dict[
                            opponent_afterstate.get_num()]
                        self.policy_1.v_dict[num] = self.policy_1.v_dict[
                            s_prime_num]

                delta = max(delta, np.abs(v - self.policy_1.v_dict[num]))

            self.i_epoch += 1

            if delta < theta:
                print('Value function has converged!')
                print("Trained %i epochs so far." % self.i_epoch)
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
                break

            if time.time() - t > 10:
                t = time.time()
                print("Trained %i epochs so far." % self.i_epoch)
                pickle.dump((self.policy_1, self.i_epoch),
                            open(self.write_path, "wb"))
Example #14
def test_is_terminal():
    """ Board not full, but player 1 has won
    """
    state = State(board=[[0, 2, 1], [0, 1, 2], [1, 2, 2]])
    assert state.is_terminal()
    """ Board full
  """
    state = State(board=[[1, 2, 1], [2, 1, 2], [1, 2, 2]])
    assert state.is_terminal()
Example #15
def test_rush_policy():
    """
    Only one possible move.
    """
    state = State(board=[[1, 2, 1], [2, 2, 1], [0, 1, 2]], turn=1)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_after_state = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]],
                                 turn=2)
    assert after_state.board == expected_after_state.board
    assert after_state.turn == expected_after_state.turn
    """
    Multiple possible moves.
    """
    state = State(board=[[1, 0, 0], [2, 2, 1], [0, 1, 2]], turn=2)
    policy = TabularPolicy()
    after_state = State(from_base10=policy.move_dict[state.get_num()])
    expected_board = [[1, 2, 0], [2, 2, 1], [0, 1, 2]]
    assert after_state.board == expected_board
    assert after_state.turn == 1
    """
    Filled board
    """
    state = State(board=[[1, 2, 1], [2, 2, 1], [1, 1, 2]], turn=2)
    policy = TabularPolicy()
    with pytest.raises(KeyError):
        after_state = State(from_base10=policy.move_dict[state.get_num()])
Example #16
def test_legal_afterstates():
    # full board, no legal afterstate
    state = State(board=[[2, 2, 2], [1, 1, 1], [1, 2, 2]], turn=1)
    assert not state.legal_afterstates()
    # one legal afterstate
    state = State(board=[[2, 2, 2], [1, 1, 1], [1, 0, 2]], turn=1)
    assert state.legal_afterstates() == [
        State([[2, 2, 2], [1, 1, 1], [1, 1, 2]], turn=2).get_num()
    ]
    # 3 legal afterstates
    state = State(board=[[2, 2, 2], [1, 1, 1], [0, 0, 0]], turn=2)
    temp = state.legal_afterstates()
    assert len(temp) == 3
    num1 = State(board=[[2, 2, 2], [1, 1, 1], [2, 0, 0]]).get_num()
    num2 = State(board=[[2, 2, 2], [1, 1, 1], [0, 2, 0]]).get_num()
    num3 = State(board=[[2, 2, 2], [1, 1, 1], [0, 0, 2]]).get_num()
    assert set(temp) == set([num1, num2, num3])
Example #17
def test_judge():
    # horizontal
    state = State(board=[[0, 0, 0], [1, 1, 1], [0, 2, 2]], turn=1)
    assert state.judge() == 1
    # vertical
    state = State(board=[[0, 1, 2], [0, 1, 2], [1, 0, 2]], turn=1)
    assert state.judge() == 2
    # diagonal
    state = State(board=[[1, 0, 2], [0, 1, 0], [0, 2, 1]], turn=1)
    assert state.judge() == 1
    # unfinished game
    state = State(board=[[1, 0, 0], [0, 0, 2], [0, 0, 0]], turn=1)
    assert state.judge() == -1
    # tied game
    state = State(board=[[1, 2, 2], [2, 1, 1], [2, 1, 2]], turn=1)
    assert state.judge() == 0
Example #18
"""
@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

# state = State(board=[[1, 2, 1],
#                     [2, 2, 1],
#                     [1, 0, 0]], turn=2)
# state.print_board()
# assert policy.v_dict[state.get_num()] == pytest.approx(
#    -0.5, abs=theta), 'Player 2 plays random, one move is winning and one move is leading to a tie, expect value -0.5. Got %f' % policy.v_dict[state.get_num()]

state = State(board=[[1, 0, 0], [0, 0, 0], [0, 0, 0]], turn=2)
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Both play rush, player 1 will win. Got %f' % policy.v_dict[state.get_num()]
""" Keep this print statement at the end
"""
print('All assertions passed.')
Example #19
from ttt_play import State
from ttt_policies import TabularPolicy
import os
import pickle

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

opponent_policy = TabularPolicy(epsilon=1)
results = []
for i in range(1000):
    state = State()
    while True:
        state = State(from_base10=policy.move_dict[state.get_num()])
        if state.is_terminal():
            break
        else:
            state = State(from_base10=opponent_policy.move(state.get_num()))
            if state.is_terminal():
                break
    results.append(state.get_reward())

print("Average reward %f over 1000 games as player X against random policy." %
      (sum(results) / 1000.))

results = []
for i in range(1000):
Example #20
"""
@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Policy iteration against rush opponent. Accuracy %f' % theta)

state = State(board=[[1, 1, 1], [2, 2, 1], [2, 2, 1]], turn=2)
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Player 1 wins, expect value 1. Got %f' % policy.v_dict[state.get_num()]

state = State(board=[[1, 1, 2], [2, 2, 1], [1, 2, 1]], turn=2)
assert policy.v_dict[state.get_num()] == pytest.approx(
    0,
    abs=theta), 'Tied. Expect value 0. Got %f' % policy.v_dict[state.get_num()]

state = State(board=[[1, 1, 0], [2, 2, 1], [2, 1, 0]], turn=2)
assert policy.v_dict[state.get_num()] == pytest.approx(
    -1, abs=theta
), 'One step before losing. Expect value -1. Got %f' % policy.v_dict[
    state.get_num()]
Example #21
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 15 19:58:57 2019

@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

state = State(board=[[2, 1, 2], [0, 1, 0], [0, 0, 0]])
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1, abs=theta
), 'Player 1 can win next step, expect value 1, got %f' % policy.v_dict[
    state.get_num()]
""" Keep this print statement at the end
"""
print('All assertions passed.')
Example #22
 def TrainOneRound(self, afterstate_num, alpha=.1):
     """ Sarsa following Sutton and Barto 6.2
     Input:
         afterstate: the afterstate of target_policy to start trainng with
         Note that the opponent makes a move first, then the target policy.
     """
     afterstate = State(from_base10=afterstate_num)
     while not afterstate.is_terminal():
         beforestate_num = self.opponent_policy.move(afterstate.get_num())
         beforestate = State(from_base10=beforestate_num)
         if beforestate.is_terminal():
             r = beforestate.get_reward()
             self.target_policy.v_dict[afterstate.get_num(
             )] += alpha * (r - self.target_policy.v_dict[afterstate.get_num()])
             break
         else:
             self.target_policy.be_greedy([beforestate_num])
             s_prime_num = self.target_policy.move(beforestate_num)
             s_prime = State(from_base10=s_prime_num)
             r = s_prime.get_reward()
             self.target_policy.v_dict[afterstate.get_num(
             )] += alpha * (r + self.target_policy.v_dict[s_prime_num] - self.target_policy.v_dict[afterstate.get_num()])
             afterstate = s_prime
     self.target_policy.be_greedy([self.start_num])
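
The value change applied above is the tabular TD step V(S) ← V(S) + α [R + V(S') − V(S)], computed on afterstates and with a reward only on terminal transitions. A one-step numeric check with invented values:

# One tabular TD/Sarsa-style step on afterstate values (invented numbers):
# V(S) += alpha * (R + V(S') - V(S)).
alpha = 0.1
v = {"s": 0.2, "s_prime": 0.6}
r = 0.0  # non-terminal transition: no intermediate reward in this game
v["s"] += alpha * (r + v["s_prime"] - v["s"])
print(round(v["s"], 3))  # 0.2 + 0.1 * (0.0 + 0.6 - 0.2) = 0.24
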
 def AutoPlay(self, policy_1, policy_2, n_games=100):
     """ Let policy_1 and policy_2 play against each other for n_games
     Input: self explanatory.
     Returns:
          A list of game results, i.e. reward for player 1.
     """
     game_results = []
     for i in range(n_games):
         state = self.GetInitialState()
         if state.turn == 2:
             state = State(from_base10=policy_2.move_dict[state.get_num()])
         while not state.is_terminal():
             state = State(from_base10=policy_1.move_dict[state.get_num()])
             if state.is_terminal():
                 break
             state = State(from_base10=policy_2.move_dict[state.get_num()])
         game_results.append(state.get_reward())
     return game_results
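
AutoPlay returns one reward per game from player 1's perspective (+1 win, 0 draw, -1 loss, judging from the assertions elsewhere on this page), so the result list is easy to summarise. A small sketch with an invented result list:

from collections import Counter

# Summarise a list of per-game rewards from player 1's point of view
# (the sample list is invented for illustration).
game_results = [1, 1, 0, -1, 0, 1]
counts = Counter(game_results)
n = len(game_results)
print("win %.2f  draw %.2f  loss %.2f  avg reward %.2f" % (
    counts[1] / n, counts[0] / n, counts[-1] / n, sum(game_results) / n))
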
Example #24
 def TrainOneRound(self, afterstate_num, alpha=.1):
     """ Q learning following Sutton and Barto 6.5
     Input:
         afterstate_num: num of the target_policy afterstate to start training from
         Note that the opponent makes a move first, then the target policy.
     """
     afterstate = State(from_base10=afterstate_num)
     while not afterstate.is_terminal():
         beforestate_num = self.random_policy.move(
             afterstate.get_num())  # opponent makes a move
         beforestate = State(from_base10=beforestate_num)
         if beforestate.is_terminal():
             r = beforestate.get_reward()
             self.target_policy.v_dict[afterstate.get_num()] += alpha * (
                 r - self.target_policy.v_dict[afterstate.get_num()])
             break
         else:
             s_primes = beforestate.legal_afterstates()
             candidates = []
             for s_prime in s_primes:
                 r = State(from_base10=s_prime).get_reward()
                 q = self.target_policy.v_dict[s_prime]
                 candidates.append(r + q)
             if beforestate.turn == 1:
                 self.target_policy.v_dict[
                     afterstate.get_num()] += alpha * (
                         max(candidates) -
                         self.target_policy.v_dict[afterstate.get_num()])
             else:
                 self.target_policy.v_dict[
                     afterstate.get_num()] += alpha * (
                         min(candidates) -
                         self.target_policy.v_dict[afterstate.get_num()])
             afterstate_num = self.random_policy.move(beforestate_num)
             afterstate = State(from_base10=afterstate_num)
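
Unlike the Sarsa variant earlier on this page, the target here is the best (player 1 to move) or worst (player 2 to move) of all legal successor afterstates rather than the successor that was actually sampled. A minimal numeric check of that backup, with invented values:

# Q-learning-style backup over afterstate values (invented numbers):
# the target is max over r + V(s') for player 1, min for player 2.
alpha = 0.1
v_afterstate = 0.0
successors = [(0.0, 0.3), (1.0, 0.0), (0.0, -0.5)]  # (reward, V(s')) per legal move
candidates = [r + q for r, q in successors]
turn = 1
target = max(candidates) if turn == 1 else min(candidates)
v_afterstate += alpha * (target - v_afterstate)
print(v_afterstate)  # 0.0 + 0.1 * (1.0 - 0.0) = 0.1
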
Example #25
"""
@author: daugh
"""
from ttt_play import State
import os
import pickle
import pytest

policy, i_epoch = pickle.load(
    open(os.path.dirname(os.getcwd()) + '/policy_evaluation.pkl', 'rb'))

print('This value function has been trained for %i epochs.' % i_epoch)
theta = 0.01
print('Accuracy %f' % theta)

state = State(board=[[1, 2, 1], [2, 2, 1], [1, 0, 0]], turn=2)
assert policy.v_dict[state.get_num()] == pytest.approx(
    -0.5, abs=theta
), 'Player 2 plays random, one move is winning and one move is leading to a tie, expect value -0.5. Got %f' % policy.v_dict[
    state.get_num()]

state = State(board=[[2, 1, 0], [2, 1, 0], [1, 2, 0]])
state.print_board()
assert policy.v_dict[state.get_num()] == pytest.approx(
    1. / 3,
    abs=theta), 'Player 1 plays random, one move is winning, the other\
        two moves lead to a draw because player 2 (target policy) plays rush. \
        Expect value 1/3. Got %f' % policy.v_dict[state.get_num()]
""" Keep this print statement at the end
"""
print('All assertions passed.')