"""
@author: Charles Petersen and Jamison Barsotti
"""
from config import *
from agent import Agent
from gameState import GameState
from puctNode import PUCTNode
import ray
import numpy as np
@ray.remote(num_gpus=0.075)
class SelfPlay(object):
    '''Abstract class for constructing remote self-play actors.
    '''
    def __init__(self):
        '''Instantiates a self-play actor.

        Returns
        -------
        None.

        '''
        self.agent = Agent(path='./model_data/alpha_0.pt')

    def run(self,
            replay_buffer,
            update_signal,
            self_play_id,
            search_iters=SELF_PLAY_SEARCH_ITERS,
            markov_exp=SELF_PLAY_MARKOV_EXP,
            temp=TEMP,
            temp_thrshld=TEMP_THRSHLD):
        '''Starts an indefinite self-play loop. The games for self-play are
        generated via an ongoing Markov chain as described in randomDag.py.
        The self-play processes are synchronized with one another and with
        the train and evaluation processes via 'replay_buffer' and
        'update_signal', respectively: 'replay_buffer' stores the self-play
        data and triggers the start of training, while 'update_signal'
        triggers model parameter updates.

        Parameters
        ----------
        replay_buffer : ReplayBuffer
            remote actor for managing self-play data between the self-play
            processes and the train process. Also carries the signal to
            start training.
        update_signal : UpdateSignal
            remote actor for synchronization between the self-play processes
            and the evaluation processes. Triggers model parameter updates.
        self_play_id : int (nonnegative)
            unique identifier for the self-play process.
        search_iters : int (positive), optional
            the number of search iterations to perform during MCTS.
            The default is SELF_PLAY_SEARCH_ITERS.
        markov_exp : float, optional
            the exponent determining the number of steps taken in the Markov
            chain when generating games for self-play. The default is
            SELF_PLAY_MARKOV_EXP.
        temp : float (nonnegative), optional
            partially controls exploration. If 0, the policy is deterministic
            and the move with the highest visit count from MCTS is chosen.
            The default is TEMP.
        temp_thrshld : int (nonnegative), optional
            the number of moves after which the policy becomes deterministic,
            i.e., temp is set to 0. (See temp, above.) The default is
            TEMP_THRSHLD.

        Returns
        -------
        None.

        '''
        # put agent in evaluation mode
        self.agent.model.eval()

        # the action space
        actions = np.arange(MAX_NODES)

        # game state generator via an ongoing Markov chain
        state_generator = GameState.state_generator(markov_exp)

        # start indefinite self-play loop
        while True:
            # check for updates
            if ray.get(update_signal.get_update.remote(self_play_id)):
                # get current update_id
                update_id = ray.get(update_signal.get_update_id.remote())
                # load current alpha parameters
                self.agent.load_parameters(
                    path=f'./model_data/alpha_{update_id}.pt'
                )
                # reset the update signal
                update_signal.clear_update.remote(self_play_id)

            # get a game and play
            initial_state = next(state_generator)
            root = PUCTNode(initial_state)
            states = []
            policies = []
            move_count = 0

            while not root.state.is_terminal_state():
                t = temp if move_count < temp_thrshld else 0
                policy = self.agent.MCTS(root, search_iters, t)
                move = np.random.choice(actions, p=policy)
                states.append(root.state.encoded_state)
                policies.append(policy)
                root = root.edges[move]
                root.to_root()
                move_count += 1

            # assign each stored state a value from the perspective of the
            # player to move there: the final recorded state gets value +1
            # (the player who made the last move won), alternating back.
            if move_count % 2 == 0:
                values = [(-1)**(i+1) for i in range(move_count)]
            else:
                values = [(-1)**i for i in range(move_count)]

            # construct training data from self-play
            train_data = [(state, policy, value) for state, policy, value
                          in zip(states, policies, values)]

            # add training data to replay buffer
            replay_buffer.add.remote(train_data)
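

# A minimal launch sketch, kept as comments, showing how the actors described
# in the run() docstring might be wired together. The replayBuffer and
# updateSignal module names, their constructors, and the actor count are
# assumptions for illustration only; they are not defined in this module.
#
#     from replayBuffer import ReplayBuffer
#     from updateSignal import UpdateSignal
#
#     ray.init()
#     replay_buffer = ReplayBuffer.remote()
#     update_signal = UpdateSignal.remote()
#     actors = [SelfPlay.remote() for _ in range(4)]   # 4 actors, arbitrary
#     for idx, actor in enumerate(actors):
#         actor.run.remote(replay_buffer, update_signal, self_play_id=idx)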