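'''
ProgressiveTrainer: progressive curriculum training for a grid path-planning
Q-network. Environments are generated one at a time; a new one is added once
the model reaches a target accuracy on all existing environments, and
training stops when the maximum number of environments all reach the final
accuracy threshold.
'''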
import math
import torch
import numpy as np
from torch import nn, optim
from AStar import AStar
from PathPlanningEnv import PathPlanningEnv
class Loc:
    '''A (row, col) grid location with copy, equality, and single-step move helpers.'''
    def __init__(self, row, col):
self.row = row
self.col = col
def copy(self):
return Loc(self.row, self.col)
def copy_from(self, loc):
self.row = loc.row
self.col = loc.col
def move(self, direction):
row = self.row
col = self.col
if direction == 'u' or direction == 0:
row -= 1
elif direction == 'd' or direction == 1:
row += 1
elif direction == 'l' or direction == 2:
col -= 1
elif direction == 'r' or direction == 3:
col += 1
else:
raise RuntimeError("Error: unknown move")
return Loc(row, col)
def __eq__(self, other):
return (self.row == other.row) and (self.col == other.col)
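# A minimal illustration of Loc.move (not part of the trainer): 'u'/0
# decrements the row, 'r'/3 increments the column, and a new Loc is returned.
assert Loc(2, 3).move('u') == Loc(1, 3)
assert Loc(2, 3).move(3) == Loc(2, 4)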
def ReverseAction(action):
    '''
    Return the index of the opposite action:
        0 (up)   <-> 1 (down)
        2 (left) <-> 3 (right)
    '''
parity = action % 2
change = parity * (-2) + 1
return action + change
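# A quick sanity check of the pairing above (a minimal illustration, not part
# of the trainer): every action maps to its opposite and back.
assert ReverseAction(ReverseAction(0)) == 0
assert ReverseAction(2) == 3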
def Softmax(l):
    '''Softmax over a list of numbers, returned as a list of probabilities.'''
    exps = [math.e ** i for i in l]
s = sum(exps)
return [i/s for i in exps]
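# A numerically stable variant (a sketch; the name is ours and nothing below
# uses it): subtracting the max before exponentiating avoids overflow for
# large inputs while producing the same distribution.
def StableSoftmax(l):
    m = max(l)
    exps = [math.e ** (i - m) for i in l]
    s = sum(exps)
    return [i / s for i in exps]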
class ProgressiveTrainer:
def __init__(self, model, height=10, width=10, num_obstacle=10, max_play_length=500,
epsilon_high=0.9, epsilon_low=0.1, gamma=0.9, lr=0.01,
init_env_num=1, max_env_num=20, env_inc_acc=0.9, env_final_acc=0.98, seed=42,
loss_func=nn.MSELoss(), device='cpu'):
'''
Parameters:
model: the action-value function network
height: map height
width: map width
num_obstacle: number of obstacles in the map
max_play_length: maximum number of steps in each epoch
epsilon_high: high epsilon value
epsilon_low: low epsilon value
gamma: discounting factor
lr: learning rate
        init_env_num: the initial number of environments
        max_env_num: the maximum number of environments
        env_inc_acc: once every existing env reaches this accuracy, a new env is added
        env_final_acc: once every env reaches this accuracy and max_env_num envs exist, training stops
        seed: base random seed used when generating environments
        loss_func: loss function
device: running device
'''
self.model = model
self.loss_func = loss_func
self.device = device
self.optimizer = optim.SGD(model.parameters(), lr=lr)
self.height = height
self.width = width
self.num_obstacle = num_obstacle
self.epsilon_high = epsilon_high
self.epsilon_low = epsilon_low
self.gamma = gamma
self.max_play_length = max_play_length
self.model.to(device)
self.envs = []
self.init_env_num = init_env_num
self.max_env_num = max_env_num
self.env_inc_acc = env_inc_acc
self.env_final_acc = env_final_acc
self.seed = seed
self.train_state = {}
while len(self.envs) < self.init_env_num:
self._add_new_env()
def _add_new_env(self):
'''
Add a new environment to the environment list.
'''
# Generate a random map and make sure that all non-obstacle
# positions can reach the goal.
while True:
            env_settings = {
                'height': self.height,
                'width': self.width,
                'obs_count': self.num_obstacle,
                'random_seed': self.seed,
                'device': self.device
            }
            self.seed += 1
            new_env = PathPlanningEnv(**env_settings)
astar = AStar(new_env.grid[2,:,:], (new_env.goal_row, new_env.goal_col))
all_reachable = True
for row in range(self.height):
for col in range(self.width):
if new_env.grid[2,row,col] == 0 \
and new_env.grid[1,row,col] != 1 \
and not astar.plan(row, col):
all_reachable = False
if all_reachable: break
self.envs.append(new_env)
def _gen_vec_map(self, env):
        '''
        Generate the vector map of an environment. Each entry is the action
        (next-step direction) the current model prefers at that cell.
        Note: this mutates env's agent position as a side effect.
        '''
        # Vector map values: -2: obstacle, -1: goal, 0-3: directions
vec_map = torch.full((env.height, env.width), -2, dtype=torch.int8, requires_grad=False).to(self.device)
        # Batch all (state, action) pairs of a row into one forward pass for
        # speed, instead of querying the model once per cell and action.
for i in range(env.height):
packed_states = []
packed_actions = []
for j in range(env.width):
old_obs = env.grid[2,:,:]
env._init_from_grid(old_obs, i, j, env.goal_row, env.goal_col, self.device)
state = env.grid
if env.grid[2, i, j] == 0 and (i, j) != (env.goal_row, env.goal_col):
state = env.grid.clone().detach()
states = torch.stack((state, state, state, state))
actions = torch.stack(env.actions)
packed_states.append(states)
packed_actions.append(actions)
            # Skip rows with no free, non-goal cells (torch.cat would fail).
            if not packed_states:
                continue
            packed_states = torch.cat(packed_states, dim=0)
            packed_actions = torch.cat(packed_actions, dim=0)
            with torch.no_grad():
                packed_preds = self.model(packed_states, packed_actions)
            idx = 0
            for j in range(env.width):
                if env.grid[2, i, j] == 0 and (i, j) != (env.goal_row, env.goal_col):
                    preds = list(packed_preds[idx:idx+4].flatten())
                    max_index = preds.index(max(preds))
                    vec_map[i, j] = max_index
                    idx += 4
        vec_map[env.goal_row, env.goal_col] = -1
        return vec_map
def _is_out_of_boundary(self, env, loc):
'''
Returns True if the location is out of the boundaries.
'''
if (loc.row >= env.height) or (loc.row < 0):
return True
if (loc.col >= env.width) or (loc.col < 0):
return True
return False
def _is_obstacle(self, env, loc):
'''
Returns True if the location is an obstacle.
'''
return env.grid[2, loc.row, loc.col] == 1
def _is_goal(self, env, loc):
'''
Returns True if the location is the goal.
'''
return env.grid[1, loc.row, loc.col] == 1
def _find_env_failure(self, env):
'''
Find the positions on the map from which the agent can't reach the goal.
'''
vec_map = self._gen_vec_map(env)
# Result map value: -1: can't reach goal, 0: not tested yet, 1: can reach goal
results = torch.full((env.height, env.width), 0, dtype=torch.int8, requires_grad=False)
failed_cnt = 0
failed_locs = []
for i in range(env.height):
for j in range(env.width):
cur_loc = Loc(i, j)
# This position has known result.
if results[i, j]: continue
# This position is an obstacle.
if self._is_obstacle(env, cur_loc): continue
# This position is the goal.
if self._is_goal(env, cur_loc): continue
# A visited map, used to detect looping.
visited = torch.full((env.height, env.width), False, dtype=torch.bool, requires_grad=False)
# Agent trajectory.
path = []
while not self._is_out_of_boundary(env, cur_loc) \
and not self._is_obstacle(env, cur_loc) \
and not self._is_goal(env, cur_loc) \
and not visited[cur_loc.row, cur_loc.col] \
and not results[cur_loc.row, cur_loc.col]:
visited[cur_loc.row, cur_loc.col] = True
path.append(cur_loc)
                    # Cast the 0-dim tensor to int so move()'s comparisons work.
                    cur_loc = cur_loc.move(int(vec_map[cur_loc.row, cur_loc.col]))
                # If the agent ran off the map, hit an obstacle, entered a
                # loop, or reached a position already known to fail, mark
                # every position on the trajectory as failed (-1).
if self._is_out_of_boundary(env, cur_loc) \
or self._is_obstacle(env, cur_loc) \
or visited[cur_loc.row, cur_loc.col] \
or results[cur_loc.row, cur_loc.col] < 0:
for loc in path:
results[loc.row, loc.col] = -1
# Otherwise, the agent must have reached the goal. Mark all the
# positions in the trajectory as success (1).
else:
for loc in path:
results[loc.row, loc.col] = 1
# Count the number of failed positions and record them.
for i in range(env.height):
for j in range(env.width):
if results[i, j].item() < 0:
failed_cnt += 1
failed_locs.append(Loc(i, j))
return failed_cnt / (env.height * env.width), failed_locs
def _choose_train_env(self):
        '''
        Choose the environment with the lowest accuracy. Returns a tuple of
        (env, start_pos, fail_rate), or None when training should stop.
        '''
if not self.envs:
self._add_new_env()
fail_rates = []
fail_poses = []
# Check the fail rate and fail positions of all existing environments.
for env in self.envs:
fail_rate, fail_pos = self._find_env_failure(env)
fail_rates.append(fail_rate)
fail_poses.append(fail_pos)
worst_idx = np.argmax(fail_rates)
worst_rate = fail_rates[worst_idx]
# Check whether it's time to add more environments.
if 1 - worst_rate >= self.env_inc_acc:
# If we can still add more environments, add one and try again.
if len(self.envs) < self.max_env_num:
self._add_new_env()
self.train_state['inc_env'] = True
return self._choose_train_env()
            # We have reached the maximum number of environments; if every
            # environment's accuracy is above the final threshold, stop
            # the training.
if 1 - worst_rate >= self.env_final_acc:
return None
# Pick the worst performance environment.
worst_env = self.envs[worst_idx]
start_pos = np.random.choice(fail_poses[worst_idx])
return worst_env, start_pos, worst_rate
def test_envs(self):
'''
Return the fail rates of all environments.
'''
fail_rates = []
for env in self.envs:
fail_rate, _ = self._find_env_failure(env)
fail_rates.append(fail_rate)
return fail_rates
    def train_once(self):
        '''
        Run one training round: pick the worst environment, roll out one
        epsilon-greedy episode from a failing position, and (on success)
        update the network backwards along the recorded path. Returns the
        train_state dict.
        '''
        self.train_state = {
            'finish': False,
            'inc_env': False
        }
# Choose the worst environment to train.
chosen_env_tup = self._choose_train_env()
if chosen_env_tup is None:
self.train_state['finish'] = True
return self.train_state
env, start_pos, fail_rate = chosen_env_tup
self.train_state['fail_rate'] = fail_rate
env.reset_to(start_pos.row, start_pos.col)
        # Set epsilon based on the fail rate: epsilon grows with the fail
        # rate, and the 1/4 power keeps exploration relatively high even
        # when the fail rate is small.
        fail_rate **= 1/4
epsilon = self.epsilon_high * fail_rate + self.epsilon_low * (1 - fail_rate)
done = False
counter = 0
moves = []
imm_rewards = []
losses = []
# Walk through the map to reach the goal without updating the
# network. Record the path.
self.model.eval()
while counter < self.max_play_length and not done:
# For probability epsilon, choose a random action, otherwise
# choose the best action predicted by the network.
p = np.random.uniform(0, 1)
if p < epsilon:
choice = np.random.randint(0, 4)
            else:
                state = env.grid.clone().detach()
                states = torch.stack((state, state, state, state))
                actions = torch.stack(env.actions)
                with torch.no_grad():
                    preds = self.model(states, actions)
list_pred = [x.item() for x in preds]
max_pred = np.amax(list_pred)
max_positions = np.argwhere(
list_pred == max_pred).flatten().tolist()
choice = np.random.choice(max_positions)
# Take the step and save the immediate reward value.
_, imm_reward, done, _ = env.step(
choice, early_stop=False, q_learning=True)
moves.append(choice)
imm_rewards.append(imm_reward)
counter += 1
# If the agent successfully reached the goal, reversely update
# the network from the goal to the starting position.
if done:
self.model.train()
            # Special treatment for the very last step: reaching the goal
            # yields only a positive immediate reward and no future reward.
cur_action, cur_reward = moves.pop(), imm_rewards.pop()
rev_action = ReverseAction(cur_action)
env.step(rev_action, early_stop=False, q_learning=True)
state = env.grid.clone().detach()
state = state.view(1, *state.shape)
action_vec = env.actions[cur_action]
action_vec = action_vec.view(1, *action_vec.shape)
self.optimizer.zero_grad()
pred_reward = self.model(state, action_vec)
real_reward = torch.Tensor([cur_reward]).to(self.device)
real_reward = real_reward.view(1, *real_reward.shape)
loss = self.loss_func(pred_reward, real_reward)
            # Store the scalar value so we don't keep autograd graphs alive.
            losses.append(loss.item())
loss.backward()
self.optimizer.step()
            # Calculate the new predicted future reward for updating the
            # previous step (batched over the four actions).
            with torch.no_grad():
                state = env.grid.clone().detach()
                states = torch.stack((state, state, state, state))
                actions = torch.stack(env.actions)
                next_preds = self.model(states, actions)
                future_reward = next_preds.max().item()
# Go reversely through the path and update the network
# along the way.
while len(moves) > 0 and len(imm_rewards) > 0:
# Take a step backward and update the network.
cur_action, cur_reward = moves.pop(), imm_rewards.pop()
rev_action = ReverseAction(cur_action)
                # A reward of 0 means the step did not run into a boundary
                # or an obstacle, so the agent actually moved and we must
                # step back; otherwise the agent stayed put and no reverse
                # step is needed.
if cur_reward == 0:
env.step(rev_action, early_stop=False, q_learning=True)
state = env.grid.clone().detach()
state = state.view(1, *state.shape)
action_vec = env.actions[cur_action]
action_vec = action_vec.view(1, *action_vec.shape)
self.optimizer.zero_grad()
pred_reward = self.model(state, action_vec)
real_reward = cur_reward + self.gamma * future_reward
real_reward = torch.Tensor([real_reward]).to(self.device)
real_reward = real_reward.view(1, *real_reward.shape)
loss = self.loss_func(pred_reward, real_reward)
                losses.append(loss.item())
loss.backward()
self.optimizer.step()
                # Calculate the new predicted future reward for updating the
                # previous step (batched over the four actions).
                with torch.no_grad():
                    state = env.grid.clone().detach()
                    states = torch.stack((state, state, state, state))
                    actions = torch.stack(env.actions)
                    next_preds = self.model(states, actions)
                    future_reward = next_preds.max().item()
        # Expose this round's losses to the caller.
        self.train_state['losses'] = losses
        return self.train_state
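# A minimal usage sketch (hypothetical -- `QNet` stands in for any nn.Module
# whose forward(states, actions) returns one Q-value per state/action pair,
# which is how self.model is invoked throughout this file):
#
#     model = QNet()
#     trainer = ProgressiveTrainer(model, height=10, width=10, num_obstacle=10)
#     state = trainer.train_once()
#     while not state['finish']:
#         state = trainer.train_once()
#     print('final fail rates:', trainer.test_envs())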