#!/usr/bin/env python3
"""
Refer to the work of OpenAI and DeepMind.
Algorithm:
OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]
Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
Dependencies:
tensorflow
gym
gym_OptClang
Thanks to MorvanZhou's implementation: https://morvanzhou.github.io/tutorials
The basic structure is derived from him.
However, the internal structure is tuned for gym_OptClang.
"""
import tensorflow as tf
import numpy as np
import matplotlib
# do not use x-server
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import gym, gym_OptClang
import random, threading, queue, operator, os, sys, re
from operator import itemgetter
from random import shuffle
from colorama import Fore, Style
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import time
import io
from time import gmtime, strftime
import argparse
import pytz
import Helpers as hp
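# A minimal construction sketch for inference (the env id and hyper-parameter values
# below are illustrative, not the project's actual settings; the real driver and
# argument parsing live outside this file):
#
#   env = gym.make('OptClang-v0')  # hypothetical env id registered by gym_OptClang
#   agent = PPO(env, ckptLocBase='./ckpt', ckptName='model.ckpt', isTraining='N',
#               EP_MAX=0, GAMMA=0.9, A_LR=1e-4, C_LR=2e-4, ClippingEpsilon=0.2,
#               UpdateDepth=10, L1Neurons=64, L2Neurons=32)  # SharedStorage=None -> inference only
#   action = agent.choose_action(observation, PassHistory={})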
class PPO(object):
def __init__(self, env, ckptLocBase, ckptName, isTraining, EP_MAX, GAMMA, A_LR, C_LR, ClippingEpsilon, UpdateDepth, L1Neurons, L2Neurons, LR_DECAY=1, LR_DECAY_FREQ=1000,SharedStorage=None):
tf.reset_default_graph()
# if SharedStorage is None, it must be in inference mode without "update()"
self.SharedStorage = SharedStorage
self.EP_MAX = EP_MAX
self.GAMMA = GAMMA
self.A_LR = A_LR
self.C_LR = C_LR
self.LR_DECAY = LR_DECAY
self.LR_DECAY_FREQ = LR_DECAY_FREQ
self.ClippingEpsilon = ClippingEpsilon
self.UpdateDepth = UpdateDepth
self.L1Neurons = L1Neurons
self.L2Neurons = L2Neurons
self.S_DIM = len(env.observation_space.low)
self.A_DIM = env.action_space.n
self.A_SPACE = 1
self.sess = tf.Session(graph=tf.get_default_graph())
self.tfs = tf.placeholder(tf.float32, [None, self.S_DIM], 'state')
self.ckptLocBase = ckptLocBase
self.UpdateStepFile = self.ckptLocBase + '/UpdateStep'
self.ActorLrFile = self.ckptLocBase + '/ActorLrFile'
        self.CriticLrFile = self.ckptLocBase + '/CriticLrFile'
hp.ColorPrint(Fore.LIGHTCYAN_EX, "Log dir={}".format(self.ckptLocBase))
self.ckptLoc = ckptLocBase + '/' + ckptName
self.UpdateStep = 0
if not os.path.exists(self.ckptLocBase):
os.makedirs(self.ckptLocBase)
if os.path.exists(self.UpdateStepFile):
with open(self.UpdateStepFile, 'r') as f:
self.UpdateStep = int(f.read())
hp.ColorPrint(Fore.GREEN, "Restored episode step={}".format(self.UpdateStep))
if os.path.exists(self.ActorLrFile):
with open(self.ActorLrFile, 'r') as f:
self.A_LR = float(f.read())
hp.ColorPrint(Fore.GREEN, "Restored A_LR={}".format(self.A_LR))
else:
with open(self.ActorLrFile, 'w') as f:
f.write(str(self.A_LR))
if os.path.exists(self.CriticLrFile):
with open(self.CriticLrFile, 'r') as f:
self.C_LR = float(f.read())
hp.ColorPrint(Fore.GREEN, "Restored C_LR={}".format(self.C_LR))
else:
with open(self.CriticLrFile, 'w') as f:
f.write(str(self.C_LR))
if isTraining == 'N':
self.isTraining = False
            hp.ColorPrint(Fore.LIGHTCYAN_EX, "This is the inference procedure")
else:
self.isTraining = True
            hp.ColorPrint(Fore.LIGHTCYAN_EX, "This is the training procedure with UpdateStep={}".format(self.UpdateStep))
# critic
with tf.variable_scope('Critic'):
with tf.variable_scope('Fully_Connected'):
l1 = self.add_layer(self.tfs, self.L1Neurons, activation_function=tf.nn.relu, norm=True)
if self.L2Neurons != 0:
l2 = self.add_layer(l1, self.L2Neurons, activation_function=tf.nn.relu, norm=True)
with tf.variable_scope('Value'):
if self.L2Neurons != 0:
self.v = tf.layers.dense(l2, 1)
else:
self.v = tf.layers.dense(l1, 1)
with tf.variable_scope('Loss'):
self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
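                # advantage estimate A_t = R_t - V(s_t), where R_t is the discounted return
                # computed by the workers; the critic minimizes its mean squared error below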
self.advantage = self.tfdc_r - self.v
self.closs = tf.reduce_mean(tf.square(self.advantage))
self.CriticLossSummary = tf.summary.scalar('CriticLoss', self.closs)
with tf.variable_scope('CriticTrain'):
self.ctrain_op = tf.train.AdamOptimizer(self.C_LR).minimize(self.closs)
# pi: act_probs
pi, pi_params = self._build_anet('Actor', trainable=True)
oldpi, oldpi_params = self._build_anet('oldActor', trainable=False)
# operation of choosing action
with tf.variable_scope('ActionsExp.'):
self.acts_expect = tf.squeeze(pi, axis=0)
with tf.variable_scope('Update'):
self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
with tf.variable_scope('Actor/PPO-Loss'):
self.tfa = tf.placeholder(tf.int32, [None, 1], 'action')
self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
            # probability of the action the agent actually took under the current policy;
            # one_hot depth = pi.shape[1] (the number of actions), i.e. each row of pi is
            # a probability vector over actions
act_probs = pi * tf.one_hot(indices=self.tfa, depth=pi.shape[1])
act_probs = tf.reduce_sum(act_probs, axis=1)
            # probability of the same action under the old (pre-update) policy
act_probs_old = oldpi * tf.one_hot(indices=self.tfa, depth=oldpi.shape[1])
act_probs_old = tf.reduce_sum(act_probs_old, axis=1)
# add a small number to avoid NaN
#ratio = tf.divide(act_probs + 1e-10, act_probs_old + 1e-10)
ratio = tf.exp(tf.log(act_probs + 1e-10) - tf.log(act_probs_old + 1e-10))
surr = tf.multiply(ratio, self.tfadv)
clip = tf.clip_by_value(ratio, 1.-self.ClippingEpsilon, 1.+self.ClippingEpsilon)*self.tfadv
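            # PPO-Clip surrogate: maximize E[min(r_t(theta) * A_t, clip(r_t(theta), 1-eps, 1+eps) * A_t)];
            # the mean is negated below so that AdamOptimizer can minimize it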
# clipped surrogate objective
self.aloss = -tf.reduce_mean(tf.minimum(surr, clip))
# visualizing
self.ppoRatioSummary = tf.summary.tensor_summary('ppoRatio', ratio)
self.ActorLossSummary = tf.summary.scalar('ActorLoss', self.aloss)
with tf.variable_scope('ActorTrain'):
self.atrain_op = tf.train.AdamOptimizer(self.A_LR).minimize(self.aloss)
with tf.variable_scope('Summary'):
self.OverallSpeedup = tf.placeholder(tf.float32, name='OverallSpeedup')
self.EpisodeReward = tf.placeholder(tf.float32, name='EpisodeReward')
self.one = tf.constant(1.0, dtype=tf.float32)
self.RecordSpeedup_op = tf.multiply(self.OverallSpeedup, self.one)
self.SpeedupSummary = tf.summary.scalar('OverallSpeedup', self.RecordSpeedup_op)
self.RecordEpiReward_op = tf.multiply(self.EpisodeReward, self.one)
self.EpiRewardSummary = tf.summary.scalar('EpisodeReward', self.RecordEpiReward_op)
self.writer = tf.summary.FileWriter(self.ckptLocBase, self.sess.graph)
self.sess.run(tf.global_variables_initializer())
self.saver = tf.train.Saver()
'''
        If the checkpoint exists, restore it.
'''
if tf.train.checkpoint_exists(self.ckptLoc):
#self.saver.restore(self.sess, self.ckptLoc)
self.saver.restore(self.sess, tf.train.latest_checkpoint(self.ckptLocBase))
hp.ColorPrint(Fore.LIGHTGREEN_EX, 'Restore the previous model.')
elif self.isTraining == False:
hp.ColorPrint(Fore.LIGHTRED_EX, "Missing trained model to inference, exit.")
sys.exit(1)
def save(self):
"""
Save model
"""
self.saver.save(self.sess, self.ckptLoc)
def update(self):
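        """
        Global updater thread.
        Waits until the workers have collected a batch, copies pi -> oldpi, then runs
        UpdateDepth gradient steps on the actor and critic before letting the workers
        collect again. Also handles periodic checkpointing and learning-rate decay.
        """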
while not self.SharedStorage['Coordinator'].should_stop():
if self.SharedStorage['Counters']['ep'] < self.EP_MAX:
                    # block until the workers have collected a batch of data
self.SharedStorage['Events']['update'].wait()
# save the model
if self.UpdateStep % 50 == 0:
self.save()
                        hp.ColorPrint(Fore.LIGHTRED_EX, "Saving the model (every 50 updates).")
else:
hp.ColorPrint(Fore.LIGHTBLUE_EX,
"This update does not need to be saved: {}".format(self.UpdateStep))
# learning rate decay
if self.UpdateStep % self.LR_DECAY_FREQ == (self.LR_DECAY_FREQ-1):
# decay
self.A_LR = self.A_LR * self.LR_DECAY
self.C_LR = self.C_LR * self.LR_DECAY
# save
with open(self.ActorLrFile, 'w') as f:
f.write(str(self.A_LR))
with open(self.CriticLrFile, 'w') as f:
f.write(str(self.C_LR))
hp.ColorPrint(Fore.LIGHTRED_EX,
"Decay LR: A_LR={}, C_LR={}".format(self.A_LR, self.C_LR))
# copy pi to old pi
self.sess.run(self.update_oldpi_op)
# collect data from all workers
data = [self.SharedStorage['DataQueue'].get() for _ in range(self.SharedStorage['DataQueue'].qsize())]
data = np.vstack(data)
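                    # each stacked row is [state (S_DIM) | action (A_SPACE) | discounted return (1)]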
s, a, r = data[:, :self.S_DIM], data[:, self.S_DIM: self.S_DIM + self.A_SPACE], data[:, -1:]
adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
                    # update the actor and critic for UpdateDepth steps
for _ in range(self.UpdateDepth):
self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv})
self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r})
'''
write summary
'''
# actor and critic loss
result = self.sess.run(
tf.summary.merge([self.ActorLossSummary, self.CriticLossSummary,
self.ppoRatioSummary]),
feed_dict={self.tfs: s, self.tfa: a, self.tfadv: adv, self.tfdc_r: r})
self.writer.add_summary(result, self.UpdateStep)
self.UpdateStep += 1
                    # persist the step so that re-training does not overwrite earlier summaries
with open(self.UpdateStepFile, 'w') as f:
f.write(str(self.UpdateStep))
# updating finished
self.SharedStorage['Events']['update'].clear()
self.SharedStorage['Locks']['counter'].acquire()
# reset counter
self.SharedStorage['Counters']['update_counter'] = 0
self.SharedStorage['Locks']['counter'].release()
                    # signal the workers that collecting is available again
self.SharedStorage['Events']['collect'].set()
        hp.ColorPrint(Fore.YELLOW, 'Updater stopped')
    def add_layer(self, inputs, out_size, trainable=True, activation_function=None, norm=False):
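        """
        Build one fully connected layer: activation(BatchNorm(inputs @ W + b)).
        "norm" toggles batch normalization; "trainable" is set to False for the frozen
        old-policy network so its variables are excluded from gradient updates.
        """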
in_size = inputs.get_shape().as_list()[1]
Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=1.0, stddev=1.0), trainable=trainable)
biases = tf.Variable(tf.zeros([1, out_size]) + 0.1, trainable=trainable)
# fully connected product
Wx_plus_b = tf.matmul(inputs, Weights) + biases
# normalize fully connected product
if norm:
# Batch Normalize
Wx_plus_b = tf.contrib.layers.batch_norm(
Wx_plus_b, updates_collections=None, is_training=self.isTraining)
# activation
if activation_function is None:
outputs = Wx_plus_b
else:
with tf.variable_scope('ActivationFunction'):
outputs = activation_function(Wx_plus_b)
return outputs
def _build_anet(self, name, trainable):
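        """
        Build a policy network under variable scope "name": shared fully connected
        layers followed by a softmax over the A_DIM discrete actions. Returns the
        action-probability tensor and the scope's variables (used to sync pi -> oldpi).
        """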
with tf.variable_scope(name):
with tf.variable_scope('Fully_Connected'):
                l1 = self.add_layer(self.tfs, self.L1Neurons, trainable, activation_function=tf.nn.relu, norm=True)
                if self.L2Neurons != 0:
                    l2 = self.add_layer(l1, self.L2Neurons, trainable, activation_function=tf.nn.relu, norm=True)
with tf.variable_scope('Action_Expectation'):
                # softmax may lead to NaN
                if self.L2Neurons != 0:
                    # pass "trainable" through so the frozen old policy's output layer
                    # is also excluded from training
                    expectation = \
                        self.add_layer(l2, self.A_DIM, trainable, activation_function=tf.nn.softmax, norm=True)
                else:
                    expectation = \
                        self.add_layer(l1, self.A_DIM, trainable, activation_function=tf.nn.softmax, norm=True)
params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
return expectation, params
def choose_action(self, s, PassHistory):
"""
        Return an int in [0, 33].
        Input "s" must be a numpy array.
        On the reinforcement-learning side, the action space runs from 0 to 33,
        whereas the modified clang accepts passes numbered 1 to 34.
        "gym-OptClang" already handles this conversion for us, so we do not need to
        worry about it here. However, if you use the model without gym-OptClang,
        you have to convert the indices yourself (see the inference example in our examples).
"""
s = s[np.newaxis, :]
a_expect = self.sess.run(self.acts_expect, {self.tfs: s})
print(a_expect)
'''
choose the one that was not applied yet
'''
        # split the probabilities into a list of [index, probability] pairs
aList = a_expect.tolist()
probList = []
idx = 0
for prob in aList:
probList.append([idx, prob])
idx += 1
        # Some probabilities may be equal; shuffle so that we do not always
        # choose the same action among ties.
if self.isTraining == True:
shuffle(probList)
# sort with probs in descending order
probList.sort(key=itemgetter(1), reverse=True)
# find the one that is not applied yet
idx = 0
while True:
'''
            During training, we need some chance of taking an unexpected action so that
            the agent is exposed to as many different conditions as possible.
'''
# Use different strategies for different situations
if self.isTraining == True:
prob = random.uniform(0, 1)
if prob < 0.8:
                    # the most probable remaining action
PassIdx = probList[idx][0]
idx += 1
else:
# random action
PassIdx = np.random.choice(np.arange(self.A_DIM))
else:
PassIdx = probList[idx][0]
idx += 1
#print('PassIdx={} with {} prob'.format(PassIdx, actionProb[1]))
if PassIdx not in PassHistory:
PassHistory[PassIdx] = 'Used'
return PassIdx
        # unreachable: the loop above always returns
return 'Error'
def get_v(self, s):
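        """
        Return the critic's value estimate V(s) as a scalar.
        Adds a batch dimension if "s" is a single state.
        """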
if s.ndim < 2: s = s[np.newaxis, :]
return self.sess.run(self.v, {self.tfs: s})[0, 0]
def DrawToTf(self, speedup, overall_reward, step):
"""
This is not thread-safe
"""
try:
result = self.sess.run(
tf.summary.merge([self.SpeedupSummary, self.EpiRewardSummary]),
feed_dict={self.OverallSpeedup: speedup,
self.EpisodeReward: overall_reward})
self.writer.add_summary(result, step)
with open(self.ckptLocBase + '/EpiStepFile', 'w') as f:
f.write(str(step))
self.writer.flush()
except Exception as e:
            hp.ColorPrint(Fore.LIGHTRED_EX, "SpeedupSummary or EpiRewardSummary failed: {}".format(e))