def __init__(self, env, ckptLocBase, ckptName, isTraining, EP_MAX, GAMMA,
             A_LR, C_LR, ClippingEpsilon, UpdateDepth, L1Neurons, L2Neurons,
             LR_DECAY=1, LR_DECAY_FREQ=1000, SharedStorage=None):
    tf.reset_default_graph()
    # if SharedStorage is None, it must be in inference mode without "update()"
    self.SharedStorage = SharedStorage
    self.EP_MAX = EP_MAX
    self.GAMMA = GAMMA
    self.A_LR = A_LR
    self.C_LR = C_LR
    self.LR_DECAY = LR_DECAY
    self.LR_DECAY_FREQ = LR_DECAY_FREQ
    self.ClippingEpsilon = ClippingEpsilon
    self.UpdateDepth = UpdateDepth
    self.L1Neurons = L1Neurons
    self.L2Neurons = L2Neurons
    self.S_DIM = len(env.observation_space.low)
    self.A_DIM = env.action_space.n
    self.A_SPACE = 1
    self.sess = tf.Session(graph=tf.get_default_graph())
    self.tfs = tf.placeholder(tf.float32, [None, self.S_DIM], 'state')
    self.ckptLocBase = ckptLocBase
    self.UpdateStepFile = self.ckptLocBase + '/UpdateStep'
    self.ActorLrFile = self.ckptLocBase + '/ActorLrFile'
    self.CriticLrFile = self.ckptLocBase + '/CriticLrFile'
    hp.ColorPrint(Fore.LIGHTCYAN_EX, "Log dir={}".format(self.ckptLocBase))
    self.ckptLoc = ckptLocBase + '/' + ckptName
    self.UpdateStep = 0
    if not os.path.exists(self.ckptLocBase):
        os.makedirs(self.ckptLocBase)
    # restore the update step saved by a previous run, if any
    if os.path.exists(self.UpdateStepFile):
        with open(self.UpdateStepFile, 'r') as f:
            self.UpdateStep = int(f.read())
        hp.ColorPrint(Fore.GREEN,
                      "Restored episode step={}".format(self.UpdateStep))
    # restore (or initialize) the actor learning rate
    if os.path.exists(self.ActorLrFile):
        with open(self.ActorLrFile, 'r') as f:
            self.A_LR = float(f.read())
        hp.ColorPrint(Fore.GREEN, "Restored A_LR={}".format(self.A_LR))
    else:
        with open(self.ActorLrFile, 'w') as f:
            f.write(str(self.A_LR))
    # restore (or initialize) the critic learning rate
    if os.path.exists(self.CriticLrFile):
        with open(self.CriticLrFile, 'r') as f:
            self.C_LR = float(f.read())
        hp.ColorPrint(Fore.GREEN, "Restored C_LR={}".format(self.C_LR))
    else:
        with open(self.CriticLrFile, 'w') as f:
            f.write(str(self.C_LR))
    if isTraining == 'N':
        self.isTraining = False
        hp.ColorPrint(Fore.LIGHTCYAN_EX, "This is an inference procedure")
    else:
        self.isTraining = True
        hp.ColorPrint(
            Fore.LIGHTCYAN_EX,
            "This is a training procedure with UpdateStep={}".format(
                self.UpdateStep))

    # critic
    with tf.variable_scope('Critic'):
        with tf.variable_scope('Fully_Connected'):
            l1 = self.add_layer(self.tfs,
                                self.L1Neurons,
                                activation_function=tf.nn.relu,
                                norm=True)
            if self.L2Neurons != 0:
                l2 = self.add_layer(l1,
                                    self.L2Neurons,
                                    activation_function=tf.nn.relu,
                                    norm=True)
        with tf.variable_scope('Value'):
            if self.L2Neurons != 0:
                self.v = tf.layers.dense(l2, 1)
            else:
                self.v = tf.layers.dense(l1, 1)
        with tf.variable_scope('Loss'):
            self.tfdc_r = tf.placeholder(tf.float32, [None, 1],
                                         'discounted_r')
            self.advantage = self.tfdc_r - self.v
            self.closs = tf.reduce_mean(tf.square(self.advantage))
            self.CriticLossSummary = tf.summary.scalar('CriticLoss',
                                                       self.closs)
        with tf.variable_scope('CriticTrain'):
            self.ctrain_op = tf.train.AdamOptimizer(self.C_LR).minimize(
                self.closs)

    # pi: act_probs
    pi, pi_params = self._build_anet('Actor', trainable=True)
    oldpi, oldpi_params = self._build_anet('oldActor', trainable=False)

    # operation of choosing an action
    with tf.variable_scope('ActionsExp.'):
        self.acts_expect = tf.squeeze(pi, axis=0)
    with tf.variable_scope('Update'):
        self.update_oldpi_op = [
            oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)
        ]

    with tf.variable_scope('Actor/PPO-Loss'):
        self.tfa = tf.placeholder(tf.int32, [None, 1], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
        # probabilities of the actions the current policy took
        # depth=pi.shape[0] <-- each column is viewed as a vector
        # depth=pi.shape[1] <-- each row is viewed as a vector <-- we use this
        act_probs = pi * tf.one_hot(indices=self.tfa, depth=pi.shape[1])
        act_probs = tf.reduce_sum(act_probs, axis=1)
        # probabilities of the actions the old policy took
        act_probs_old = oldpi * tf.one_hot(indices=self.tfa,
                                           depth=oldpi.shape[1])
        act_probs_old = tf.reduce_sum(act_probs_old, axis=1)
        # add a small number to avoid NaN
        #ratio = tf.divide(act_probs + 1e-10, act_probs_old + 1e-10)
        ratio = tf.exp(
            tf.log(act_probs + 1e-10) - tf.log(act_probs_old + 1e-10))
        surr = tf.multiply(ratio, self.tfadv)
        clip = tf.clip_by_value(ratio, 1. - self.ClippingEpsilon,
                                1. + self.ClippingEpsilon) * self.tfadv
        # clipped surrogate objective
        self.aloss = -tf.reduce_mean(tf.minimum(surr, clip))
        # visualizing
        self.ppoRatioSummary = tf.summary.tensor_summary('ppoRatio', ratio)
        self.ActorLossSummary = tf.summary.scalar('ActorLoss', self.aloss)

    with tf.variable_scope('ActorTrain'):
        self.atrain_op = tf.train.AdamOptimizer(self.A_LR).minimize(
            self.aloss)

    with tf.variable_scope('Summary'):
        self.OverallSpeedup = tf.placeholder(tf.float32,
                                             name='OverallSpeedup')
        self.EpisodeReward = tf.placeholder(tf.float32,
                                            name='EpisodeReward')
        self.one = tf.constant(1.0, dtype=tf.float32)
        self.RecordSpeedup_op = tf.multiply(self.OverallSpeedup, self.one)
        self.SpeedupSummary = tf.summary.scalar('OverallSpeedup',
                                                self.RecordSpeedup_op)
        self.RecordEpiReward_op = tf.multiply(self.EpisodeReward, self.one)
        self.EpiRewardSummary = tf.summary.scalar('EpisodeReward',
                                                  self.RecordEpiReward_op)

    self.writer = tf.summary.FileWriter(self.ckptLocBase, self.sess.graph)
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()
    '''
    If the ckpt exists, restore it.
    '''
    if tf.train.checkpoint_exists(self.ckptLoc):
        #self.saver.restore(self.sess, self.ckptLoc)
        self.saver.restore(self.sess,
                           tf.train.latest_checkpoint(self.ckptLocBase))
        hp.ColorPrint(Fore.LIGHTGREEN_EX, 'Restored the previous model.')
    elif not self.isTraining:
        hp.ColorPrint(Fore.LIGHTRED_EX,
                      "Missing trained model for inference, exiting.")
        sys.exit(1)
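
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original class): the 'Actor/PPO-Loss'
# scope above builds the standard PPO clipped-surrogate objective,
#   aloss = -mean(min(ratio * adv, clip(ratio, 1-eps, 1+eps) * adv)).
# A minimal NumPy version of the same computation is sketched below; the
# function name and arguments are hypothetical and only mirror the graph ops,
# assuming `ratio` and `advantage` are 1-D arrays of equal length.
import numpy as np


def clipped_surrogate_loss(ratio, advantage, clipping_epsilon):
    """Negated PPO-Clip objective for one batch (illustration only)."""
    surrogate = ratio * advantage
    clipped = np.clip(ratio, 1.0 - clipping_epsilon,
                      1.0 + clipping_epsilon) * advantage
    return -np.mean(np.minimum(surrogate, clipped))


# e.g. clipped_surrogate_loss(np.array([1.3, 0.7]), np.array([0.5, -0.2]), 0.2)
# ---------------------------------------------------------------------------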
def work(self):
    while not self.SharedStorage['Coordinator'].should_stop():
        states, ResetInfo = self.env.reset()
        EpisodeReward = 0
        buffer_s, buffer_a, buffer_r = {}, {}, {}
        MeanSigmaDict = calc.getCpuMeanSigmaInfo()
        FirstEpi = True
        PassHistory = {}
        while True:
            # while the global PPO is updating
            if not self.SharedStorage['Events']['collect'].is_set():
                # wait until PPO is updated
                self.SharedStorage['Events']['collect'].wait()
            '''
            Save the last profiled info to calculate real rewards
            '''
            try:
                if FirstEpi:
                    oldCycles = ResetInfo["TotalCyclesStat"]
                    oldInfo = ResetInfo
                    FirstEpi = False
                    isUsageNotProcessed = True
                else:
                    oldCycles = info["TotalCyclesStat"]
                    oldInfo = oldAllUsage
                    isUsageNotProcessed = False
            except Exception as e:
                hp.ColorPrint(
                    Fore.RED,
                    "Exception happened and skipped: \n{}".format(e))
                break
            if type(oldCycles) is not int:
                hp.ColorPrint(Fore.RED,
                              "oldCycles is not an int; skip this episode.")
                break
            '''
            Choose the features from the most influential function
            '''
            state = calc.getMostInfluentialState(states, ResetInfo)
            action = self.ppo.choose_action(state, PassHistory)
            nextStates, reward, done, info = self.env.step(action)
            '''
            If the build failed, skip it.
            '''
            if reward < 0:
                # clear history of applied passes
                PassHistory = {}
                hp.ColorPrint(
                    Fore.RED,
                    'WorkerID={} env.step() failed. Use a new target and forget these memories'
                    .format(self.wid))
                break
            try:
                '''
                Calculate actual rewards for all functions.
                Note: the info may lose some parts; re-send or abort this iteration.
                '''
                rewards, oldAllUsage = calc.calcEachReward(
                    info, MeanSigmaDict, nextStates, oldInfo, oldCycles,
                    isUsageNotProcessed)
                '''
                Speedup for tf.summary.
                Skip this iteration if the speedup/slowdown is not obvious.
                '''
                speedup = calc.calcOverallSpeedup(ResetInfo, info)
                """
                if abs(speedup) < 0.0:
                    # This results in bad evaluation.
                    hp.ColorPrint(Fore.RED, "WorkerID={}, Speedup={} --> skip this iteration".format(self.wid, speedup))
                    if done:
                        PassHistory = {}
                        break
                    else:
                        states = nextStates
                        continue
                """
                '''
                Match the states and rewards
                '''
                AddedCount = calc.appendStateRewards(buffer_s, buffer_a,
                                                     buffer_r, states,
                                                     rewards, action)
                '''
                Calculate the overall reward for the summary
                '''
                EpisodeReward = calc.calcEpisodeReward(rewards)
            except Exception as e:
                hp.ColorPrint(
                    Fore.LIGHTRED_EX,
                    "Exception for receiving incomplete data\n{}".format(e))
                if done:
                    # This may lose some data for training.
                    PassHistory = {}
                    break
                else:
                    states = nextStates
                    continue
            # add the generated results
            self.SharedStorage['Locks']['counter'].acquire()
            self.SharedStorage['Counters']['update_counter'] = \
                self.SharedStorage['Counters']['update_counter'] + AddedCount
            self.SharedStorage['Locks']['counter'].release()
            if self.SharedStorage['Counters']['update_counter'] >= MIN_BATCH_SIZE or done:
                '''
                Calculate discounted rewards for all functions
                '''
                discounted_r = calc.calcDiscountedRewards(
                    buffer_r, nextStates, self.ppo)
                '''
                Convert dicts of lists into row-arrays
                '''
                vstack_s, vstack_a, vstack_r = calc.DictToVstack(
                    buffer_s, buffer_a, discounted_r)
                '''
                Remove data that are not important in the batch
                '''
                vstack_s, vstack_a, vstack_r, delCount = \
                    calc.RemoveTrivialData(vstack_s, vstack_a, vstack_r,
                                           AbandonRatio=20)
                hp.ColorPrint(
                    Fore.GREEN,
                    "Threw away {} data in this batch".format(delCount))
                self.SharedStorage['Locks']['counter'].acquire()
                self.SharedStorage['Counters']['update_counter'] = \
                    self.SharedStorage['Counters']['update_counter'] - delCount
                self.SharedStorage['Locks']['counter'].release()
                '''
                Split each vector and assemble the parts into a queue element.
                '''
                self.SharedStorage['Locks']['queue'].acquire()
                # put data in the shared queue
                for index, item in enumerate(vstack_s):
                    self.SharedStorage['DataQueue'].put(
                        np.hstack((vstack_s[index], vstack_a[index],
                                   vstack_r[index])))
                self.SharedStorage['Locks']['queue'].release()
                buffer_s, buffer_a, buffer_r = {}, {}, {}
                # stop collecting data
                self.SharedStorage['Events']['collect'].clear()
                # trigger the global PPO update
                self.SharedStorage['Events']['update'].set()
                if self.SharedStorage['Counters']['ep'] >= EP_MAX:
                    # stop training
                    self.SharedStorage['Coordinator'].request_stop()
                    hp.ColorPrint(
                        Fore.RED,
                        'WorkerID={} calls to stop'.format(self.wid))
                    break
            if not done:
                states = nextStates
                continue
            else:
                # clear history of applied passes
                PassHistory = {}
                # record reward changes; plot later
                self.SharedStorage['Locks']['plot_epi'].acquire()
                # add episode count
                self.SharedStorage['Counters']['ep'] += 1
                '''
                draw to TensorBoard
                '''
                self.ppo.DrawToTf(speedup, EpisodeReward,
                                  self.SharedStorage['Counters']['ep'])
                self.SharedStorage['Locks']['plot_epi'].release()
                msg = '{0:}/{1:} ({2:.1f}%)'.format(
                    self.SharedStorage['Counters']['ep'], EP_MAX,
                    self.SharedStorage['Counters']['ep'] / EP_MAX * 100)
                msg += ' | WorkerID={}'.format(self.wid)
                msg += '\nEpisodeReward: {0:.4f}'.format(EpisodeReward)
                msg += ' | OverallSpeedup: {}'.format(speedup)
                hp.ColorPrint(Fore.GREEN, msg)
                break
    hp.ColorPrint(Fore.YELLOW, 'WorkerID={} stopped'.format(self.wid))
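
# ---------------------------------------------------------------------------
# Illustrative sketch (assumption): calc.calcDiscountedRewards() is not shown
# in this section. A conventional way to produce the discounted returns
# consumed above is a backward pass seeded with the critic's value of the last
# state. The helper below is hypothetical and only demonstrates that textbook
# recurrence (R_t = r_t + GAMMA * R_{t+1}); the project's own implementation
# may differ.
def discounted_returns_sketch(rewards, bootstrap_value, gamma):
    """Backward discounted-return pass over a list of per-step rewards."""
    returns = []
    running = bootstrap_value
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns
# ---------------------------------------------------------------------------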
def update(self):
    while not self.SharedStorage['Coordinator'].should_stop():
        if self.SharedStorage['Counters']['ep'] < self.EP_MAX:
            # block until a batch of data is available
            self.SharedStorage['Events']['update'].wait()
            # save the model every 50 updates
            if self.UpdateStep % 50 == 0:
                self.save()
                hp.ColorPrint(Fore.LIGHTRED_EX, "Save for every 50 updates.")
            else:
                hp.ColorPrint(
                    Fore.LIGHTBLUE_EX,
                    "This update does not need to be saved: {}".format(
                        self.UpdateStep))
            # learning rate decay
            if self.UpdateStep % self.LR_DECAY_FREQ == (self.LR_DECAY_FREQ - 1):
                # decay
                self.A_LR = self.A_LR * self.LR_DECAY
                self.C_LR = self.C_LR * self.LR_DECAY
                # save the decayed learning rates
                with open(self.ActorLrFile, 'w') as f:
                    f.write(str(self.A_LR))
                with open(self.CriticLrFile, 'w') as f:
                    f.write(str(self.C_LR))
                hp.ColorPrint(
                    Fore.LIGHTRED_EX,
                    "Decay LR: A_LR={}, C_LR={}".format(self.A_LR, self.C_LR))
            # copy pi to old pi
            self.sess.run(self.update_oldpi_op)
            # collect data from all workers
            data = [
                self.SharedStorage['DataQueue'].get()
                for _ in range(self.SharedStorage['DataQueue'].qsize())
            ]
            data = np.vstack(data)
            s, a, r = data[:, :self.S_DIM], \
                data[:, self.S_DIM:self.S_DIM + self.A_SPACE], \
                data[:, -1:]
            adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
            # update actor and critic in an update loop
            for _ in range(self.UpdateDepth):
                self.sess.run(self.atrain_op, {
                    self.tfs: s,
                    self.tfa: a,
                    self.tfadv: adv
                })
                self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r})
            '''
            write summary
            '''
            # actor and critic loss
            result = self.sess.run(
                tf.summary.merge([
                    self.ActorLossSummary, self.CriticLossSummary,
                    self.ppoRatioSummary
                ]),
                feed_dict={
                    self.tfs: s,
                    self.tfa: a,
                    self.tfadv: adv,
                    self.tfdc_r: r
                })
            self.writer.add_summary(result, self.UpdateStep)
            self.UpdateStep += 1
            # persist the step so re-training does not overlap the summaries
            with open(self.UpdateStepFile, 'w') as f:
                f.write(str(self.UpdateStep))
            # updating finished
            self.SharedStorage['Events']['update'].clear()
            self.SharedStorage['Locks']['counter'].acquire()
            # reset counter
            self.SharedStorage['Counters']['update_counter'] = 0
            self.SharedStorage['Locks']['counter'].release()
            # allow collecting again
            self.SharedStorage['Events']['collect'].set()
    hp.ColorPrint(Fore.YELLOW, 'Updater stopped')
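
# ---------------------------------------------------------------------------
# Illustrative sketch (dummy values, hypothetical sizes): every element the
# workers put into SharedStorage['DataQueue'] is a flat row laid out as
# [state | action | discounted_return], i.e. S_DIM + A_SPACE + 1 columns, and
# update() recovers the three parts with exactly the slices used above.
import numpy as np


def _queue_row_roundtrip_sketch():
    """Build one dummy queue row and split it the way update() does."""
    S_DIM, A_SPACE = 3, 1  # hypothetical sizes for illustration
    row = np.hstack((np.array([0.1, 0.2, 0.3]),  # state features
                     np.array([4.0]),            # chosen action index
                     np.array([1.5])))           # discounted return
    data = np.vstack([row])
    s = data[:, :S_DIM]
    a = data[:, S_DIM:S_DIM + A_SPACE]
    r = data[:, -1:]
    assert s.shape == (1, 3) and a.shape == (1, 1) and r.shape == (1, 1)
    return s, a, r
# ---------------------------------------------------------------------------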
    '-t',
    '--training',
    type=str,
    nargs='?',
    default='Y',
    help='Will this run be a training procedure?\n"Y"=Training, "N"=Inference',
    required=False)
args = vars(parser.parse_args())

# restore the episode count
EpiStepFile = args['logdir'] + '/EpiStepFile'
if os.path.exists(EpiStepFile):
    with open(EpiStepFile, 'r') as f:
        SharedStorage['Counters']['ep'] = int(f.read())
        hp.ColorPrint(
            Fore.LIGHTCYAN_EX,
            "Restored episode: {}".format(SharedStorage['Counters']['ep']))

GlobalPPO = PPPO.PPO(
    gym.make(Game).unwrapped,
    args['logdir'],
    'model.ckpt',
    isTraining=args['training'],
    SharedStorage=SharedStorage,
    EP_MAX=EP_MAX,
    GAMMA=GAMMA,
    A_LR=A_LR,
    C_LR=C_LR,
    LR_DECAY=LR_DECAY,
    LR_DECAY_FREQ=LR_DECAY_FREQ,
    ClippingEpsilon=ClippingEpsilon,
    L1Neurons=L1Neurons,