Example #1
    def __init__(self,
                 env,
                 ckptLocBase,
                 ckptName,
                 isTraining,
                 EP_MAX,
                 GAMMA,
                 A_LR,
                 C_LR,
                 ClippingEpsilon,
                 UpdateDepth,
                 L1Neurons,
                 L2Neurons,
                 LR_DECAY=1,
                 LR_DECAY_FREQ=1000,
                 SharedStorage=None):
        tf.reset_default_graph()
        # if SharedStorage is None, this instance runs in inference mode and must not call update()
        self.SharedStorage = SharedStorage
        self.EP_MAX = EP_MAX
        self.GAMMA = GAMMA
        self.A_LR = A_LR
        self.C_LR = C_LR
        self.LR_DECAY = LR_DECAY
        self.LR_DECAY_FREQ = LR_DECAY_FREQ
        self.ClippingEpsilon = ClippingEpsilon
        self.UpdateDepth = UpdateDepth
        self.L1Neurons = L1Neurons
        self.L2Neurons = L2Neurons
        self.S_DIM = len(env.observation_space.low)
        self.A_DIM = env.action_space.n
        self.A_SPACE = 1
        self.sess = tf.Session(graph=tf.get_default_graph())
        self.tfs = tf.placeholder(tf.float32, [None, self.S_DIM], 'state')
        self.ckptLocBase = ckptLocBase
        self.UpdateStepFile = self.ckptLocBase + '/UpdateStep'
        self.ActorLrFile = self.ckptLocBase + '/ActorLrFile'
        self.CriticLrFile = self.ckptLocBase + '/CriticLrFile'
        hp.ColorPrint(Fore.LIGHTCYAN_EX, "Log dir={}".format(self.ckptLocBase))
        self.ckptLoc = ckptLocBase + '/' + ckptName
        self.UpdateStep = 0
        if not os.path.exists(self.ckptLocBase):
            os.makedirs(self.ckptLocBase)
        if os.path.exists(self.UpdateStepFile):
            with open(self.UpdateStepFile, 'r') as f:
                self.UpdateStep = int(f.read())
            hp.ColorPrint(Fore.GREEN,
                          "Restored update step={}".format(self.UpdateStep))
        if os.path.exists(self.ActorLrFile):
            with open(self.ActorLrFile, 'r') as f:
                self.A_LR = float(f.read())
            hp.ColorPrint(Fore.GREEN, "Restored A_LR={}".format(self.A_LR))
        else:
            with open(self.ActorLrFile, 'w') as f:
                f.write(str(self.A_LR))
        if os.path.exists(self.CriticLrFile):
            with open(self.CriticLrFile, 'r') as f:
                self.C_LR = float(f.read())
            hp.ColorPrint(Fore.GREEN, "Restored C_LR={}".format(self.C_LR))
        else:
            with open(self.CriticLrFile, 'w') as f:
                f.write(str(self.C_LR))

        if isTraining == 'N':
            self.isTraining = False
            hp.ColorPrint(Fore.LIGHTCYAN_EX, "This is the inference procedure")
        else:
            self.isTraining = True
            hp.ColorPrint(
                Fore.LIGHTCYAN_EX,
                "This is the training procedure with UpdateStep={}".format(
                    self.UpdateStep))

        # critic
        with tf.variable_scope('Critic'):
            with tf.variable_scope('Fully_Connected'):
                l1 = self.add_layer(self.tfs,
                                    self.L1Neurons,
                                    activation_function=tf.nn.relu,
                                    norm=True)
                if self.L2Neurons != 0:
                    l2 = self.add_layer(l1,
                                        self.L2Neurons,
                                        activation_function=tf.nn.relu,
                                        norm=True)
            with tf.variable_scope('Value'):
                if self.L2Neurons != 0:
                    self.v = tf.layers.dense(l2, 1)
                else:
                    self.v = tf.layers.dense(l1, 1)
            with tf.variable_scope('Loss'):
                self.tfdc_r = tf.placeholder(tf.float32, [None, 1],
                                             'discounted_r')
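                # advantage = discounted return - value estimate; also fed back in as tfadv for the actor update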
                self.advantage = self.tfdc_r - self.v
                self.closs = tf.reduce_mean(tf.square(self.advantage))
                self.CriticLossSummary = tf.summary.scalar(
                    'CriticLoss', self.closs)
            with tf.variable_scope('CriticTrain'):
                self.ctrain_op = tf.train.AdamOptimizer(self.C_LR).minimize(
                    self.closs)

        # pi: the current policy's action probabilities; oldpi: a frozen snapshot used for the importance ratio
        pi, pi_params = self._build_anet('Actor', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldActor', trainable=False)
        # operation of choosing action
        with tf.variable_scope('ActionsExp.'):
            self.acts_expect = tf.squeeze(pi, axis=0)
        with tf.variable_scope('Update'):
            self.update_oldpi_op = [
                oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)
            ]
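            # oldActor is built with trainable=False; it only receives copies of the current policy's weights before each update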

        with tf.variable_scope('Actor/PPO-Loss'):
            self.tfa = tf.placeholder(tf.int32, [None, 1], 'action')
            self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
            # probability of the action the agent actually took under the current policy
            # depth=pi.shape[0] would treat each column as a distribution
            # depth=pi.shape[1] treats each row as a distribution <-- what we use here
            act_probs = pi * tf.one_hot(indices=self.tfa, depth=pi.shape[1])
            act_probs = tf.reduce_sum(act_probs, axis=1)
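            # the one-hot mask zeroes every entry except the chosen action, so the sum extracts pi(a_t|s_t)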
            # probability of the same action under the old policy
            act_probs_old = oldpi * tf.one_hot(indices=self.tfa,
                                               depth=oldpi.shape[1])
            act_probs_old = tf.reduce_sum(act_probs_old, axis=1)
            # add a small number to avoid NaN
            #ratio = tf.divide(act_probs + 1e-10, act_probs_old + 1e-10)
            ratio = tf.exp(
                tf.log(act_probs + 1e-10) - tf.log(act_probs_old + 1e-10))
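            # exp(log(p) - log(p_old)) == p / p_old; the 1e-10 keeps the logs finite when a probability is zero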
            surr = tf.multiply(ratio, self.tfadv)
            clip = tf.clip_by_value(ratio, 1. - self.ClippingEpsilon,
                                    1. + self.ClippingEpsilon) * self.tfadv
            # clipped surrogate objective
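            # aloss = -E[min(ratio * adv, clip(ratio, 1-eps, 1+eps) * adv)]  (PPO-Clip)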
            self.aloss = -tf.reduce_mean(tf.minimum(surr, clip))
            # visualizing
            self.ppoRatioSummary = tf.summary.tensor_summary('ppoRatio', ratio)
            self.ActorLossSummary = tf.summary.scalar('ActorLoss', self.aloss)

        with tf.variable_scope('ActorTrain'):
            self.atrain_op = tf.train.AdamOptimizer(self.A_LR).minimize(
                self.aloss)

        with tf.variable_scope('Summary'):
            self.OverallSpeedup = tf.placeholder(tf.float32,
                                                 name='OverallSpeedup')
            self.EpisodeReward = tf.placeholder(tf.float32,
                                                name='EpisodeReward')
            self.one = tf.constant(1.0, dtype=tf.float32)
            self.RecordSpeedup_op = tf.multiply(self.OverallSpeedup, self.one)
            self.SpeedupSummary = tf.summary.scalar('OverallSpeedup',
                                                    self.RecordSpeedup_op)
            self.RecordEpiReward_op = tf.multiply(self.EpisodeReward, self.one)
            self.EpiRewardSummary = tf.summary.scalar('EpisodeReward',
                                                      self.RecordEpiReward_op)

        self.writer = tf.summary.FileWriter(self.ckptLocBase, self.sess.graph)
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        '''
        If a checkpoint exists, restore it.
        '''
        if tf.train.checkpoint_exists(self.ckptLoc):
            #self.saver.restore(self.sess, self.ckptLoc)
            self.saver.restore(self.sess,
                               tf.train.latest_checkpoint(self.ckptLocBase))
            hp.ColorPrint(Fore.LIGHTGREEN_EX, 'Restore the previous model.')
        elif self.isTraining == False:
            hp.ColorPrint(Fore.LIGHTRED_EX,
                          "Missing trained model to inference, exit.")
            sys.exit(1)
Example #2
    def work(self):
        while not self.SharedStorage['Coordinator'].should_stop():
            states, ResetInfo = self.env.reset()
            EpisodeReward = 0
            buffer_s, buffer_a, buffer_r = {}, {}, {}
            MeanSigmaDict = calc.getCpuMeanSigmaInfo()
            FirstEpi = True
            PassHistory = {}
            while True:
                # while global PPO is updating
                if not self.SharedStorage['Events']['collect'].is_set():
                    # wait until PPO is updated
                    self.SharedStorage['Events']['collect'].wait()
                '''
                Save the last profiled info to calculate real rewards
                '''
                try:
                    if FirstEpi:
                        oldCycles = ResetInfo["TotalCyclesStat"]
                        oldInfo = ResetInfo
                        FirstEpi = False
                        isUsageNotProcessed = True
                    else:
                        oldCycles = info["TotalCyclesStat"]
                        oldInfo = oldAllUsage
                        isUsageNotProcessed = False
                except Exception as e:
                    hp.ColorPrint(
                        Fore.RED,
                        "Exception happened and skipped: \n{}".format(e))
                    break
                if type(oldCycles) is not int:
                    hp.ColorPrint(Fore.RED,
                                  "oldCycles is not an int; skip this episode.")
                    break
                '''
                Choose the features from the most influential function
                '''
                state = calc.getMostInfluentialState(states, ResetInfo)
                action = self.ppo.choose_action(state, PassHistory)
                nextStates, reward, done, info = self.env.step(action)
                '''
                If the build failed, skip it.
                '''
                if reward < 0:
                    # clear history of applied passes
                    PassHistory = {}
                    hp.ColorPrint(
                        Fore.RED,
                        'WorkerID={} env.step() failed; use a new target and forget these memories'
                        .format(self.wid))
                    break
                try:
                    '''
                    Calculate actual rewards for all functions.
                    Note: the info may be incomplete; re-send or abort this iteration.
                    '''
                    rewards, oldAllUsage = calc.calcEachReward(
                        info, MeanSigmaDict, nextStates, oldInfo, oldCycles,
                        isUsageNotProcessed)
                    '''
                    Speedup for tf.summary.
                    (The block commented out below was meant to skip iterations whose speedup/slowdown is negligible.)
                    '''
                    speedup = calc.calcOverallSpeedup(ResetInfo, info)
                    """
                    if abs(speedup) < 0.0:
                        # This result in bad evaluation.
                        hp.ColorPrint(Fore.RED,
                                "WorkerID={}, Speedup={} --> skip this iteration".format(self.wid, speedup))
                        if done:
                            PassHistory = {}
                            break
                        else:
                            states = nextStates
                            continue
                    """
                    '''
                    Match the states and rewards
                    '''
                    AddedCount = calc.appendStateRewards(
                        buffer_s, buffer_a, buffer_r, states, rewards, action)
                    '''
                    Calculate overall reward for summary
                    '''
                    EpisodeReward = calc.calcEpisodeReward(rewards)
                except Exception as e:
                    hp.ColorPrint(
                        Fore.LIGHTRED_EX,
                        "Exception for receiving incomplete data\n{}".format(
                            e))
                    if done:
                        # This may lose some data for training.
                        PassHistory = {}
                        break
                    else:
                        states = nextStates
                        continue

                # account for the newly generated samples
                self.SharedStorage['Locks']['counter'].acquire()
                self.SharedStorage['Counters']['update_counter'] = \
                    self.SharedStorage['Counters']['update_counter'] + AddedCount
                self.SharedStorage['Locks']['counter'].release()
                if self.SharedStorage['Counters'][
                        'update_counter'] >= MIN_BATCH_SIZE or done:
                    '''
                    Calculate discounted rewards for all functions
                    '''
                    discounted_r = calc.calcDiscountedRewards(
                        buffer_r, nextStates, self.ppo)
                    '''
                    Convert dicts of lists into row-stacked arrays
                    '''
                    vstack_s, vstack_a, vstack_r = calc.DictToVstack(
                        buffer_s, buffer_a, discounted_r)
                    '''
                    Remove data that are not important in the batch
                    '''
                    vstack_s, vstack_a, vstack_r, delCount = \
                            calc.RemoveTrivialData(vstack_s, vstack_a, vstack_r, AbandonRatio=20)
                    hp.ColorPrint(
                        Fore.GREEN,
                        "Throw away {} data in this batch".format(delCount))
                    self.SharedStorage['Locks']['counter'].acquire()
                    self.SharedStorage['Counters']['update_counter'] = \
                        self.SharedStorage['Counters']['update_counter'] - delCount
                    self.SharedStorage['Locks']['counter'].release()
                    '''
                    Split each vector and assemble the pieces into queue elements.
                    '''
                    self.SharedStorage['Locks']['queue'].acquire()
                    # put data in the shared queue
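                    # each queue element is a flat row [state | action | discounted reward], matching the column slicing in PPO.update()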
                    for index, item in enumerate(vstack_s):
                        self.SharedStorage['DataQueue'].put(
                            np.hstack((vstack_s[index], vstack_a[index],
                                       vstack_r[index])))
                    self.SharedStorage['Locks']['queue'].release()
                    buffer_s, buffer_a, buffer_r = {}, {}, {}

                    # stop collecting data
                    self.SharedStorage['Events']['collect'].clear()
                    # globalPPO update
                    self.SharedStorage['Events']['update'].set()

                    if self.SharedStorage['Counters']['ep'] >= EP_MAX:
                        # stop training
                        self.SharedStorage['Coordinator'].request_stop()
                        hp.ColorPrint(
                            Fore.RED,
                            'WorkerID={} calls to Stop'.format(self.wid))
                        break
                if not done:
                    states = nextStates
                    continue
                else:
                    # clear history of applied passes
                    PassHistory = {}
                    # record reward changes, plot later
                    self.SharedStorage['Locks']['plot_epi'].acquire()
                    # add episode count
                    self.SharedStorage['Counters']['ep'] += 1
                    '''
                    write episode summaries to TensorBoard
                    '''
                    self.ppo.DrawToTf(speedup, EpisodeReward,
                                      self.SharedStorage['Counters']['ep'])
                    self.SharedStorage['Locks']['plot_epi'].release()
                    msg = '{}/{} ({:.1f}%) | WorkerID={}'.format(
                        self.SharedStorage['Counters']['ep'], EP_MAX,
                        self.SharedStorage['Counters']['ep'] / EP_MAX * 100,
                        self.wid)
                    msg += '\nEpisodeReward: {:.4f} | OverallSpeedup: {}'.format(
                        EpisodeReward, speedup)
                    hp.ColorPrint(Fore.GREEN, msg)
                    break
        hp.ColorPrint(Fore.YELLOW, 'WorkerID={} stopped'.format(self.wid))
Example #3
    def update(self):
        while not self.SharedStorage['Coordinator'].should_stop():
            if self.SharedStorage['Counters']['ep'] < self.EP_MAX:
                # block until a batch of data is ready
                self.SharedStorage['Events']['update'].wait()
                # save the model
                if self.UpdateStep % 50 == 0:
                    self.save()
                    hp.ColorPrint(Fore.LIGHTRED_EX,
                                  "Saving the model (every 50 updates).")
                else:
                    hp.ColorPrint(
                        Fore.LIGHTBLUE_EX,
                        "This update does not need to be saved: {}".format(
                            self.UpdateStep))
                # learning rate decay
                if self.UpdateStep % self.LR_DECAY_FREQ == (
                        self.LR_DECAY_FREQ - 1):
                    # decay
                    self.A_LR = self.A_LR * self.LR_DECAY
                    self.C_LR = self.C_LR * self.LR_DECAY
                    # save
                    with open(self.ActorLrFile, 'w') as f:
                        f.write(str(self.A_LR))
                    with open(self.CriticLrFile, 'w') as f:
                        f.write(str(self.C_LR))
                    hp.ColorPrint(
                        Fore.LIGHTRED_EX, "Decay LR: A_LR={}, C_LR={}".format(
                            self.A_LR, self.C_LR))
                # copy pi to old pi
                self.sess.run(self.update_oldpi_op)
                # collect data from all workers
                data = [
                    self.SharedStorage['DataQueue'].get()
                    for _ in range(self.SharedStorage['DataQueue'].qsize())
                ]
                data = np.vstack(data)
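                # each row is [state (S_DIM) | action (A_SPACE) | discounted return (1)]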
                s = data[:, :self.S_DIM]
                a = data[:, self.S_DIM:self.S_DIM + self.A_SPACE]
                r = data[:, -1:]
                adv = self.sess.run(self.advantage, {
                    self.tfs: s,
                    self.tfdc_r: r
                })
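                # advantages are computed once for the whole batch before the inner training loop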
                # update actor and critic in an update loop
                for _ in range(self.UpdateDepth):
                    self.sess.run(self.atrain_op, {
                        self.tfs: s,
                        self.tfa: a,
                        self.tfadv: adv
                    })
                    self.sess.run(self.ctrain_op, {
                        self.tfs: s,
                        self.tfdc_r: r
                    })
                '''
                write summary
                '''
                # actor and critic loss
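                # note: tf.summary.merge() builds new ops on every call here; merging once at graph-construction time would keep the graph from growing across updates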
                result = self.sess.run(tf.summary.merge([
                    self.ActorLossSummary, self.CriticLossSummary,
                    self.ppoRatioSummary
                ]),
                                       feed_dict={
                                           self.tfs: s,
                                           self.tfa: a,
                                           self.tfadv: adv,
                                           self.tfdc_r: r
                                       })
                self.writer.add_summary(result, self.UpdateStep)
                self.UpdateStep += 1
                # persist the step so that re-training does not overwrite earlier summaries
                with open(self.UpdateStepFile, 'w') as f:
                    f.write(str(self.UpdateStep))

                # updating finished
                self.SharedStorage['Events']['update'].clear()
                self.SharedStorage['Locks']['counter'].acquire()
                # reset counter
                self.SharedStorage['Counters']['update_counter'] = 0
                self.SharedStorage['Locks']['counter'].release()
                # set collecting available
                self.SharedStorage['Events']['collect'].set()
        hp.ColorPrint(Fore.YELLOW, 'Updater stopped')
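
Examples #2 and #3 only consume the SharedStorage dict that wires the workers and the updater together. Below is a minimal sketch, not taken from the original code, of how such a structure might be initialized: the key names ('Coordinator', 'Events', 'Locks', 'Counters', 'DataQueue') mirror their usage in work() and update(), while the initial event states and everything else are illustrative assumptions.

import queue
import threading

import tensorflow as tf

# Hypothetical wiring of SharedStorage; key names mirror work()/update() above.
SharedStorage = {
    'Coordinator': tf.train.Coordinator(),   # provides should_stop()/request_stop()
    'Events': {
        'collect': threading.Event(),        # workers gather data while this is set
        'update': threading.Event(),         # updater trains while this is set
    },
    'Locks': {
        'counter': threading.Lock(),
        'queue': threading.Lock(),
        'plot_epi': threading.Lock(),
    },
    'Counters': {'ep': 0, 'update_counter': 0},
    'DataQueue': queue.Queue(),              # rows of [state | action | discounted reward]
}
# Assumed initial state: workers may collect immediately; the updater blocks
# until a worker signals that a batch is ready.
SharedStorage['Events']['collect'].set()
SharedStorage['Events']['update'].clear()
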
Example #4
        '-t',
        '--training',
        type=str,
        nargs='?',
        default='Y',
        help=
        'Will this run be a training procedure?\n\"Y\"=Training, \"N\"=Inference',
        required=False)
    args = vars(parser.parse_args())
    # restore the episode count
    EpiStepFile = args['logdir'] + '/EpiStepFile'
    if os.path.exists(EpiStepFile):
        with open(EpiStepFile, 'r') as f:
            SharedStorage['Counters']['ep'] = int(f.read())
            hp.ColorPrint(
                Fore.LIGHTCYAN_EX,
                "Restore Episode:{}".format(SharedStorage['Counters']['ep']))

    GlobalPPO = PPPO.PPO(gym.make(Game).unwrapped,
                         args['logdir'],
                         'model.ckpt',
                         isTraining=args['training'],
                         SharedStorage=SharedStorage,
                         EP_MAX=EP_MAX,
                         GAMMA=GAMMA,
                         A_LR=A_LR,
                         C_LR=C_LR,
                         LR_DECAY=LR_DECAY,
                         LR_DECAY_FREQ=LR_DECAY_FREQ,
                         ClippingEpsilon=ClippingEpsilon,
                         L1Neurons=L1Neurons,