Example #1
    def __init__(self, _name='ppo', _headingCoef=1e-3, _SAVE_TXT=True):
        self.name = _name
        self.headingCoef = _headingCoef
        self.SAVE_TXT = _SAVE_TXT

        if self.SAVE_TXT:
            txtName = 'results/' + self.name + '.txt'
            self.f = open(txtName, 'w')  # Open txt file
            print_n_txt(_f=self.f,
                        _chars='Text name: ' + txtName,
                        _DO_PRINT=True,
                        _DO_SAVE=self.SAVE_TXT)

        self.env = AntEnvCustom(_headingCoef=self.headingCoef)
        self.obs_dim = self.env.observation_space.shape[0]
        self.act_dim = self.env.action_space.shape[0]
        self.env.reset()  # Reset
        # render_img = env.render(mode='rgb_array')
        print("obs_dim:[%d] act_dim:[%d]" % (self.obs_dim, self.act_dim))

        self.obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
        # Logger
        self.env_name = 'Ant'
        now = datetime.utcnow().strftime(
            "%b-%d_%H:%M:%S")  # create unique directories
        self.logger = Logger(logName=self.env_name, now=now, _NOTUSE=True)
        self.aigym_path = os.path.join('/tmp', self.env_name, now)
        # Scaler
        self.scaler = Scaler(self.obs_dim)
        # Value function
        hid1_mult = 10
        self.val_func = NNValueFunction(self.obs_dim, hid1_mult)
        # Policy Function
        kl_targ = 0.003
        policy_logvar = -1.0
        self.policy = Policy(self.obs_dim, self.act_dim, kl_targ, hid1_mult,
                             policy_logvar)
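
The obs_dim += 1 above exists because run_episode() (not shown here) appends a time-step feature to every observation. A minimal sketch of that pattern, assuming the feature is simply the step count times a small scale factor:

import numpy as np

def add_time_feature(obs, step, scale=1e-3):
    # Hypothetical helper mirroring the reason obs_dim is incremented in __init__:
    # append a scaled time-step feature to the raw observation.
    return np.append(obs, step * scale)

obs = np.zeros(27)                       # raw Ant observation (dimension illustrative)
obs_t = add_time_feature(obs, step=10)   # one extra dimension, matching obs_dim += 1
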
Example #2
 def train(self,_sess,_x_train,_t_train,_x_test,_t_test,
           _max_epoch=10,_batch_size=256,_lr=1e-3,_kp=0.9,
           _LR_SCHEDULE=False,_PRINT_EVERY=10,_VERBOSE_TRAIN=True):
     tf.set_random_seed(0)
     n_train,n_test = _x_train.shape[0],_x_test.shape[0]
     txtName = ('res/res_%s.txt'%(self.name))
     f = open(txtName,'w') # Open txt file
     print_n_txt(_f=f,_chars='Text name: '+txtName)
     print_period = max(1,_max_epoch//_PRINT_EVERY)
     max_iter,max_test_accr = max(n_train//_batch_size,1),0.0
     for epoch in range(_max_epoch+1): # For every epoch 
         _x_train,_t_train = shuffle(_x_train,_t_train) 
         for iter in range(max_iter): # For every iteration in one epoch
             start,end = iter*_batch_size,(iter+1)*_batch_size
             # Learning rate scheduling
             if _LR_SCHEDULE:
                 if epoch < 0.5*_max_epoch:
                     _lr_use = _lr
                 elif epoch < 0.75*_max_epoch:
                     _lr_use = _lr/10.0
                 else:
                     _lr_use = _lr/100.0
             else:
                 _lr_use = _lr
             if self.USE_MIXUP:
                 x_batch = _x_train[start:end,:]
                 t_batch = _t_train[start:end,:]
                 x_batch,t_batch = mixup(x_batch,t_batch,32)
             else:
                 x_batch = _x_train[start:end,:]
                 t_batch = _t_train[start:end,:]
             feeds = {self.x:x_batch,self.t:t_batch,self.rho_ref:self.rho_ref_train,
                      self.kp:_kp,self.lr:_lr_use,self.is_training:True}
             _sess.run(self.optm,feed_dict=feeds)
         # Print training losses, training accuracy, validation accuracy, and test accuracy
         if (epoch%print_period)==0 or (epoch==(_max_epoch)):
             batch_size4print = 512 
             # Compute train loss and accuracy
             max_iter4print = max(n_train//batch_size4print,1)
             train_loss,train_accr,n_temp = 0,0,0
             for iter in range(max_iter4print):
                 start,end = iter*batch_size4print,(iter+1)*batch_size4print
                 feeds_train = {self.x:_x_train[start:end,:],self.t:_t_train[start:end,:]
                              ,self.rho_ref:1.0,self.kp:1.0,self.is_training:False}
                 _train_loss,_train_accr = _sess.run([self.loss_total,self.accr],feed_dict=feeds_train) 
                 _n_temp = end-start; n_temp+=_n_temp
                 train_loss+=(_n_temp*_train_loss); train_accr+=(_n_temp*_train_accr)
             train_loss/=n_temp;train_accr/=n_temp
             # Compute test loss and accuracy
             max_iter4print = max(n_test//batch_size4print,1)
             test_loss,test_accr,n_temp = 0,0,0
             for iter in range(max_iter4print):
                 start,end = iter*batch_size4print,(iter+1)*batch_size4print
                 feeds_test = {self.x:_x_test[start:end,:],self.t:_t_test[start:end,:]
                              ,self.rho_ref:1.0,self.kp:1.0,self.is_training:False}
                 _test_loss,_test_accr = _sess.run([self.loss_total,self.accr],feed_dict=feeds_test) 
                 _n_temp = end-start; n_temp+=_n_temp
                 test_loss+=(_n_temp*_test_loss); test_accr+=(_n_temp*_test_accr)
             test_loss/=n_temp;test_accr/=n_temp
             # Track max test accuracy
             if test_accr > max_test_accr:
                 max_test_accr = test_accr
             strTemp = (("[%02d/%d] [Loss] train:%.3f test:%.3f"
                         +" [Accr] train:%.1f%% test:%.1f%% maxTest:%.1f%%")
                    %(epoch,_max_epoch,train_loss,test_loss
                      ,train_accr*100,test_accr*100,max_test_accr*100))
             print_n_txt(_f=f,_chars=strTemp,_DO_PRINT=_VERBOSE_TRAIN)
             self.train_accr,self.test_accr = train_accr,test_accr
     # Done 
     print ("Training finished.")
Example #3
    def train(self,
              _sess,
              _x_train,
              _y_train,
              _lr=1e-3,
              _batch_size=512,
              _max_epoch=1e4,
              _kp=1.0,
              _LR_SCHEDULE=True,
              _PRINT_EVERY=20,
              _PLOT_EVERY=20,
              _SAVE_TXT=True,
              _SAVE_BEST_NET=True,
              _SAVE_FINAL=True,
              _REMOVE_PREVS=True,
              _x_dim4plot=0,
              _x_name4plot=None):

        self.x_dim4plot = _x_dim4plot
        self.x_name4plot = _x_name4plot

        # Remove existing files
        if _REMOVE_PREVS:
            remove_file_if_exists('net/net_%s_best.npz' % (self.name),
                                  _VERBOSE=self.VERBOSE)
            remove_file_if_exists('net/net_%s_best.mat' % (self.name),
                                  _VERBOSE=self.VERBOSE)
            remove_file_if_exists('net/net_%s_final.npz' % (self.name),
                                  _VERBOSE=self.VERBOSE)
            remove_file_if_exists('net/net_%s_final.mat' % (self.name),
                                  _VERBOSE=self.VERBOSE)
            remove_file_if_exists('res/res_%s.txt' % (self.name),
                                  _VERBOSE=self.VERBOSE)

        # Reference training data
        x_train, y_train = _x_train, _y_train
        if len(np.shape(y_train)) == 1:  # if y is a vector
            y_train = np.reshape(y_train, newshape=[-1, 1])  # make it rank two
        self.nzr_x, self.nzr_y = nzr(x_train), nzr(y_train)  # get normalizer

        # Iterate
        if _PRINT_EVERY == 0: print_period = 0
        else: print_period = _max_epoch // _PRINT_EVERY
        if _PLOT_EVERY == 0: plot_period = 0
        else: plot_period = _max_epoch // _PLOT_EVERY

        max_iter = max(x_train.shape[0] // _batch_size, 1)
        best_loss_val = np.inf
        if _SAVE_TXT:
            txt_name = ('res/res_%s.txt' % (self.name))
            f = open(txt_name, 'w')  # Open txt file
            print_n_txt(_f=f,
                        _chars='Text: ' + txt_name,
                        _DO_PRINT=self.VERBOSE)
        for epoch in range((int)(_max_epoch) + 1):  # For every epoch
            x_train, y_train = shuffle(x_train, y_train)
            nzd_x_train, nzd_y_train = self.nzr_x.get_nzdval(
                x_train), self.nzr_y.get_nzdval(y_train)
            for iter in range(max_iter):  # For every iteration
                start, end = iter * _batch_size, (iter + 1) * _batch_size
                if _LR_SCHEDULE:
                    if epoch < 0.5 * _max_epoch:
                        lr_use = _lr
                    elif epoch < 0.75 * _max_epoch:
                        lr_use = _lr / 5.
                    else:
                        lr_use = _lr / 10.
                else:
                    lr_use = _lr
                feeds = {
                    self.x: nzd_x_train[start:end, :],
                    self.y: nzd_y_train[start:end, :],
                    self.kp: _kp,
                    self.lr: lr_use,
                    self.is_training: True
                }
                # Optimize
                _sess.run(self.optm, feeds)

            # Track the Best result
            BEST_FLAG = False
            check_period = max(_max_epoch // 100, 1)  # avoid modulo-by-zero for small _max_epoch
            if (epoch % check_period) == 0:
                # Feed total dataset
                feeds = {
                    self.x: nzd_x_train,
                    self.y: nzd_y_train,
                    self.kp: 1.0,
                    self.is_training: False
                }
                opers = [self.loss_total, self.loss_fit, self.l2_reg]
                loss_val, loss_fit, l2_reg = _sess.run(opers, feeds)
                if (loss_val < best_loss_val) & (epoch >= 3):
                    best_loss_val = loss_val
                    BEST_FLAG = True
                    if _SAVE_BEST_NET:  # Save the current best model
                        if self.VERBOSE:
                            print(
                                "Epoch:[%d] saving current network (best loss:[%.3f])"
                                % (epoch, best_loss_val))
                        self.save2npz(_sess,
                                      _save_name='net/net_%s_best.npz' %
                                      (self.name))
                        self.save2mat_from_npz(
                            _x_train=x_train,
                            _y_train=y_train,
                            _save_name='net/net_%s_best.mat' % (self.name),
                            _npz_path='net/net_%s_best.npz' % (self.name))

            # Print current result
            if (print_period != 0) and ((epoch % print_period) == 0 or
                                        (epoch == (_max_epoch - 1))):  # Print
                feeds = {
                    self.x: nzd_x_train,
                    self.y: nzd_y_train,
                    self.kp: 1.0,
                    self.is_training: False
                }
                opers = [self.loss_total, self.loss_fit, self.l2_reg]
                loss_val, loss_fit, l2_reg = _sess.run(opers, feeds)
                if _SAVE_TXT:
                    str_temp = (
                        "[%d/%d] loss:%.3f(fit:%.3f+l2:%.3f) bestLoss:%.3f" %
                        (epoch, _max_epoch, loss_val, loss_fit, l2_reg,
                         best_loss_val))
                    print_n_txt(_f=f, _chars=str_temp, _DO_PRINT=self.VERBOSE)
                else:
                    if self.VERBOSE | True:
                        print(
                            "[%d/%d] loss:%.3f(fit:%.3f+l2:%.3f) bestLoss:%.3f"
                            % (epoch, _max_epoch, loss_val, loss_fit, l2_reg,
                               best_loss_val))

            # Plot current result
            if (plot_period != 0) and ((epoch % plot_period) == 0 or
                                       (epoch == (_max_epoch - 1))):  # Plot
                # Get loss vals
                feeds = {
                    self.x: nzd_x_train,
                    self.y: nzd_y_train,
                    self.kp: 1.0,
                    self.is_training: False
                }
                opers = [self.loss_total, self.loss_fit, self.l2_reg]
                loss_val, loss_fit, l2_reg = _sess.run(opers, feeds)
                # Output
                nzd_y_test = self.sampler(_sess=_sess, _x=nzd_x_train)
                y_pred = self.nzr_y.get_orgval(nzd_y_test)[:, 0]
                # Plot one dimensions of both input and output
                x_plot, y_plot = x_train[:, self.x_dim4plot], y_train[:, 0]  # Training data
                plt.figure(figsize=(8, 4))
                # plt.axis([np.min(x_plot),np.max(x_plot),np.min(y_plot)-0.1,np.max(y_plot)+0.1])
                h_tr, = plt.plot(x_plot, y_plot, 'k.')  # Plot training data
                h_pr, = plt.plot(x_plot, y_pred, 'b.')  # Plot prediction
                plt.title("[%d/%d] name:[%s] loss_val:[%.3e]" %
                          (epoch, _max_epoch, self.name, loss_val),
                          fontsize=13)
                plt.legend([h_tr, h_pr], ['Train data', 'Predictions'],
                           fontsize=13,
                           loc='upper left')
                if self.x_name4plot is not None:
                    plt.xlabel(self.x_name4plot, fontsize=13)
                plt.show()

        # Save final results
        if _SAVE_FINAL:
            self.save2npz(_sess,
                          _save_name='net/net_%s_final.npz' % (self.name))
            self.save2mat_from_npz(
                _x_train=x_train,
                _y_train=y_train,
                _save_name='net/net_%s_final.mat' % (self.name),
                _npz_path='net/net_%s_final.npz' % (self.name))

        if self.VERBOSE:
            print("Train done.")
Example #4
    def train(self,
              _seed=0,
              _maxEpoch=10000,
              _batchSize=50,
              _maxSec=9.0,
              _SAVE_VID=True,
              _MAKE_GIF=False,
              _PLOT_EVERY=10):
        np.random.seed(_seed)
        tf.set_random_seed(_seed)
        trajectories = run_policy(self.env,
                                  self.policy,
                                  self.scaler,
                                  self.logger,
                                  episodes=5,
                                  _maxSec=_maxSec)
        add_value(trajectories,
                  self.val_func)  # add estimated values to episodes
        gamma = 0.995  # Discount factor
        lam = 0.95  # Lambda for GAE
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        print('observes shape:', trajectories[0]['observes'].shape)
        print('actions shape:', trajectories[0]['actions'].shape)
        print('rewards shape:', trajectories[0]['rewards'].shape)
        print('unscaled_obs shape:', trajectories[0]['unscaled_obs'].shape)
        print('values shape:', trajectories[0]['values'].shape)
        print('disc_sum_rew shape:', trajectories[0]['disc_sum_rew'].shape)
        print('advantages shape:', trajectories[0]['advantages'].shape)

        for _epoch in range(_maxEpoch):
            # 1. Run policy
            trajectories = run_policy(self.env,
                                      self.policy,
                                      self.scaler,
                                      self.logger,
                                      episodes=_batchSize,
                                      _maxSec=_maxSec)
            # 2. Get (predict) value from the critic network
            add_value(trajectories,
                      self.val_func)  # add estimated values to episodes
            # 3. Get GAE
            gamma = 0.995  # Discount factor
            lam = 0.95  # Lambda for GAE
            add_disc_sum_rew(trajectories,
                             gamma)  # calculate discounted sum of rewards
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            # log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
            # Update
            self.policy.update(observes, actions, advantages,
                               self.logger)  # update policy
            self.val_func.fit(observes, disc_sum_rew,
                              self.logger)  # update value function
            # logger.write(display=True)  # write logger results to file and stdout

            # Print
            for _tIdx in range(len(trajectories)):
                rs = trajectories[_tIdx]['rewards']
                if _tIdx == 0: rTotal = rs
                else: rTotal = np.concatenate((rTotal, rs))
            # Reward details
            reward_contacts,reward_ctrls,reward_forwards,reward_headings,reward_survives = [],[],[],[],[]
            tickSum = 0
            for _traj in trajectories:
                tickSum += _traj['rewards'].shape[0]
                cTraj = _traj['rDetails']
                for _iIdx in range(len(cTraj)):
                    reward_contacts.append(cTraj[_iIdx]['reward_contact'])
                    reward_ctrls.append(cTraj[_iIdx]['reward_ctrl'])
                    reward_forwards.append(cTraj[_iIdx]['reward_forward'])
                    reward_headings.append(cTraj[_iIdx]['reward_heading'])
                    reward_survives.append(cTraj[_iIdx]['reward_survive'])
            tickAvg = tickSum / _batchSize
            sumRwd = rTotal.sum() / _batchSize
            sumReward_contact = np.asarray(reward_contacts).sum() / _batchSize
            sumReward_ctrl = np.asarray(reward_ctrls).sum() / _batchSize
            sumReward_forward = np.asarray(reward_forwards).sum() / _batchSize
            sumReward_heading = np.asarray(reward_headings).sum() / _batchSize
            sumReward_survive = np.asarray(reward_survives).sum() / _batchSize

            # Print
            str2print = (
                "[%d/%d](#total:%d) sumRwd:[%.3f](cntct:%.3f+ctrl:%.3f+fwd:%.3f+head:%.3f+srv:%.3f) tickAvg:[%d]"
                % (_epoch, _maxEpoch, (_epoch + 1) * _batchSize, sumRwd,
                   sumReward_contact, sumReward_ctrl, sumReward_forward,
                   sumReward_heading, sumReward_survive, tickAvg))
            print_n_txt(_f=self.f,
                        _chars=str2print,
                        _DO_PRINT=True,
                        _DO_SAVE=self.SAVE_TXT)

            # Get status
            stats = self.get_current_stats(_batchSize=_batchSize,
                                           _maxSec=_maxSec)
            # print (stats)
            str2print = (
                "  [eval] sumRwd:[%.3f](cntct:%.3f+ctrl:%.3f+fwd:%.3f+head:%.3f+srv:%.3f) tickAvg:[%d]"
                % (stats['sumRwd'], stats['sumReward_contact'],
                   stats['sumReward_ctrl'], stats['sumReward_forward'],
                   stats['sumReward_heading'], stats['sumReward_survive'],
                   stats['tickAvg']))
            print_n_txt(_f=self.f,
                        _chars=str2print,
                        _DO_PRINT=True,
                        _DO_SAVE=self.SAVE_TXT)

            # SHOW EVERY
            DO_ANIMATE = False
            if ((_epoch % _PLOT_EVERY) == 0) | (_epoch == (_maxEpoch - 1)):
                ret = run_episode_vid(self.env,
                                      self.policy,
                                      self.scaler,
                                      _maxSec=_maxSec)
                print("  [^] sumRwd:[%.3f] Xdisp:[%.3f] hDisp:[%.1f]" %
                      (np.asarray(
                          ret['rewards']).sum(), ret['xDisp'], ret['hDisp']))
                if _MAKE_GIF:
                    display_frames_as_gif(ret['frames'])
                if _SAVE_VID:
                    outputdata = np.asarray(ret['frames']).astype(np.uint8)
                    folderPath = 'vids/%s' % (self.name)
                    if not os.path.exists(folderPath): os.makedirs(folderPath)
                    vidName = folderPath + '/rollout_ppo_epoch%03d.mp4' % (
                        _epoch)
                    skvideo.io.vwrite(vidName, outputdata)
                    print("[%s] saved." % (vidName))
        print("Done.")
Example #5
    def train_dlpg(self,
                   _sess,
                   _seed=0,
                   _maxEpoch=500,
                   _batchSize=100,
                   _nIter4update=1e3,
                   _nPrevConsider=20,
                   _nPrevBestQ2Add=50,
                   _SAVE_VID=True,
                   _MAKE_GIF=False,
                   _PLOT_GRP=False,
                   _PLOT_EVERY=5,
                   _DO_RENDER=True,
                   _SAVE_NET_EVERY=10):
        self.sess = _sess

        # Initialize VAE weights
        np.random.seed(_seed)
        tf.set_random_seed(_seed)
        self.sess.run(tf.global_variables_initializer())

        # Experience memory
        xList = np.zeros((_batchSize, self.env.actDim * self.nAnchor))
        qList = np.zeros((_batchSize))
        xLists = [''] * _maxEpoch
        qLists = [''] * _maxEpoch

        for _epoch in range(_maxEpoch):
            priorProb = 0.5 * np.exp(
                -4 * (_epoch / _maxEpoch)**2)  # Schedule eps-greedish prior prob (0.5->~0.0)
            levBtw = 0.9 + 0.05 * (1 - priorProb)  # Schedule GRP leverage (0.925->0.95)
            xDispList, hDispList = np.zeros((_batchSize)), np.zeros(
                (_batchSize))
            rSumList,rContactSumList,rCtrlSumList,rFwdSumList,rHeadingSumList,rSrvSumList = \
                np.zeros((_batchSize)),np.zeros((_batchSize)),np.zeros((_batchSize)),\
                np.zeros((_batchSize)),np.zeros((_batchSize)),np.zeros((_batchSize))
            for _iter in range(_batchSize):
                # np.random.seed(seed=(_seed+_epoch*_batchSize+_iter)) #

                # -------------------------------------------------------------------------------------------- #
                if (np.random.rand() <
                        priorProb) | (_epoch == 0):  # Sample from prior
                    _, ret = self.unit_rollout_from_grp_prior(self.maxRepeat)
                else:  # Sample from posterior (VAE)
                    sampledX = self.VAE.sample(_sess=self.sess).reshape(
                        (self.nAnchor, self.env.actDim))
                    sampledX[-1, :] = sampledX[0, :]
                    # Clip
                    sampledX = np.clip(sampledX, a_min=-0.2, a_max=1.2)

                    if self.NORMALIZE_SCALE:
                        sampledX = (sampledX - sampledX.min()) / (
                            sampledX.max() - sampledX.min())
                    self.set_anchor_grp_posterior(_anchors=sampledX,
                                                  _levBtw=levBtw)
                    _, ret = self.unit_rollout_from_grp_posterior(
                        self.maxRepeat)
                # -------------------------------------------------------------------------------------------- #

                # Get anchor points of previous rollout
                xInterp = self.get_anchor_from_traj(ret['sampledTraj'])
                xVec = np.reshape(xInterp, newshape=(1, -1))
                # Append rewards
                xList[_iter, :] = xVec
                qList[_iter] = np.asarray(
                    ret['rewards']).sum()  # Sum of rewards!
                xDispList[_iter] = ret['xDisp']
                hDispList[_iter] = ret['hDisp']
                rSumList[_iter] = ret['rSum']
                rContactSumList[_iter] = ret['rContactSum']
                rCtrlSumList[_iter] = ret['rCtrlSum']
                rFwdSumList[_iter] = ret['rFwdSum']
                rHeadingSumList[_iter] = ret['rHeadingSum']
                rSrvSumList[_iter] = ret['rSrvSum']
            # Train
            xLists[_epoch] = xList
            qLists[_epoch] = qList
            # Get the best out of previous episodes
            for _bIdx in range(0, _nPrevConsider):
                if _bIdx == 0:  # Add current one for sure
                    xAccList = xList
                    qAccList = qList
                else:
                    xAccList = np.concatenate(
                        (xAccList, xLists[max(0, _epoch - _bIdx)]), axis=0)
                    qAccList = np.concatenate(
                        (qAccList, qLists[max(0, _epoch - _bIdx)]))
            # Add high q episodes (_nPrevBestQ2Add)
            nAddPrevBest = _nPrevBestQ2Add
            sortedIdx = np.argsort(-qAccList)
            xTrain = xAccList[sortedIdx[:nAddPrevBest], :]
            qTrain = qAccList[sortedIdx[:nAddPrevBest]]
            # Add current episodes (batchSize)
            xTrain = np.concatenate((xTrain, xList), axis=0)
            qTrain = np.concatenate((qTrain, qList))
            # Add random episodes (nRandomAdd=_batchSize)
            nRandomAdd = _batchSize // 5
            randIdx = np.random.permutation(xAccList.shape[0])[:nRandomAdd]
            xRand = xAccList[randIdx, :]
            qRand = qAccList[randIdx]
            xTrain = np.concatenate((xTrain, xRand), axis=0)
            qTrain = np.concatenate((qTrain, qRand))

            # Train
            self.qScaler.reset()  # Reset every update
            self.qScaler.update(qTrain)  # Update Q scaler
            qScale, qOffset = self.qScaler.get()  # Scaler
            scaledQ = qScale * (qTrain - qOffset)
            # print (scaledQ)
            self.VAE.train(_sess=self.sess,
                           _X=xTrain,
                           _Y=None,
                           _C=None,
                           _Q=scaledQ,
                           _maxIter=_nIter4update,
                           _batchSize=128,
                           _PRINT_EVERY=(_nIter4update // 5),
                           _PLOT_EVERY=0,
                           _KL_SCHEDULE=True,
                           _INIT_VAR=False)
            # Print
            str2print = (
                "[%d/%d](#total:%d) avgQ:[%.3f] XdispMean:[%.3f] XdispVar:[%.3f] absHdispMean:[%.1f] priorProb:[%.2f]"
                % (_epoch, _maxEpoch,
                   (_epoch + 1) * _batchSize, qList.mean(), xDispList.mean(),
                   xDispList.var(), np.abs(hDispList).mean(), priorProb))
            print_n_txt(_f=self.f,
                        _chars=str2print,
                        _DO_PRINT=True,
                        _DO_SAVE=self.SAVE_TXT)

            str2print = (
                " rSum:[%.3f] = (contact:%.3f+ctrl:%.3f+fwd:%.3f+heading:%.3f+survive:%.3f) [rSumMax:%.3f]"
                % (rSumList.mean(), rContactSumList.mean(),
                   rCtrlSumList.mean(), rFwdSumList.mean(),
                   rHeadingSumList.mean(), rSrvSumList.mean(), rSumList.max()))
            print_n_txt(_f=self.f,
                        _chars=str2print,
                        _DO_PRINT=True,
                        _DO_SAVE=self.SAVE_TXT)

            # Print current Q using GRP mean
            stats = self.get_current_stats()
            str2print = (
                "[Stat] Q:[%.3f]=(cnt:%.1f+strl:%.1f+fwd:%.1f+hd:%.1f+srv:%.1f) xDispMean:[%.3f] hAbsMean:[%.1f]"
                % (stats['rSumMean'], stats['rContactSumMean'],
                   stats['rCtrlSumMean'], stats['rFwdSumMean'],
                   stats['rHeadingSumMean'], stats['rSrvSumMean'],
                   stats['xDispMean'], stats['hAbsMean']))
            print_n_txt(_f=self.f,
                        _chars=str2print,
                        _DO_PRINT=True,
                        _DO_SAVE=self.SAVE_TXT)
            """
            stats = {'xDispMean':xDispMean,'hAbsMean':hAbsMean,'hSqMean':hSqMean,'rSumMean':rSumMean,
            'rContactSumMean':rContactSumMean,'rCtrlSumMean':rCtrlSumMean,'rFwdSumMean':rFwdSumMean,
            'rHeadingSumMean':rHeadingSumMean,'rSrvSumMean':rSrvSumMean}
            """

            # SHOW EVERY
            if ((_epoch % _PLOT_EVERY) == 0) | (_epoch == (_maxEpoch - 1)):
                # Rollout
                sampledX = self.VAE.sample(_sess=self.sess).reshape(
                    (self.nAnchor, self.env.actDim))
                sampledX[-1, :] = sampledX[0, :]
                # Clip
                sampledX = np.clip(sampledX, a_min=-0.2, a_max=1.2)
                if self.NORMALIZE_SCALE:
                    sampledX = (sampledX - sampledX.min()) / (sampledX.max() -
                                                              sampledX.min())
                self.set_anchor_grp_posterior(_anchors=sampledX,
                                              _levBtw=levBtw)
                _, ret = self.unit_rollout_from_grp_mean(
                    _maxRepeat=self.maxRepeat, _DO_RENDER=_DO_RENDER)
                str2print = (
                    "    [GRP mean] sumRwd:%.3f=cntct:%.2f+ctrl:%.2f+fwd:%.2f+hd:%.2f+srv:%.2f) xD:[%.3f] hD:[%.1f]"
                    % (ret['rSum'], ret['rContactSum'], ret['rCtrlSum'],
                       ret['rFwdSum'], ret['rHeadingSum'], ret['rSrvSum'],
                       ret['xDisp'], ret['hDisp']))
                print_n_txt(_f=self.f,
                            _chars=str2print,
                            _DO_PRINT=True,
                            _DO_SAVE=self.SAVE_TXT)
                # Make video using GRP mean path
                if _SAVE_VID:
                    outputdata = np.asarray(ret['frames']).astype(np.uint8)
                    folderPath = 'vids/%s' % (self.name)
                    if not os.path.exists(folderPath): os.makedirs(folderPath)
                    vidName = folderPath + '/rollout_dlpg_epoch%03d.mp4' % (
                        _epoch)
                    skvideo.io.vwrite(vidName, outputdata)
                    str2print = ("     Video [%s] saved." % (vidName))
                    print_n_txt(_f=self.f,
                                _chars=str2print,
                                _DO_PRINT=True,
                                _DO_SAVE=self.SAVE_TXT)
                # Make GIF
                if _MAKE_GIF:
                    NSKIP = 3  # For memory issues
                    display_frames_as_gif(ret['frames'][::NSKIP],
                                          _intv_ms=20,
                                          _figsize=(8, 8),
                                          _fontsize=15,
                                          _titleStrs=ret['titleStrs'][::NSKIP])
                # Plot sampled trajectories
                if _PLOT_GRP:
                    nrTrajectories2plot = 5
                    for _i in range(nrTrajectories2plot):
                        # np.random.seed(seed=_i)
                        sampledX = self.VAE.sample(_sess=self.sess).reshape(
                            (self.nAnchor, self.env.actDim))
                        sampledX[-1, :] = sampledX[0, :]
                        # Clip
                        sampledX = np.clip(sampledX, a_min=-0.2, a_max=1.2)
                        if self.NORMALIZE_SCALE:
                            sampledX = (sampledX - sampledX.min()) / (
                                sampledX.max() - sampledX.min())
                        self.set_anchor_grp_posterior(_anchors=sampledX,
                                                      _levBtw=levBtw)
                        fig = self.GRPposterior.plot_all(_nPath=1,
                                                         _figsize=(8, 3))
                        # Save image
                        folderPath = 'pics/%s' % (self.name)
                        if not os.path.exists(folderPath):
                            os.makedirs(folderPath)
                        saveName = folderPath + '/grp_epoch%04d_%d.png' % (
                            _epoch, _i)
                        fig.savefig(saveName)

                        # Rollout
                        _, ret = self.unit_rollout_from_grp_mean(
                            _maxRepeat=self.maxRepeat, _DO_RENDER=False)
                        str2print = (
                            "    [GRP-%d] sumRwd:%.3f=cntct:%.2f+ctrl:%.2f+fwd:%.2f+hd:%.2f+srv:%.2f) xD:[%.3f] hD:[%.1f]"
                            % (_i, ret['rSum'], ret['rContactSum'],
                               ret['rCtrlSum'], ret['rFwdSum'],
                               ret['rHeadingSum'], ret['rSrvSum'],
                               ret['xDisp'], ret['hDisp']))
                        print_n_txt(_f=self.f,
                                    _chars=str2print,
                                    _DO_PRINT=True,
                                    _DO_SAVE=self.SAVE_TXT)

            # Save network every
            if ((_epoch % _SAVE_NET_EVERY) == 0) | (_epoch == (_maxEpoch - 1)):
                folderPath = 'nets/%s' % (self.name)
                if not os.path.exists(folderPath): os.makedirs(folderPath)
                saveName = folderPath + '/net_dlpg_epoch%04d.npz' % (_epoch)
                self.save_net(_sess=_sess, _savename=saveName)
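
The training-set construction above (best past rollouts + current batch + a few random past rollouts, with returns whitened by the Scaler) can be condensed as follows; a sketch that assumes the Scaler simply standardizes the returns:

import numpy as np

def build_vae_train_set(xAccList, qAccList, xList, qList, n_best=50, n_rand=20):
    # Hypothetical condensation of the selection logic above.
    best = np.argsort(-qAccList)[:n_best]                       # highest-return rollouts
    rand = np.random.permutation(xAccList.shape[0])[:n_rand]    # random past rollouts
    xTrain = np.concatenate((xAccList[best], xList, xAccList[rand]), axis=0)
    qTrain = np.concatenate((qAccList[best], qList, qAccList[rand]))
    scaledQ = (qTrain - qTrain.mean()) / (qTrain.std() + 1e-8)  # stand-in for qScaler
    return xTrain, scaledQ
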
Example #6
    def __init__(self,
                 _name='Ant',
                 _headingCoef=1e-4,
                 _tMax=3,
                 _nAnchor=20,
                 _maxRepeat=3,
                 _hypGainPrior=1 / 3,
                 _hypLenPrior=1 / 4,
                 _hypGainPost=1 / 3,
                 _hypLenPost=1 / 4,
                 _levBtw=0.8,
                 _pGain=0.01,
                 _zDim=16,
                 _hDims=[64, 64],
                 _vaeActv=tf.nn.elu,
                 _vaeOutActv=tf.nn.sigmoid,
                 _vaeQactv=None,
                 _entRegCoef=1e-2,
                 _klMinTh=0.0,
                 _PLOT_GRP=True,
                 _SAVE_TXT=True,
                 _VERBOSE=True):
        # Some parameters
        self.name = _name
        self.headingCoef = _headingCoef
        self.tMin = 0
        self.tMax = _tMax
        self.nAnchor = _nAnchor
        self.maxRepeat = _maxRepeat
        self.SAVE_TXT = _SAVE_TXT
        self.VERBOSE = _VERBOSE

        # Normalize trajectory
        self.NORMALIZE_SCALE = False

        if self.SAVE_TXT:
            folderPath = 'results'
            if not os.path.exists(folderPath): os.makedirs(folderPath)
            txtName = 'results/' + self.name + '.txt'
            self.f = open(txtName, 'w')  # Open txt file
            print_n_txt(_f=self.f,
                        _chars='Text name: ' + txtName,
                        _DO_PRINT=True,
                        _DO_SAVE=self.SAVE_TXT)

        # Initialize Ant gym
        self.env = AntEnvCustom(_headingCoef=self.headingCoef)
        # GRP sampler (prior)
        nDataPrior = 2
        nTest = (int)((self.tMax - self.tMin) / self.env.dt)
        tData = np.linspace(start=self.tMin, stop=self.tMax,
                            num=nDataPrior).reshape((-1, 1))
        xData = np.random.rand(nDataPrior, self.env.actDim)  # Random positions
        # xData[0,:] = (xData[0,:]+xData[-1,:])/2.0
        xData[-1, :] = xData[0, :]
        lData = np.ones(shape=(nDataPrior, 1))
        tTest = np.linspace(start=self.tMin, stop=self.tMax,
                            num=nTest).reshape((-1, 1))
        lTest = np.ones(shape=(nTest, 1))

        # hyp = {'gain':1/3,'len':1/4,'noise':1e-8} # <= This worked fine
        hypPrior = {'gain': _hypGainPrior, 'len': _hypLenPrior, 'noise': 1e-10}
        self.GRPprior = lgrp_class(_name='GPR Prior',
                                   _tData=tData,
                                   _xData=xData,
                                   _lData=lData,
                                   _tTest=tTest,
                                   _lTest=lTest,
                                   _hyp=hypPrior)

        # GRP posterior
        tData = np.linspace(start=self.tMin, stop=self.tMax,
                            num=self.nAnchor).reshape((-1, 1))
        xData = np.random.rand(self.nAnchor,
                               self.env.actDim)  # Random positions
        lData = np.ones(shape=(self.nAnchor, 1))
        lData[1:self.nAnchor - 1] = _levBtw
        hypPost = {'gain': _hypGainPost, 'len': _hypLenPost, 'noise': 1e-10}
        self.GRPposterior = lgrp_class(_name='GPR Posterior',
                                       _tData=tData,
                                       _xData=xData,
                                       _lData=lData,
                                       _tTest=tTest,
                                       _lTest=lTest,
                                       _hyp=hypPost)
        if _PLOT_GRP:
            self.GRPprior.plot_all(_nPath=10, _figsize=(12, 4))
            self.GRPposterior.plot_all(_nPath=10, _figsize=(12, 4))

        # PID controller (Kp=0.01,Ki=0.00001,Kd=0.002,windup=5000)
        self.PID = PID_class(Kp=_pGain,
                             Ki=0.00001,
                             Kd=0.002,
                             windup=5000,
                             sample_time=self.env.dt,
                             dim=self.env.actDim)
        # VAE (this will be our policy function)
        optm = tf.train.AdamOptimizer
        optmParam = {
            'lr': 0.0005,
            'beta1': 0.9,
            'beta2': 0.999,
            'epsilon': 1e-8
        }
        # optm = tf.train.GradientDescentOptimizer
        # optmParam = {'lr':0.002}
        self.VAE = vae_class(_name=self.name,
                             _xDim=self.nAnchor * self.env.actDim,
                             _zDim=_zDim,
                             _hDims=_hDims,
                             _cDim=0,
                             _actv=_vaeActv,
                             _outActv=_vaeOutActv,
                             _qActv=_vaeQactv,
                             _bn=None,
                             _entRegCoef=_entRegCoef,
                             _klMinTh=_klMinTh,
                             _optimizer=optm,
                             _optm_param=optmParam,
                             _VERBOSE=False)
        # Reward Scaler
        self.qScaler = Scaler(1)
        # Check parameters
        self.check_params()
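
lgrp_class is not shown; the 'gain', 'len', and 'noise' entries of hypPrior/hypPost are assumed to play the usual Gaussian-process roles, i.e. a squared-exponential kernel over the time inputs with diagonal jitter:

import numpy as np

def se_kernel(t1, t2, gain=1 / 3, leng=1 / 4):
    # Assumed interpretation of the GRP hyperparameters: output scale 'gain',
    # length scale 'len' (renamed leng here to avoid shadowing the builtin).
    d2 = (np.reshape(t1, (-1, 1)) - np.reshape(t2, (1, -1))) ** 2
    return (gain ** 2) * np.exp(-d2 / (2.0 * leng ** 2))

tData = np.linspace(0, 3, 20)
K = se_kernel(tData, tData) + 1e-10 * np.eye(20)  # 'noise' added as jitter
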