def __init__(self, _name='ppo', _headingCoef=1e-3, _SAVE_TXT=True):
    self.name = _name
    self.headingCoef = _headingCoef
    self.SAVE_TXT = _SAVE_TXT
    if self.SAVE_TXT:
        txtName = 'results/' + self.name + '.txt'
        self.f = open(txtName, 'w')  # Open txt file
        print_n_txt(_f=self.f, _chars='Text name: ' + txtName,
                    _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
    self.env = AntEnvCustom(_headingCoef=self.headingCoef)
    self.obs_dim = self.env.observation_space.shape[0]
    self.act_dim = self.env.action_space.shape[0]
    self.env.reset()  # Reset
    # render_img = env.render(mode='rgb_array')
    print("obs_dim:[%d] act_dim:[%d]" % (self.obs_dim, self.act_dim))
    self.obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # Logger
    self.env_name = 'Ant'
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    self.logger = Logger(logName=self.env_name, now=now, _NOTUSE=True)
    self.aigym_path = os.path.join('/tmp', self.env_name, now)
    # Scaler
    self.scaler = Scaler(self.obs_dim)
    # Value function
    hid1_mult = 10
    self.val_func = NNValueFunction(self.obs_dim, hid1_mult)
    # Policy function
    kl_targ = 0.003
    policy_logvar = -1.0
    self.policy = Policy(self.obs_dim, self.act_dim, kl_targ, hid1_mult, policy_logvar)
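# Hedged sketch (not from this file): run_episode() is referenced above but not
# shown here. In the PPO codebase this implementation appears to follow, the
# extra observation dimension is a slowly increasing step counter so the value
# function can condition on episode progress. The function name and the 1e-3
# scale below are assumptions.
def append_time_feature_sketch(obs, step, step_scale=1e-3):
    """Append a scaled time-step feature to a (1, obs_dim) observation."""
    import numpy as np
    return np.append(obs, [[step * step_scale]], axis=1)  # -> (1, obs_dim + 1)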
def train(self, _sess, _x_train, _t_train, _x_test, _t_test,
          _max_epoch=10, _batch_size=256, _lr=1e-3, _kp=0.9,
          _LR_SCHEDULE=False, _PRINT_EVERY=10, _VERBOSE_TRAIN=True):
    tf.set_random_seed(0)
    n_train, n_test = _x_train.shape[0], _x_test.shape[0]
    txtName = ('res/res_%s.txt' % (self.name))
    f = open(txtName, 'w')  # Open txt file
    print_n_txt(_f=f, _chars='Text name: ' + txtName)
    print_period = max(1, _max_epoch // _PRINT_EVERY)
    max_iter, max_test_accr = max(n_train // _batch_size, 1), 0.0
    for epoch in range(_max_epoch + 1):  # For every epoch
        _x_train, _t_train = shuffle(_x_train, _t_train)
        for iter in range(max_iter):  # For every iteration in one epoch
            start, end = iter * _batch_size, (iter + 1) * _batch_size
            # Learning rate scheduling
            if _LR_SCHEDULE:
                if epoch < 0.5 * _max_epoch:
                    _lr_use = _lr
                elif epoch < 0.75 * _max_epoch:
                    _lr_use = _lr / 10.0
                else:
                    _lr_use = _lr / 100.0
            else:
                _lr_use = _lr
            x_batch = _x_train[start:end, :]
            t_batch = _t_train[start:end, :]
            if self.USE_MIXUP:
                x_batch, t_batch = mixup(x_batch, t_batch, 32)
            feeds = {self.x: x_batch, self.t: t_batch, self.rho_ref: self.rho_ref_train,
                     self.kp: _kp, self.lr: _lr_use, self.is_training: True}
            _sess.run(self.optm, feed_dict=feeds)
        # Print training/test losses and accuracies
        if (epoch % print_period) == 0 or (epoch == _max_epoch):
            batch_size4print = 512
            # Compute train loss and accuracy
            max_iter4print = max(n_train // batch_size4print, 1)
            train_loss, train_accr, n_temp = 0, 0, 0
            for iter in range(max_iter4print):
                start, end = iter * batch_size4print, (iter + 1) * batch_size4print
                feeds_train = {self.x: _x_train[start:end, :], self.t: _t_train[start:end, :],
                               self.rho_ref: 1.0, self.kp: 1.0, self.is_training: False}
                _train_loss, _train_accr = _sess.run([self.loss_total, self.accr],
                                                     feed_dict=feeds_train)
                _n_temp = end - start
                n_temp += _n_temp
                train_loss += (_n_temp * _train_loss)
                train_accr += (_n_temp * _train_accr)
            train_loss /= n_temp
            train_accr /= n_temp
            # Compute test loss and accuracy
            max_iter4print = max(n_test // batch_size4print, 1)
            test_loss, test_accr, n_temp = 0, 0, 0
            for iter in range(max_iter4print):
                start, end = iter * batch_size4print, (iter + 1) * batch_size4print
                feeds_test = {self.x: _x_test[start:end, :], self.t: _t_test[start:end, :],
                              self.rho_ref: 1.0, self.kp: 1.0, self.is_training: False}
                _test_loss, _test_accr = _sess.run([self.loss_total, self.accr],
                                                   feed_dict=feeds_test)
                _n_temp = end - start
                n_temp += _n_temp
                test_loss += (_n_temp * _test_loss)
                test_accr += (_n_temp * _test_accr)
            test_loss /= n_temp
            test_accr /= n_temp
            # Track the best test accuracy
            if test_accr > max_test_accr:
                max_test_accr = test_accr
            strTemp = (("[%02d/%d] [Loss] train:%.3f test:%.3f"
                        + " [Accr] train:%.1f%% test:%.1f%% maxTest:%.1f%%")
                       % (epoch, _max_epoch, train_loss, test_loss,
                          train_accr * 100, test_accr * 100, max_test_accr * 100))
            print_n_txt(_f=f, _chars=strTemp, _DO_PRINT=_VERBOSE_TRAIN)
            self.train_accr, self.test_accr = train_accr, test_accr
    # Done
    print("Training finished.")
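# Hedged sketch (not from this file): mixup() is called above but defined
# elsewhere. A standard mixup (Zhang et al., 2018) implementation convexly
# combines random pairs of inputs and targets with a Beta-distributed weight;
# interpreting the third argument above (32) as the Beta concentration is an
# assumption.
def mixup_sketch(x, t, alpha=32):
    """Return a mixed batch; x:(N,D) inputs, t:(N,C) one-hot targets."""
    import numpy as np
    lam = np.random.beta(alpha, alpha, size=(x.shape[0], 1))  # mixing weights
    idx = np.random.permutation(x.shape[0])  # random pairing of rows
    return lam * x + (1. - lam) * x[idx, :], lam * t + (1. - lam) * t[idx, :]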
def train(self, _sess, _x_train, _y_train, _lr=1e-3, _batch_size=512,
          _max_epoch=1e4, _kp=1.0, _LR_SCHEDULE=True, _PRINT_EVERY=20,
          _PLOT_EVERY=20, _SAVE_TXT=True, _SAVE_BEST_NET=True, _SAVE_FINAL=True,
          _REMOVE_PREVS=True, _x_dim4plot=0, _x_name4plot=None):
    self.x_dim4plot = _x_dim4plot
    self.x_name4plot = _x_name4plot
    # Remove existing files
    if _REMOVE_PREVS:
        remove_file_if_exists('net/net_%s_best.npz' % (self.name), _VERBOSE=self.VERBOSE)
        remove_file_if_exists('net/net_%s_best.mat' % (self.name), _VERBOSE=self.VERBOSE)
        remove_file_if_exists('net/net_%s_final.npz' % (self.name), _VERBOSE=self.VERBOSE)
        remove_file_if_exists('net/net_%s_final.mat' % (self.name), _VERBOSE=self.VERBOSE)
        remove_file_if_exists('res/res_%s.txt' % (self.name), _VERBOSE=self.VERBOSE)
    # Reference training data
    x_train, y_train = _x_train, _y_train
    if len(np.shape(y_train)) == 1:  # if y is a vector
        y_train = np.reshape(y_train, newshape=[-1, 1])  # make it rank two
    self.nzr_x, self.nzr_y = nzr(x_train), nzr(y_train)  # get normalizers
    # Iterate
    if _PRINT_EVERY == 0:
        print_period = 0
    else:
        print_period = _max_epoch // _PRINT_EVERY
    if _PLOT_EVERY == 0:
        plot_period = 0
    else:
        plot_period = _max_epoch // _PLOT_EVERY
    max_iter = max(x_train.shape[0] // _batch_size, 1)
    best_loss_val = np.inf
    if _SAVE_TXT:
        txt_name = ('res/res_%s.txt' % (self.name))
        f = open(txt_name, 'w')  # Open txt file
        print_n_txt(_f=f, _chars='Text: ' + txt_name, _DO_PRINT=self.VERBOSE)
    for epoch in range(int(_max_epoch) + 1):  # For every epoch
        x_train, y_train = shuffle(x_train, y_train)
        nzd_x_train = self.nzr_x.get_nzdval(x_train)
        nzd_y_train = self.nzr_y.get_nzdval(y_train)
        for iter in range(max_iter):  # For every iteration
            start, end = iter * _batch_size, (iter + 1) * _batch_size
            if _LR_SCHEDULE:
                if epoch < 0.5 * _max_epoch:
                    lr_use = _lr
                elif epoch < 0.75 * _max_epoch:
                    lr_use = _lr / 5.
                else:
                    lr_use = _lr / 10.
            else:
                lr_use = _lr
            feeds = {
                self.x: nzd_x_train[start:end, :],
                self.y: nzd_y_train[start:end, :],
                self.kp: _kp,
                self.lr: lr_use,
                self.is_training: True
            }
            # Optimize
            _sess.run(self.optm, feeds)
        # Track the best result
        BEST_FLAG = False
        check_period = max(_max_epoch // 100, 1)  # guard against a zero period for short runs
        if (epoch % check_period) == 0:
            # Feed the whole dataset
            feeds = {self.x: nzd_x_train, self.y: nzd_y_train,
                     self.kp: 1.0, self.is_training: False}
            opers = [self.loss_total, self.loss_fit, self.l2_reg]
            loss_val, loss_fit, l2_reg = _sess.run(opers, feeds)
            if (loss_val < best_loss_val) and (epoch >= 3):
                best_loss_val = loss_val
                BEST_FLAG = True
                if _SAVE_BEST_NET:  # Save the current best model
                    if self.VERBOSE:
                        print("Epoch:[%d] saving current network (best loss:[%.3f])"
                              % (epoch, best_loss_val))
                    self.save2npz(_sess, _save_name='net/net_%s_best.npz' % (self.name))
                    self.save2mat_from_npz(_x_train=x_train, _y_train=y_train,
                                           _save_name='net/net_%s_best.mat' % (self.name),
                                           _npz_path='net/net_%s_best.npz' % (self.name))
        # Print the current result
        if (print_period != 0) and ((epoch % print_period) == 0 or (epoch == (_max_epoch - 1))):
            feeds = {self.x: nzd_x_train, self.y: nzd_y_train,
                     self.kp: 1.0, self.is_training: False}
            opers = [self.loss_total, self.loss_fit, self.l2_reg]
            loss_val, loss_fit, l2_reg = _sess.run(opers, feeds)
            str_temp = ("[%d/%d] loss:%.3f(fit:%.3f+l2:%.3f) bestLoss:%.3f"
                        % (epoch, _max_epoch, loss_val, loss_fit, l2_reg, best_loss_val))
            if _SAVE_TXT:
                print_n_txt(_f=f, _chars=str_temp, _DO_PRINT=self.VERBOSE)
            else:
                print(str_temp)  # always print when not saving to txt
        # Plot the current result
        if (plot_period != 0) and ((epoch % plot_period) == 0 or (epoch == (_max_epoch - 1))):
            # Get loss values
            feeds = {self.x: nzd_x_train, self.y: nzd_y_train,
                     self.kp: 1.0, self.is_training: False}
            opers = [self.loss_total, self.loss_fit, self.l2_reg]
            loss_val, loss_fit, l2_reg = _sess.run(opers, feeds)
            # Output
            nzd_y_test = self.sampler(_sess=_sess, _x=nzd_x_train)
            y_pred = self.nzr_y.get_orgval(nzd_y_test)[:, 0]
            # Plot one dimension of the input against the output
            x_plot, y_plot = x_train[:, self.x_dim4plot], y_train[:, 0]  # training data
            plt.figure(figsize=(8, 4))
            # plt.axis([np.min(x_plot),np.max(x_plot),np.min(y_plot)-0.1,np.max(y_plot)+0.1])
            h_tr, = plt.plot(x_plot, y_plot, 'k.')  # plot training data
            h_pr, = plt.plot(x_plot, y_pred, 'b.')  # plot predictions
            plt.title("[%d/%d] name:[%s] loss_val:[%.3e]"
                      % (epoch, _max_epoch, self.name, loss_val), fontsize=13)
            plt.legend([h_tr, h_pr], ['Train data', 'Predictions'],
                       fontsize=13, loc='upper left')
            if self.x_name4plot is not None:
                plt.xlabel(self.x_name4plot, fontsize=13)
            plt.show()
    # Save final results
    if _SAVE_FINAL:
        self.save2npz(_sess, _save_name='net/net_%s_final.npz' % (self.name))
        self.save2mat_from_npz(_x_train=x_train, _y_train=y_train,
                               _save_name='net/net_%s_final.mat' % (self.name),
                               _npz_path='net/net_%s_final.npz' % (self.name))
    if self.VERBOSE:
        print("Train done.")
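# Hedged sketch (not from this file): nzr is constructed above but defined
# elsewhere. From its use here (get_nzdval / get_orgval), it appears to be a
# per-dimension z-score normalizer; the class below is an assumed equivalent.
class nzr_sketch(object):
    """Normalize to zero mean / unit variance, and invert the mapping."""
    def __init__(self, x, eps=1e-8):
        self.mu = x.mean(axis=0)
        self.sigma = x.std(axis=0) + eps  # eps avoids division by zero
    def get_nzdval(self, x):  # raw -> normalized
        return (x - self.mu) / self.sigma
    def get_orgval(self, xn):  # normalized -> raw
        return xn * self.sigma + self.mu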
def train(self, _seed=0, _maxEpoch=10000, _batchSize=50, _maxSec=9.0,
          _SAVE_VID=True, _MAKE_GIF=False, _PLOT_EVERY=10):
    np.random.seed(_seed)
    tf.set_random_seed(_seed)
    # Warm up the scaler and sanity-check trajectory shapes with a few episodes
    trajectories = run_policy(self.env, self.policy, self.scaler, self.logger,
                              episodes=5, _maxSec=_maxSec)
    add_value(trajectories, self.val_func)  # add estimated values to episodes
    gamma = 0.995  # discount factor
    lam = 0.95  # lambda for GAE
    add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of rewards
    add_gae(trajectories, gamma, lam)  # calculate advantages
    print('observes shape:', trajectories[0]['observes'].shape)
    print('actions shape:', trajectories[0]['actions'].shape)
    print('rewards shape:', trajectories[0]['rewards'].shape)
    print('unscaled_obs shape:', trajectories[0]['unscaled_obs'].shape)
    print('values shape:', trajectories[0]['values'].shape)
    print('disc_sum_rew shape:', trajectories[0]['disc_sum_rew'].shape)
    print('advantages shape:', trajectories[0]['advantages'].shape)
    for _epoch in range(_maxEpoch):
        # 1. Run the policy
        trajectories = run_policy(self.env, self.policy, self.scaler, self.logger,
                                  episodes=_batchSize, _maxSec=_maxSec)
        # 2. Get (predict) values from the critic network
        add_value(trajectories, self.val_func)  # add estimated values to episodes
        # 3. Get GAE
        gamma = 0.995  # discount factor
        lam = 0.95  # lambda for GAE
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantages
        # Concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        # log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        # Update
        self.policy.update(observes, actions, advantages, self.logger)  # update policy
        self.val_func.fit(observes, disc_sum_rew, self.logger)  # update value function
        # logger.write(display=True)  # write logger results to file and stdout
        # Collect all rewards of this batch
        for _tIdx in range(len(trajectories)):
            rs = trajectories[_tIdx]['rewards']
            if _tIdx == 0:
                rTotal = rs
            else:
                rTotal = np.concatenate((rTotal, rs))
        # Reward details
        reward_contacts, reward_ctrls, reward_forwards, reward_headings, reward_survives = \
            [], [], [], [], []
        tickSum = 0
        for _traj in trajectories:
            tickSum += _traj['rewards'].shape[0]
            cTraj = _traj['rDetails']
            for _iIdx in range(len(cTraj)):
                reward_contacts.append(cTraj[_iIdx]['reward_contact'])
                reward_ctrls.append(cTraj[_iIdx]['reward_ctrl'])
                reward_forwards.append(cTraj[_iIdx]['reward_forward'])
                reward_headings.append(cTraj[_iIdx]['reward_heading'])
                reward_survives.append(cTraj[_iIdx]['reward_survive'])
        tickAvg = tickSum / _batchSize
        sumRwd = rTotal.sum() / _batchSize
        sumReward_contact = np.asarray(reward_contacts).sum() / _batchSize
        sumReward_ctrl = np.asarray(reward_ctrls).sum() / _batchSize
        sumReward_forward = np.asarray(reward_forwards).sum() / _batchSize
        sumReward_heading = np.asarray(reward_headings).sum() / _batchSize
        sumReward_survive = np.asarray(reward_survives).sum() / _batchSize
        # Print
        str2print = (
            "[%d/%d](#total:%d) sumRwd:[%.3f](cntct:%.3f+ctrl:%.3f+fwd:%.3f+head:%.3f+srv:%.3f) tickAvg:[%d]"
            % (_epoch, _maxEpoch, (_epoch + 1) * _batchSize, sumRwd,
               sumReward_contact, sumReward_ctrl, sumReward_forward,
               sumReward_heading, sumReward_survive, tickAvg))
        print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
        # Get status
        stats = self.get_current_stats(_batchSize=_batchSize, _maxSec=_maxSec)
        str2print = (
            " [eval] sumRwd:[%.3f](cntct:%.3f+ctrl:%.3f+fwd:%.3f+head:%.3f+srv:%.3f) tickAvg:[%d]"
            % (stats['sumRwd'], stats['sumReward_contact'], stats['sumReward_ctrl'],
               stats['sumReward_forward'], stats['sumReward_heading'],
               stats['sumReward_survive'], stats['tickAvg']))
        print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
        # Show and save a rollout every _PLOT_EVERY epochs
        if ((_epoch % _PLOT_EVERY) == 0) or (_epoch == (_maxEpoch - 1)):
            ret = run_episode_vid(self.env, self.policy, self.scaler, _maxSec=_maxSec)
            print(" [^] sumRwd:[%.3f] Xdisp:[%.3f] hDisp:[%.1f]"
                  % (np.asarray(ret['rewards']).sum(), ret['xDisp'], ret['hDisp']))
            if _MAKE_GIF:
                display_frames_as_gif(ret['frames'])
            if _SAVE_VID:
                outputdata = np.asarray(ret['frames']).astype(np.uint8)
                folderPath = 'vids/%s' % (self.name)
                if not os.path.exists(folderPath):
                    os.makedirs(folderPath)
                vidName = folderPath + '/rollout_ppo_epoch%03d.mp4' % (_epoch)
                skvideo.io.vwrite(vidName, outputdata)
                print("[%s] saved." % (vidName))
    print("Done.")
def train_dlpg(self, _sess, _seed=0, _maxEpoch=500, _batchSize=100,
               _nIter4update=1e3, _nPrevConsider=20, _nPrevBestQ2Add=50,
               _SAVE_VID=True, _MAKE_GIF=False, _PLOT_GRP=False, _PLOT_EVERY=5,
               _DO_RENDER=True, _SAVE_NET_EVERY=10):
    self.sess = _sess
    # Initialize VAE weights
    np.random.seed(_seed)
    tf.set_random_seed(_seed)
    self.sess.run(tf.global_variables_initializer())
    # Experience memory
    xList = np.zeros((_batchSize, self.env.actDim * self.nAnchor))
    qList = np.zeros((_batchSize))
    xLists = [''] * _maxEpoch  # per-epoch rollout storage
    qLists = [''] * _maxEpoch
    for _epoch in range(_maxEpoch):
        # Schedule eps-greedy-ish exploration (0.5 -> ~0.0); float() guards integer division
        priorProb = 0.5 * np.exp(-4.0 * (float(_epoch) / _maxEpoch) ** 2)
        levBtw = 0.9 + 0.05 * (1 - priorProb)  # schedule leveraged GRP (0.925 -> 0.95)
        xDispList, hDispList = np.zeros((_batchSize)), np.zeros((_batchSize))
        rSumList, rContactSumList, rCtrlSumList, rFwdSumList, rHeadingSumList, rSrvSumList = \
            np.zeros((_batchSize)), np.zeros((_batchSize)), np.zeros((_batchSize)), \
            np.zeros((_batchSize)), np.zeros((_batchSize)), np.zeros((_batchSize))
        for _iter in range(_batchSize):
            # np.random.seed(seed=(_seed+_epoch*_batchSize+_iter))
            # ------------------------------------------------------------------ #
            if (np.random.rand() < priorProb) or (_epoch == 0):
                # Sample from the GRP prior
                _, ret = self.unit_rollout_from_grp_prior(self.maxRepeat)
            else:
                # Sample from the posterior (VAE)
                sampledX = self.VAE.sample(_sess=self.sess).reshape(
                    (self.nAnchor, self.env.actDim))
                sampledX[-1, :] = sampledX[0, :]
                # Clip
                sampledX = np.clip(sampledX, a_min=-0.2, a_max=1.2)
                if self.NORMALIZE_SCALE:
                    sampledX = (sampledX - sampledX.min()) / (sampledX.max() - sampledX.min())
                self.set_anchor_grp_posterior(_anchors=sampledX, _levBtw=levBtw)
                _, ret = self.unit_rollout_from_grp_posterior(self.maxRepeat)
            # ------------------------------------------------------------------ #
            # Get anchor points of the previous rollout
            xInterp = self.get_anchor_from_traj(ret['sampledTraj'])
            xVec = np.reshape(xInterp, newshape=(1, -1))
            # Append rewards
            xList[_iter, :] = xVec
            qList[_iter] = np.asarray(ret['rewards']).sum()  # sum of rewards
            xDispList[_iter] = ret['xDisp']
            hDispList[_iter] = ret['hDisp']
            rSumList[_iter] = ret['rSum']
            rContactSumList[_iter] = ret['rContactSum']
            rCtrlSumList[_iter] = ret['rCtrlSum']
            rFwdSumList[_iter] = ret['rFwdSum']
            rHeadingSumList[_iter] = ret['rHeadingSum']
            rSrvSumList[_iter] = ret['rSrvSum']
        # Store this epoch's rollouts
        xLists[_epoch] = xList
        qLists[_epoch] = qList
        # Pool the rollouts of the previous _nPrevConsider epochs
        for _bIdx in range(0, _nPrevConsider):
            if _bIdx == 0:  # always include the current epoch
                xAccList = xList
                qAccList = qList
            else:
                xAccList = np.concatenate((xAccList, xLists[max(0, _epoch - _bIdx)]), axis=0)
                qAccList = np.concatenate((qAccList, qLists[max(0, _epoch - _bIdx)]))
        # Add the highest-Q episodes (_nPrevBestQ2Add)
        nAddPrevBest = _nPrevBestQ2Add
        sortedIdx = np.argsort(-qAccList)
        xTrain = xAccList[sortedIdx[:nAddPrevBest], :]
        qTrain = qAccList[sortedIdx[:nAddPrevBest]]
        # Add the current episodes (_batchSize)
        xTrain = np.concatenate((xTrain, xList), axis=0)
        qTrain = np.concatenate((qTrain, qList))
        # Add random episodes (nRandomAdd = _batchSize // 5)
        nRandomAdd = _batchSize // 5
        randIdx = np.random.permutation(xAccList.shape[0])[:nRandomAdd]
        xRand = xAccList[randIdx, :]
        qRand = qAccList[randIdx]
        xTrain = np.concatenate((xTrain, xRand), axis=0)
        qTrain = np.concatenate((qTrain, qRand))
        # Train the VAE
        self.qScaler.reset()  # reset every update
        self.qScaler.update(qTrain)  # update Q scaler
        qScale, qOffset = self.qScaler.get()  # scaler
        scaledQ = qScale * (qTrain - qOffset)
        # print (scaledQ)
        self.VAE.train(_sess=self.sess, _X=xTrain, _Y=None, _C=None, _Q=scaledQ,
                       _maxIter=_nIter4update, _batchSize=128,
                       _PRINT_EVERY=(_nIter4update // 5), _PLOT_EVERY=0,
                       _KL_SCHEDULE=True, _INIT_VAR=False)
        # Print
        str2print = (
            "[%d/%d](#total:%d) avgQ:[%.3f] XdispMean:[%.3f] XdispVar:[%.3f] absHdispMean:[%.1f] priorProb:[%.2f]"
            % (_epoch, _maxEpoch, (_epoch + 1) * _batchSize, qList.mean(),
               xDispList.mean(), xDispList.var(), np.abs(hDispList).mean(), priorProb))
        print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
        str2print = (
            " rSum:[%.3f] = (contact:%.3f+ctrl:%.3f+fwd:%.3f+heading:%.3f+survive:%.3f) [rSumMax:%.3f]"
            % (rSumList.mean(), rContactSumList.mean(), rCtrlSumList.mean(),
               rFwdSumList.mean(), rHeadingSumList.mean(), rSrvSumList.mean(),
               rSumList.max()))
        print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
        # Print the current Q using the GRP mean
        stats = self.get_current_stats()
        str2print = (
            "[Stat] Q:[%.3f]=(cnt:%.1f+ctrl:%.1f+fwd:%.1f+hd:%.1f+srv:%.1f) xDispMean:[%.3f] hAbsMean:[%.1f]"
            % (stats['rSumMean'], stats['rContactSumMean'], stats['rCtrlSumMean'],
               stats['rFwdSumMean'], stats['rHeadingSumMean'], stats['rSrvSumMean'],
               stats['xDispMean'], stats['hAbsMean']))
        print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
        """
        stats = {'xDispMean':xDispMean,'hAbsMean':hAbsMean,'hSqMean':hSqMean,'rSumMean':rSumMean,
                 'rContactSumMean':rContactSumMean,'rCtrlSumMean':rCtrlSumMean,'rFwdSumMean':rFwdSumMean,
                 'rHeadingSumMean':rHeadingSumMean,'rSrvSumMean':rSrvSumMean}
        """
        # Show and save a rollout every _PLOT_EVERY epochs
        if ((_epoch % _PLOT_EVERY) == 0) or (_epoch == (_maxEpoch - 1)):
            # Rollout from the GRP mean
            sampledX = self.VAE.sample(_sess=self.sess).reshape(
                (self.nAnchor, self.env.actDim))
            sampledX[-1, :] = sampledX[0, :]
            # Clip
            sampledX = np.clip(sampledX, a_min=-0.2, a_max=1.2)
            if self.NORMALIZE_SCALE:
                sampledX = (sampledX - sampledX.min()) / (sampledX.max() - sampledX.min())
            self.set_anchor_grp_posterior(_anchors=sampledX, _levBtw=levBtw)
            _, ret = self.unit_rollout_from_grp_mean(_maxRepeat=self.maxRepeat,
                                                     _DO_RENDER=_DO_RENDER)
            str2print = (
                " [GRP mean] sumRwd:%.3f=(cntct:%.2f+ctrl:%.2f+fwd:%.2f+hd:%.2f+srv:%.2f) xD:[%.3f] hD:[%.1f]"
                % (ret['rSum'], ret['rContactSum'], ret['rCtrlSum'], ret['rFwdSum'],
                   ret['rHeadingSum'], ret['rSrvSum'], ret['xDisp'], ret['hDisp']))
            print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
            # Make a video using the GRP mean path
            if _SAVE_VID:
                outputdata = np.asarray(ret['frames']).astype(np.uint8)
                folderPath = 'vids/%s' % (self.name)
                if not os.path.exists(folderPath):
                    os.makedirs(folderPath)
                vidName = folderPath + '/rollout_dlpg_epoch%03d.mp4' % (_epoch)
                skvideo.io.vwrite(vidName, outputdata)
                str2print = (" Video [%s] saved." % (vidName))
                print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
            # Make a GIF
            if _MAKE_GIF:
                NSKIP = 3  # subsample frames to save memory
                display_frames_as_gif(ret['frames'][::NSKIP], _intv_ms=20,
                                      _figsize=(8, 8), _fontsize=15,
                                      _titleStrs=ret['titleStrs'][::NSKIP])
            # Plot sampled trajectories
            if _PLOT_GRP:
                nrTrajectories2plot = 5
                for _i in range(nrTrajectories2plot):
                    # np.random.seed(seed=_i)
                    sampledX = self.VAE.sample(_sess=self.sess).reshape(
                        (self.nAnchor, self.env.actDim))
                    sampledX[-1, :] = sampledX[0, :]
                    # Clip
                    sampledX = np.clip(sampledX, a_min=-0.2, a_max=1.2)
                    if self.NORMALIZE_SCALE:
                        sampledX = (sampledX - sampledX.min()) / (sampledX.max() - sampledX.min())
                    self.set_anchor_grp_posterior(_anchors=sampledX, _levBtw=levBtw)
                    fig = self.GRPposterior.plot_all(_nPath=1, _figsize=(8, 3))
                    # Save the image
                    folderPath = 'pics/%s' % (self.name)
                    if not os.path.exists(folderPath):
                        os.makedirs(folderPath)
                    saveName = folderPath + '/grp_epoch%04d_%d.png' % (_epoch, _i)
                    fig.savefig(saveName)
                    # Rollout
                    _, ret = self.unit_rollout_from_grp_mean(_maxRepeat=self.maxRepeat,
                                                             _DO_RENDER=False)
                    str2print = (
                        " [GRP-%d] sumRwd:%.3f=(cntct:%.2f+ctrl:%.2f+fwd:%.2f+hd:%.2f+srv:%.2f) xD:[%.3f] hD:[%.1f]"
                        % (_i, ret['rSum'], ret['rContactSum'], ret['rCtrlSum'],
                           ret['rFwdSum'], ret['rHeadingSum'], ret['rSrvSum'],
                           ret['xDisp'], ret['hDisp']))
                    print_n_txt(_f=self.f, _chars=str2print, _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
        # Save the network every _SAVE_NET_EVERY epochs
        if ((_epoch % _SAVE_NET_EVERY) == 0) or (_epoch == (_maxEpoch - 1)):
            folderPath = 'nets/%s' % (self.name)
            if not os.path.exists(folderPath):
                os.makedirs(folderPath)
            saveName = folderPath + '/net_dlpg_epoch%04d.npz' % (_epoch)
            self.save_net(_sess=_sess, _savename=saveName)
def __init__(self, _name='Ant', _headingCoef=1e-4, _tMax=3, _nAnchor=20,
             _maxRepeat=3, _hypGainPrior=1.0/3.0, _hypLenPrior=1.0/4.0,
             _hypGainPost=1.0/3.0, _hypLenPost=1.0/4.0, _levBtw=0.8, _pGain=0.01,
             _zDim=16, _hDims=[64, 64], _vaeActv=tf.nn.elu,
             _vaeOutActv=tf.nn.sigmoid, _vaeQactv=None, _entRegCoef=1e-2,
             _klMinTh=0.0, _PLOT_GRP=True, _SAVE_TXT=True, _VERBOSE=True):
    # Some parameters
    self.name = _name
    self.headingCoef = _headingCoef
    self.tMin = 0
    self.tMax = _tMax
    self.nAnchor = _nAnchor
    self.maxRepeat = _maxRepeat
    self.SAVE_TXT = _SAVE_TXT
    self.VERBOSE = _VERBOSE
    # Normalize trajectory
    self.NORMALIZE_SCALE = False
    if self.SAVE_TXT:
        folderPath = 'results'
        if not os.path.exists(folderPath):
            os.makedirs(folderPath)
        txtName = 'results/' + self.name + '.txt'
        self.f = open(txtName, 'w')  # Open txt file
        print_n_txt(_f=self.f, _chars='Text name: ' + txtName,
                    _DO_PRINT=True, _DO_SAVE=self.SAVE_TXT)
    # Initialize the Ant gym environment
    self.env = AntEnvCustom(_headingCoef=self.headingCoef)
    # GRP sampler (prior)
    nDataPrior = 2
    nTest = int((self.tMax - self.tMin) / self.env.dt)
    tData = np.linspace(start=self.tMin, stop=self.tMax, num=nDataPrior).reshape((-1, 1))
    xData = np.random.rand(nDataPrior, self.env.actDim)  # random positions
    # xData[0,:] = (xData[0,:]+xData[-1,:])/2.0
    xData[-1, :] = xData[0, :]
    lData = np.ones(shape=(nDataPrior, 1))
    tTest = np.linspace(start=self.tMin, stop=self.tMax, num=nTest).reshape((-1, 1))
    lTest = np.ones(shape=(nTest, 1))
    # hyp = {'gain':1/3,'len':1/4,'noise':1e-8}  # <= this worked fine
    hypPrior = {'gain': _hypGainPrior, 'len': _hypLenPrior, 'noise': 1e-10}
    self.GRPprior = lgrp_class(_name='GPR Prior', _tData=tData, _xData=xData,
                               _lData=lData, _tTest=tTest, _lTest=lTest, _hyp=hypPrior)
    # GRP posterior
    tData = np.linspace(start=self.tMin, stop=self.tMax, num=self.nAnchor).reshape((-1, 1))
    xData = np.random.rand(self.nAnchor, self.env.actDim)  # random positions
    lData = np.ones(shape=(self.nAnchor, 1))
    lData[1:self.nAnchor - 1] = _levBtw
    hypPost = {'gain': _hypGainPost, 'len': _hypLenPost, 'noise': 1e-10}
    self.GRPposterior = lgrp_class(_name='GPR Posterior', _tData=tData, _xData=xData,
                                   _lData=lData, _tTest=tTest, _lTest=lTest, _hyp=hypPost)
    if _PLOT_GRP:
        self.GRPprior.plot_all(_nPath=10, _figsize=(12, 4))
        self.GRPposterior.plot_all(_nPath=10, _figsize=(12, 4))
    # PID controller (Kp=0.01, Ki=0.00001, Kd=0.002, windup=5000)
    self.PID = PID_class(Kp=_pGain, Ki=0.00001, Kd=0.002, windup=5000,
                         sample_time=self.env.dt, dim=self.env.actDim)
    # VAE (this will be our policy function)
    optm = tf.train.AdamOptimizer
    optmParam = {'lr': 0.0005, 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8}
    # optm = tf.train.GradientDescentOptimizer
    # optmParam = {'lr':0.002}
    self.VAE = vae_class(_name=self.name, _xDim=self.nAnchor * self.env.actDim,
                         _zDim=_zDim, _hDims=_hDims, _cDim=0, _actv=_vaeActv,
                         _outActv=_vaeOutActv, _qActv=_vaeQactv, _bn=None,
                         _entRegCoef=_entRegCoef, _klMinTh=_klMinTh,
                         _optimizer=optm, _optm_param=optmParam, _VERBOSE=False)
    # Reward scaler
    self.qScaler = Scaler(1)
    # Check parameters
    self.check_params()
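# Hedged usage sketch: the enclosing class is not named in this file, so
# 'DLPGAgent' below is a placeholder for whatever this __init__ belongs to.
# A typical run would look like:
#
#   agent = DLPGAgent(_name='ant_dlpg', _headingCoef=1e-4, _PLOT_GRP=False)
#   with tf.Session() as sess:
#       agent.train_dlpg(_sess=sess, _seed=0, _maxEpoch=500, _batchSize=100)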