def __init__(self, env, num_inputs, action_space, args, running_state=None):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha
    self.args = args
    self.env = env
    self.running_state = running_state
    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning
    self.device = args.device

    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning == True:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
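All of the snippets below call hard_update (and several call soft_update) to synchronize target networks, but neither helper is defined here. A minimal sketch of the usual Polyak-style implementations, assuming the (target, source) argument order used in the calls above:

import torch

def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)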
def __init__(self, actor, critic, memory, algo='ddpg', GAMMA=GAMMA, LR=LR, TAU=TAU,
             use_cuda=USE_CUDA, BATCH_SIZE=BATCH_SIZE, MIN_MEMORY=1e3):
    self.actor = actor
    self.critic = critic
    self.target_actor = copy.deepcopy(actor)
    self.target_critic = copy.deepcopy(critic)

    self.use_cuda = use_cuda
    if self.use_cuda:
        self.actor = self.actor.cuda()
        self.target_actor = self.target_actor.cuda()
        self.critic = self.critic.cuda()
        self.target_critic = self.target_critic.cuda()

    self.memory = memory
    self.gamma = GAMMA
    self.lr = LR
    self.tau = TAU
    self.batch_size = BATCH_SIZE
    self.MIN_MEMORY = MIN_MEMORY

    self.mutex = Lock()
    self.noise = OrnsteinUhlenbeckNoise(self.actor.action_dim)

    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.algo = algo
    if self.algo == 'pddpg':
        self.previous_actor = copy.deepcopy(self.actor)
        self.epsilon = 0.2
        if self.use_cuda:
            self.previous_actor = self.previous_actor.cuda()
def __init__(self, actor, critic, memory, algo='ddpg', GAMMA=GAMMA, LR=LR, TAU=TAU,
             use_cuda=USE_CUDA, BATCH_SIZE=BATCH_SIZE, MIN_MEMORY=1e3):
    self.actor = actor
    self.critic = critic
    self.target_actor = copy.deepcopy(actor)
    self.target_critic = copy.deepcopy(critic)

    self.use_cuda = use_cuda
    if self.use_cuda:
        self.actor = self.actor.cuda()
        self.target_actor = self.target_actor.cuda()
        self.critic = self.critic.cuda()
        self.target_critic = self.target_critic.cuda()

    self.memory = memory
    self.gamma = GAMMA
    self.lr = LR
    self.tau = TAU
    self.batch_size = BATCH_SIZE
    self.MIN_MEMORY = MIN_MEMORY

    self.optimizer_actor = optim.Adam(self.actor.parameters(), self.lr)
    self.optimizer_critic = optim.Adam(self.critic.parameters(), self.lr * 1e1)

    self.noise = OrnsteinUhlenbeckNoise(self.actor.action_dim)

    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.algo = algo
def __init__(self, index, model, env, memory, lr=1e-3, preprocess=T.ToTensor(), path=None, frompath=None,
             num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=10, TAU=1e-3):
    self.index = index
    self.model = model
    self.wmodel = copy.deepcopy(model)
    hard_update(self.wmodel, self.model)

    global use_cuda
    if use_cuda:
        self.wmodel = self.wmodel.cuda()

    self.envstr = env
    self.env = gym.make(self.envstr)
    self.env.reset()

    self.memory = memory
    self.lr = lr
    self.TAU = TAU

    self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
    bashlogger.info('Optimizer {}: ok.'.format(self.index))

    self.preprocess = preprocess
    self.path = path
    self.frompath = frompath
    self.num_episodes = num_episodes
    self.epsend = epsend
    self.epsstart = epsstart
    self.epsdecay = epsdecay

    self.sl = statsLogger(path=self.path, filename='logs{}.csv'.format(self.index))
    self.workerfn = lambda: self.train(model=self.wmodel,
                                       env=self.env,
                                       memory=self.memory,
                                       optimizer=self.optimizer,
                                       logger=self.sl,
                                       preprocess=self.preprocess,
                                       path=self.path,
                                       frompath=self.frompath,
                                       num_episodes=self.num_episodes,
                                       epsend=self.epsend,
                                       epsstart=self.epsstart,
                                       epsdecay=self.epsdecay)
    self.thread = threading.Thread(target=self.workerfn)
def from_worker2model(self):
    self.model.lock()

    self.optimizer.zero_grad()

    # weight decay on the main model's parameters :
    decay_loss = 0.5 * sum([torch.mean(param * param) for param in self.model.parameters()])
    decay_loss.backward()

    # accumulate the working model's gradients into the main model :
    for wparam, mparam in zip(self.wmodel.parameters(), self.model.parameters()):
        if mparam.grad is not None:
            mparam.grad = mparam.grad + wparam.grad

    self.optimizer.step()

    # update wmodel :
    hard_update(self.wmodel, self.model)
    # zero the working model gradients :
    self.wmodel.zero_grad()

    self.model.unlock()
def __init__(self, NN, memory, algo='ddpg', GAMMA=GAMMA, LR=LR, TAU=TAU,
             use_cuda=USE_CUDA, BATCH_SIZE=BATCH_SIZE):
    self.NN = NN
    self.target_NN = copy.deepcopy(NN)

    self.use_cuda = use_cuda
    if self.use_cuda:
        self.NN = self.NN.cuda()
        self.target_NN = self.target_NN.cuda()

    self.memory = memory
    self.gamma = GAMMA
    self.lr = LR
    self.tau = TAU
    self.batch_size = BATCH_SIZE

    self.optimizer = optim.Adam(self.NN.parameters(), self.lr)
    self.noise = OrnsteinUhlenbeckNoise(self.NN.action_dim)

    hard_update(self.target_NN, self.NN)

    self.algo = algo
def __init__(self, NN, memory, algo='ddpg', GAMMA=GAMMA, LR=LR, TAU=TAU,
             use_cuda=USE_CUDA, BATCH_SIZE=BATCH_SIZE, MIN_MEMORY=1e3):
    self.NN = NN
    self.target_NN = copy.deepcopy(NN)

    self.use_cuda = use_cuda
    if self.use_cuda:
        self.NN = self.NN.cuda()
        self.target_NN = self.target_NN.cuda()

    self.memory = memory
    self.gamma = GAMMA
    self.lr = LR
    self.tau = TAU
    self.batch_size = BATCH_SIZE
    self.MIN_MEMORY = MIN_MEMORY

    self.mutex = Lock()
    self.noise = OrnsteinUhlenbeckNoise(self.NN.action_dim)

    hard_update(self.target_NN, self.NN)

    self.algo = algo
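The DDPG-style constructors above build an OrnsteinUhlenbeckNoise(action_dim) exploration process that is never defined in these snippets. A minimal sketch of a standard OU process; the constructor defaults (mu, theta, sigma) are illustrative assumptions, not values taken from this code:

import numpy as np

class OrnsteinUhlenbeckNoise:
    # Temporally correlated exploration noise, as commonly paired with DDPG.
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then x <- x + dx
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state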
def __init__(self, state_dim, action_dim, option_dim, max_action, action_space):
    self.alpha = 0.2
    self.lr = 0.0003
    self.option_num = option_dim
    self.policy_type = "Gaussian"
    self.target_update_interval = 1
    self.automatic_entropy_tuning = True
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    """ critic network """
    self.critic = QNetwork(state_dim, action_dim, 400).to(device=self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)
    self.critic_target = QNetwork(state_dim, action_dim, 400).to(self.device)
    hard_update(self.critic_target, self.critic)

    # BUG: `state` is not defined in this scope; a state batch would have to be passed in.
    self.sampling_prob = torch.FloatTensor(state).to(self.device)

    # ===================================================================== #
    #                              Option Model                             #
    # ===================================================================== #
    # NOTE: the block below is written against a TensorFlow/Keras graph
    # (tf.stop_gradient, K.mean, metrics.mean_absolute_error, tf.train.AdamOptimizer)
    # while the rest of the class is PyTorch, and it references attributes
    # (target_q_value, predicted_v_value, c_ent, entropy_coeff, c_reg, option_lr)
    # that are not set here; it will not run as-is.
    self.option_state_input, self.option_action_input, self.option_input_concat, self.option_out_dec, \
        self.option_out, self.option_out_noise, self.option_model = self.create_option_model()
    # `np.stop_gradient` does not exist; tf.stop_gradient is presumably what was intended.
    Advantage = tf.stop_gradient(self.target_q_value - self.predicted_v_value)
    Weight = np.divide(np.exp(Advantage - np.max(Advantage)), self.sampling_prob)
    W_norm = Weight / K.mean(Weight)
    critic_conditional_entropy = weighted_entropy(self.option_out, tf.stop_gradient(W_norm))
    p_weighted_ave = weighted_mean(self.option_out, tf.stop_gradient(W_norm))
    self.critic_entropy = critic_conditional_entropy - self.c_ent * entropy(p_weighted_ave)
    self.vat_loss = kl(self.option_out, self.option_out_noise)
    self.reg_loss = metrics.mean_absolute_error(self.option_input_concat, self.option_out_dec)
    self.option_loss = self.reg_loss + self.entropy_coeff * self.critic_entropy + self.c_reg * self.vat_loss
    self.option_optimize = tf.train.AdamOptimizer(self.option_lr).minimize(self.option_loss)

    """ option network """
    self.it = 0

    if self.policy_type == "Gaussian":
        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning == True:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)
        self.policy = GaussianPolicy(state_dim, action_dim, 400, max_action).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
    elif self.policy_type == "Multi_Gaussian":
        if self.automatic_entropy_tuning == True:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)
        self.policy = GaussianPolicy(state_dim, action_dim, 400, max_action).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
    else:
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(state_dim, action_dim, 400, max_action).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
def train(self, model, env, memory, optimizer, logger=None, preprocess=T.ToTensor(), path=None, frompath=None,
          num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=200, k=4, strategy='future', singlegoal=False):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []

        global rendering
        global use_cuda
        global MAX_STEPS

        # exploration counter :
        steps_done = [0]

        # Double Network initialization :
        savemodel(model, path + '.save')
        model_ = copy.deepcopy(model)
        hard_update(model_, model)
        model_.eval()
        if use_cuda:
            model_ = model_.cuda()

        self.accumulateMemory(memory, env, model, preprocess, epsstart=0.5, epsend=0.3, epsdecay=200, k=k, strategy=strategy)

        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen

            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []

            # HER : sample initial goal :
            if not singlegoal:
                init_goal = sample_init_goal(memory)
            else:
                init_goal = torch.zeros(current_screen.size())

            showcount = 0

            for t in count():
                # model.eval()

                # HER : concatenate the goal to the state :
                stategoal = torch.cat([state, init_goal], dim=1)

                # action = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
                action = select_action(model, stategoal, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)

                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward

                if rendering:
                    if showcount >= 10:
                        showcount = 0
                        render(current_screen)  # env.render()
                    else:
                        showcount += 1

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())

                episode_buffer.append(EXP(state, action, next_state, reward, done))

                state = next_state

                # OPTIMIZE MODEL :
                since = time.time()
                lossnp = self.optimize_model(model, model_, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)

                # SOFT UPDATE :
                soft_update(model_, model, self.TAU)

                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)

                if done or t > MAX_STEPS:
                    self.from_worker2model()

                    '''
                    nbrTrain = 200
                    for it in range(nbrTrain) :
                        since = time.time()
                        lossnp = optimize_model(model,model_,memory,optimizer)
                        if lossnp is not None :
                            episode_loss_buffer.append( np.mean(lossnp) )
                        else :
                            episode_loss_buffer.append(0)
                        elt = time.time() - since
                        f = 1.0/elt
                        meanfreq = (meanfreq*(it+1) + f)/(it+2)
                        #print('{} Hz ; {} seconds.'.format(f,elt) )
                    '''

                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)

                    log = 'Episode duration : {}'.format(t + 1) + '---' \
                        + ' Reward : {} // Mean Loss : {}'.format(cumul_reward, meanloss) + '---' \
                        + ' {}Hz'.format(meanfreq)
                    bashlogger.info(log)

                    if logger is not None:
                        new = {'episodes': [i], 'duration': [t + 1], 'reward': [cumul_reward],
                               'mean frequency': [meanfreq], 'loss': [meanloss]}
                        logger.append(new)

                    if path is not None:
                        # SAVE THE MAIN MODEL :
                        self.model.lock()
                        savemodel(self.model, path + '.save')
                        self.model.unlock()
                        bashlogger.info('Model saved : {}'.format(path))
                    # plot_durations()
                    break

            # Let us add this episode_buffer to the replayBuffer :
            for itexp in range(len(episode_buffer)):
                el = episode_buffer[itexp]

                # HER : reward with init_goal :
                HERreward = reward_function(el.state, init_goal)
                reward = HERreward + el.reward

                # store this transition with init_goal :
                init_el = EXP(state=torch.cat([el.state, init_goal], dim=1),
                              action=el.action,
                              next_state=torch.cat([el.next_state, init_goal], dim=1),
                              reward=reward,
                              done=el.done)
                init_priority = memory.priority(torch.abs(init_el.reward).numpy())
                memory.add(init_el, init_priority)

                # store for multiple goals :
                # 1: sample new goal :
                goals = []
                for j in range(k):
                    goal = None
                    if strategy == 'final':
                        goal = sample_goal(episode_buffer, strategy=strategy)
                    elif strategy == 'future':
                        # watch out for the empty set...
                        index = min(len(episode_buffer) - 3, itexp)
                        goal = sample_goal(episode_buffer, strategy=index)
                    goals.append(goal)

                # For each goal ...
                for goal in goals:
                    # 2: ... compute reward :
                    goalreward = reward_function(el.state, goal) + el.reward
                    # 3: ... store this transition with goal :
                    goalel = EXP(state=torch.cat([el.state, goal], dim=1),
                                 action=el.action,
                                 next_state=torch.cat([el.next_state, goal], dim=1),
                                 reward=goalreward,
                                 done=el.done)
                    init_priority = memory.priority(torch.abs(goalel.reward).numpy())
                    memory.add(goalel, init_priority)

                del el
                del goals

            del episode_buffer

        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path + '.save')
            bashlogger.info('Model saved : {}'.format(path))

        env.close()
    except Exception as e:
        bashlogger.exception(e)
def train(self, model, env, memory, optimizer, logger=None, preprocess=T.ToTensor(), path=None, frompath=None,
          num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=10):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []

        global rendering
        global use_cuda
        global MAX_STEPS

        # exploration counter :
        steps_done = [0]

        # Double Network initialization :
        savemodel(model, path + '.save')
        # model_ = DuelingDQN(model.nbr_actions)
        model_ = copy.deepcopy(model)
        hard_update(model_, model)
        model_.eval()
        if use_cuda:
            model_ = model_.cuda()

        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen

            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []
            episode_qsa_buffer = []

            showcount = 0

            for t in count():
                action, qsa = select_action(model, state, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                episode_qsa_buffer.append(qsa)

                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward

                if rendering:
                    if showcount >= 10:
                        showcount = 0
                        render(current_screen)  # env.render()
                    else:
                        showcount += 1

                reward = FloatTensor([reward])

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())

                episode_buffer.append(EXP(state, action, next_state, reward, done))

                state = next_state

                # OPTIMIZE MODEL :
                since = time.time()
                lossnp = self.optimize_model(model, model_, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)

                # SOFT UPDATE :
                soft_update(model_, model, self.TAU)

                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)

                if done or t > MAX_STEPS:
                    self.from_worker2model()

                    '''
                    nbrTrain = 2
                    for tr in range(nbrTrain) :
                        since = time.time()
                        lossnp = optimize_model(model,model_,memory,optimizer)
                        if lossnp is not None :
                            episode_loss_buffer.append( np.mean(lossnp) )
                        else :
                            episode_loss_buffer.append(0)
                        elt = time.time() - since
                        f = 1.0/elt
                        meanfreq = (meanfreq*(tr+1) + f)/(tr+2)
                        #print('{} Hz ; {} seconds.'.format(f,elt) )
                    '''

                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)
                    meanqsa = np.mean(episode_qsa_buffer)

                    log = 'Episode duration : {}'.format(t + 1) + '---' \
                        + ' Reward : {} // Mean Loss : {} // QSA : {}'.format(cumul_reward, meanloss, meanqsa) + '---' \
                        + ' {}Hz'.format(meanfreq)
                    bashlogger.info(log)

                    if logger is not None:
                        new = {'episodes': [i], 'duration': [t + 1], 'reward': [cumul_reward],
                               'mean frequency': [meanfreq], 'loss': [meanloss]}
                        logger.append(new)

                    if path is not None:
                        # SAVE THE MAIN MODEL :
                        self.model.lock()
                        savemodel(self.model, path + '.save')
                        self.model.unlock()
                        bashlogger.info('Model saved : {}'.format(path))
                    # plot_durations()
                    break

            # Let us add this episode_buffer to the replayBuffer :
            for el in episode_buffer:
                init_priority = memory.priority(torch.abs(el.reward).cpu().numpy())
                memory.add(el, init_priority)
            del episode_buffer

        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path + '.save')
            bashlogger.info('Model saved : {}'.format(path))

        env.close()
    except Exception as e:
        bashlogger.exception(e)
def __init__(self, logger, obs_dim, action_space, userconfig):
    super().__init__(logger=logger, obs_dim=obs_dim, action_dim=4, userconfig=userconfig)
    self.action_space = action_space
    self.device = userconfig['device']
    self.alpha = userconfig['alpha']
    self.automatic_entropy_tuning = self._config['automatic_entropy_tuning']
    self.eval_mode = False

    if self._config['lr_milestones'] is None:
        raise ValueError('lr_milestones argument cannot be None!\nExample: --lr_milestones=100 200 300')

    lr_milestones = [int(x) for x in (self._config['lr_milestones'][0]).split(' ')]

    self.actor = ActorNetwork(input_dims=obs_dim,
                              learning_rate=self._config['learning_rate'],
                              action_space=self.action_space,
                              hidden_sizes=[256, 256],
                              lr_milestones=lr_milestones,
                              lr_factor=self._config['lr_factor'],
                              device=self._config['device'])

    self.critic = CriticNetwork(input_dim=obs_dim,
                                n_actions=4,
                                learning_rate=self._config['learning_rate'],
                                hidden_sizes=[256, 256],
                                lr_milestones=lr_milestones,
                                lr_factor=self._config['lr_factor'],
                                device=self._config['device'])

    self.critic_target = CriticNetwork(input_dim=obs_dim,
                                       n_actions=4,
                                       learning_rate=self._config['learning_rate'],
                                       hidden_sizes=[256, 256],
                                       lr_milestones=lr_milestones,
                                       device=self._config['device'])

    hard_update(self.critic_target, self.critic)

    if self.automatic_entropy_tuning:
        milestones = [int(x) for x in (self._config['alpha_milestones'][0]).split(' ')]
        self.target_entropy = -torch.tensor(4).to(self.device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self._config['alpha_lr'])
        self.alpha_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.alpha_optim, milestones=milestones, gamma=0.5)
def optimize(self, optimizer_critic, optimizer_actor):
    '''
    self.target_critic.eval()
    self.target_actor.eval()
    self.critic.train()
    self.actor.train()
    '''
    if self.algo == 'ddpg':
        try:
            if len(self.memory) < self.MIN_MEMORY:
                return

            # Create Batch
            self.mutex.acquire()
            if isinstance(self.memory, PrioritizedReplayBuffer):
                # with PR :
                prioritysum = self.memory.total()
                randexp = np.random.random(size=self.batch_size) * prioritysum
                batch = list()
                for i in range(self.batch_size):
                    try:
                        el = self.memory.get(randexp[i])
                        batch.append(el)
                    except TypeError as e:
                        continue
                        # print('REPLAY BUFFER EXCEPTION...')
            else:
                # with random RB :
                batch = self.memory.sample(self.batch_size)
            self.mutex.release()

            if len(batch) == 0:
                return

            # Create Batch with replayMemory :
            batch = TransitionPR(*zip(*batch))
            next_state_batch = Variable(torch.cat(batch.next_state))  # , requires_grad=False)
            state_batch = Variable(torch.cat(batch.state))  # , requires_grad=False)
            action_batch = Variable(torch.cat(batch.action))  # , requires_grad=False)
            reward_batch = Variable(torch.cat(batch.reward))  # , requires_grad=False ).view((-1))
            terminal_batch = Variable(torch.cat(batch.done))
            '''
            next_state_batch = Variable(torch.cat( batch.next_state) )
            state_batch = Variable( torch.cat( batch.state) )
            action_batch = Variable( torch.cat( batch.action) )
            reward_batch = Variable( torch.cat( batch.reward ) ).view((-1,1))
            '''
            if self.use_cuda:
                next_state_batch = next_state_batch.cuda()
                state_batch = state_batch.cuda()
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()
                terminal_batch = terminal_batch.cuda()

            # Critic :
            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            # sample action from next_state, without gradient repercussion :
            next_taction = self.target_actor(next_state_batch).detach()
            # evaluate the next state action over the target, without repercussion (faster...) :
            next_tqsa = torch.squeeze(self.target_critic(next_state_batch, next_taction).detach()).view((-1))

            # Supervised loss :
            ## y_true :
            y_true = reward_batch + (1.0 - terminal_batch) * self.gamma * next_tqsa
            # print(torch.cat([y_true.view((-1,1)),terminal_batch.view((-1,1))],dim=1) )
            ## y_pred :
            y_pred = torch.squeeze(self.critic(state_batch, action_batch))
            ## loss :
            critic_loss = F.smooth_l1_loss(y_pred, y_true)
            # criterion = nn.MSELoss()
            # critic_loss = criterion(y_pred,y_true)
            critic_loss.backward()

            # weight decay :
            decay_loss = 0.5 * sum([torch.mean(param * param) for param in self.critic.parameters()])
            decay_loss.backward()

            # clamping :
            # torch.nn.utils.clip_grad_norm(self.critic.parameters(),50)
            optimizer_critic.step()
            ###################################

            '''
            # Actor :
            #predict action :
            pred_action = self.actor(state_batch)
            #predict associated qvalues :
            pred_qsa = self.critic(state_batch, pred_action)
            #pred_qsa = self.target_critic(state_batch, pred_action)
            # loss :
            actor_loss = -1.0*torch.mean(torch.sum( pred_qsa) )
            #actor_loss = F.smooth_l1_loss( pred_qsa, Variable(torch.zeros(pred_qsa.size() )).cuda() )
            #criterion = nn.MSELoss()
            #actor_loss = criterion( pred_qsa, Variable(torch.zeros(pred_qsa.size() )).cuda() )
            #before optimization :
            optimizer_actor.zero_grad()
            self.actor.zero_grad()
            actor_loss.backward()
            #clamping :
            #clampactor = 1e2#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
            #torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)
            optimizer_actor.step()
            ###################################
            '''

            ###################################
            # Actor :
            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            # predict action :
            pred_action = self.actor(state_batch)
            var_action = Variable(pred_action.cpu().data, requires_grad=True)
            if self.use_cuda:
                var_action_c = var_action.cuda()
                pred_qsa = self.critic(state_batch, var_action_c)
            else:
                pred_qsa = self.critic(state_batch, var_action)

            # predict associated qvalues and backpropagate dQ/da through the actor :
            gradout = torch.ones(pred_qsa.size())
            if self.use_cuda:
                gradout = gradout.cuda()
            pred_qsa.backward(gradout)

            if self.use_cuda:
                gradcritic = var_action.grad.data.cuda()
            else:
                gradcritic = var_action.grad.data
            pred_action.backward(-gradcritic)

            # weight decay :
            decay_loss = 0.5 * sum([torch.mean(param * param) for param in self.actor.parameters()])
            decay_loss.backward()

            # clamping :
            clampactor = 5e0  # np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), clampactor)
            optimizer_actor.step()

            # loss :
            actor_loss = -1.0 * torch.mean(torch.sum(pred_qsa))
            ###################################

            '''
            critic_grad = 0.0
            for p in self.critic.parameters() :
                critic_grad += np.mean(p.grad.cpu().data.numpy())
            print( 'Mean Critic Grad : {}'.format(critic_grad) )
            '''
            actor_grad = 0.0
            for p in self.actor.parameters():
                actor_grad += np.max(np.abs(p.grad.cpu().data.numpy()))
            # print( 'Mean Actor Grad : {}'.format(actor_grad) )

            # UPDATE THE PR :
            if isinstance(self.memory, PrioritizedReplayBuffer):
                self.mutex.acquire()
                loss = torch.abs(actor_loss) + torch.abs(critic_loss)
                # loss = torch.abs(actor_loss) #+ torch.abs(critic_loss)
                loss_np = loss.cpu().data.numpy()
                for (idx, new_error) in zip(batch.idx, loss_np):
                    new_priority = self.memory.priority(new_error)
                    # print( 'prior = {} / {}'.format(new_priority,self.rBuffer.total()) )
                    self.memory.update(idx, new_priority)
                self.mutex.release()
        except Exception as e:
            bashlogger.debug('error : {}'.format(e))
            raise e

        # soft update :
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

        closs = critic_loss.cpu()
        aloss = actor_loss.cpu()
        return closs.data.numpy(), aloss.data.numpy(), actor_grad

    elif self.algo == 'pddpg':
        try:
            if len(self.memory) < self.MIN_MEMORY:
                return

            # Create Batch
            self.mutex.acquire()
            if isinstance(self.memory, PrioritizedReplayBuffer):
                # with PR :
                prioritysum = self.memory.total()
                randexp = np.random.random(size=self.batch_size) * prioritysum
                batch = list()
                for i in range(self.batch_size):
                    try:
                        el = self.memory.get(randexp[i])
                        batch.append(el)
                    except TypeError as e:
                        continue
                        # print('REPLAY BUFFER EXCEPTION...')
            else:
                # with random RB :
                batch = self.memory.sample(self.batch_size)
            self.mutex.release()

            if len(batch) == 0:
                return

            # Create Batch with replayMemory :
            batch = TransitionPR(*zip(*batch))
            next_state_batch = Variable(torch.cat(batch.next_state))  # , requires_grad=False)
            state_batch = Variable(torch.cat(batch.state))  # , requires_grad=False)
            action_batch = Variable(torch.cat(batch.action))  # , requires_grad=False)
            reward_batch = Variable(torch.cat(batch.reward))  # , requires_grad=False ).view((-1))
            '''
            next_state_batch = Variable(torch.cat( batch.next_state) )
            state_batch = Variable( torch.cat( batch.state) )
            action_batch = Variable( torch.cat( batch.action) )
            reward_batch = Variable( torch.cat( batch.reward ) ).view((-1,1))
            '''
            if self.use_cuda:
                next_state_batch = next_state_batch.cuda()
                state_batch = state_batch.cuda()
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()

            # Critic :
            # sample action from next_state, without gradient repercussion :
            next_taction = self.target_actor(next_state_batch).detach()
            # evaluate the next state action over the target, without repercussion (faster...) :
            next_tqsa = torch.squeeze(self.target_critic(next_state_batch, next_taction).detach()).view((-1))

            # Supervised loss :
            ## y_true :
            y_true = reward_batch + self.gamma * next_tqsa
            ## y_pred :
            y_pred = torch.squeeze(self.critic(state_batch, action_batch))
            ## loss :
            critic_loss = F.smooth_l1_loss(y_pred, y_true)
            # criterion = nn.MSELoss()
            # critic_loss = criterion(y_pred,y_true)

            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            critic_loss.backward()
            # clamping :
            # torch.nn.utils.clip_grad_norm(self.critic.parameters(),50)
            optimizer_critic.step()
            ###################################

            # Actor :
            # predict action with old weights :
            pred_old_action = self.previous_actor(state_batch)
            # predict action with current weights :
            pred_action = self.actor(state_batch)
            var_action = Variable(pred_action.cpu().data, requires_grad=True)
            var_old_action = Variable(pred_old_action.cpu().data, requires_grad=True)
            if self.use_cuda:
                var_action_c = var_action.cuda()
                var_old_action_c = var_old_action.cuda()
                # predict associated qvalues :
                pred_qsa = self.critic(state_batch, var_action_c)
                pred_old_qsa = self.critic(state_batch, var_old_action_c)
            else:
                # predict associated qvalues :
                pred_qsa = self.critic(state_batch, var_action)
                pred_old_qsa = self.critic(state_batch, var_old_action)

            # helper vars :
            clipped_m = (1.0 - self.epsilon)  # *torch.ones(ratio.size())
            clipped_p = (1.0 + self.epsilon)  # *torch.ones(ratio.size())
            gradout = torch.ones(pred_qsa.size())
            if self.use_cuda:
                gradout = gradout.cuda()

            # compute ratios :
            ratio = pred_qsa / pred_old_qsa
            clipped_ratio = ratio.clamp(clipped_m, clipped_p)
            p_actor_loss = torch.min(ratio, clipped_ratio)
            p_actor_loss.backward(gradout)

            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            if self.use_cuda:
                gradcritic = var_action.grad.data.cuda()
                pred_action.backward(-gradcritic)
            else:
                pred_action.backward(-var_action.grad.data)

            # clamping :
            # clampactor = 5e1#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
            # torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)

            # proximal update, before the update of the weights :
            hard_update(self.previous_actor, self.actor)
            # update of the weights :
            optimizer_actor.step()

            # loss :
            actor_loss = -1.0 * torch.mean(pred_qsa)
            ###################################

            '''
            critic_grad = 0.0
            for p in self.critic.parameters() :
                critic_grad += np.mean(p.grad.cpu().data.numpy())
            print( 'Mean Critic Grad : {}'.format(critic_grad) )
            '''
            actor_grad = 0.0
            for p in self.actor.parameters():
                actor_grad += np.max(np.abs(p.grad.cpu().data.numpy()))
            # print( 'Mean Actor Grad : {}'.format(actor_grad) )

            # UPDATE THE PR :
            if isinstance(self.memory, PrioritizedReplayBuffer):
                self.mutex.acquire()
                loss = torch.abs(actor_loss) + torch.abs(critic_loss)
                # loss = torch.abs(actor_loss) #+ torch.abs(critic_loss)
                loss_np = loss.cpu().data.numpy()
                for (idx, new_error) in zip(batch.idx, loss_np):
                    new_priority = self.memory.priority(new_error)
                    # print( 'prior = {} / {}'.format(new_priority,self.rBuffer.total()) )
                    self.memory.update(idx, new_priority)
                self.mutex.release()
        except Exception as e:
            bashlogger.debug('error : {}'.format(e))
            raise e

        # soft update :
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

        closs = critic_loss.cpu()
        aloss = actor_loss.cpu()
        return closs.data.numpy(), aloss.data.numpy(), actor_grad

    else:
        raise NotImplementedError
def load(self, path):
    self.actor.load_state_dict(torch.load(path + '.actor'))
    hard_update(self.target_actor, self.actor)
    self.critic.load_state_dict(torch.load(path + '.critic'))
    hard_update(self.target_critic, self.critic)
def load(self, path):
    self.NN.load_state_dict(torch.load(path))
    hard_update(self.target_NN, self.NN)
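The load methods above restore weights with torch.load and re-sync the targets with hard_update, and the training loops call a savemodel helper that is not shown. A minimal sketch of the saving side, assuming the standard state_dict convention; the per-agent save method and its '.actor'/'.critic' suffixes are illustrative assumptions mirroring the load(path) above:

import torch

def savemodel(model, path):
    # Persist only the parameters; the load() methods above restore them with load_state_dict.
    torch.save(model.state_dict(), path)

# Hypothetical per-agent counterpart to load(path) for the actor-critic agents:
def save(self, path):
    torch.save(self.actor.state_dict(), path + '.actor')
    torch.save(self.critic.state_dict(), path + '.critic')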