def train(self, total_step_counter, iter_fit=32):
    losses = []
    for i in range(iter_fit):
        data = self.buffer.sample(batch_size=self._config['batch_size'])
        s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)
        s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
        a = torch.FloatTensor(np.stack(data[:, 1])[:, None]).squeeze(dim=1).to(self.device)
        rew = torch.FloatTensor(np.stack(data[:, 2])[:, None]).squeeze(dim=1).to(self.device)
        done = torch.FloatTensor(np.stack(data[:, 4])[:, None]).squeeze(dim=1).to(self.device)  # done flag

        # Compute the TD3 target without tracking gradients through the target networks.
        with torch.no_grad():
            # target policy smoothing: clipped Gaussian noise on the target action
            noise = (torch.randn_like(a) * self._config['noise']).clamp(
                -self._config['noise_clip'], self._config['noise_clip'])
            a_next = (self.actor_target(s_next) + noise).clamp(-1, 1)

            # clipped double-Q target
            Q1_target, Q2_target = self.critics_target(s_next, a_next)
            target_Q = torch.min(Q1_target, Q2_target).squeeze(dim=1)
            targets = rew + self._config['discount'] * target_Q * (1.0 - done)

        # optimize critics
        Q1_current, Q2_current = self.critics(s, a)
        Q1_current = Q1_current.squeeze(dim=1)
        Q2_current = Q2_current.squeeze(dim=1)
        critic_loss = F.mse_loss(Q1_current, targets) + F.mse_loss(Q2_current, targets)
        losses.append(critic_loss.item())

        self.critics.optimizer.zero_grad()
        critic_loss.backward()
        self.critics.optimizer.step()

        # delayed policy and target updates
        if ((total_step_counter - 1) * iter_fit + i + 1) % self._config['update_target_every'] == 0:
            # optimize actor: maximize Q1(s, pi(s))
            actions = self.actor(s)
            actor_loss = -self.critics.Q1(s, actions).mean()

            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # update target networks
            soft_update(self.critics_target, self.critics, self._tau)
            soft_update(self.actor_target, self.actor, self._tau)

    return losses
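# The updates in this file all rely on `soft_update` (and some on `hard_update`)
# to maintain their target networks. A minimal Polyak-averaging sketch of these
# helpers, assuming the (target, source, tau) argument order used in most of the
# snippets here (one snippet below passes (local, target, tau) instead):
def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source, parameter by parameter
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)


def hard_update(target, source):
    # copy the source parameters verbatim into the target network
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)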
def update_parameters(self, memory, batch_size, updates):
    # Sample a batch from memory
    state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

    state_batch = torch.FloatTensor(state_batch).to(self.device)
    next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
    action_batch = torch.FloatTensor(action_batch).to(self.device)
    reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
    mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

    with torch.no_grad():
        next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
        qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
        min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
        next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

    # Two Q-functions to mitigate positive bias in the policy improvement step
    qf1, qf2 = self.critic(state_batch, action_batch)
    # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
    qf1_loss = F.mse_loss(qf1, next_q_value)
    qf2_loss = F.mse_loss(qf2, next_q_value)

    # Optimize both critics jointly with a single backward pass, so the second
    # Q-loss is not backpropagated through a graph whose weights were already stepped.
    qf_loss = qf1_loss + qf2_loss
    self.critic_optim.zero_grad()
    qf_loss.backward()
    self.critic_optim.step()

    # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
    pi, log_pi, _ = self.policy.sample(state_batch)
    qf1_pi, qf2_pi = self.critic(state_batch, pi)
    min_qf_pi = torch.min(qf1_pi, qf2_pi)
    policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

    self.policy_optim.zero_grad()
    policy_loss.backward()
    self.policy_optim.step()

    if self.automatic_entropy_tuning:
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()

        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
    else:
        alpha_loss = torch.tensor(0.).to(self.device)
        alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

    if updates % self.target_update_interval == 0:
        soft_update(self.critic_target, self.critic, self.tau)

    return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()
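# The automatic entropy tuning branch above assumes `self.target_entropy`,
# `self.log_alpha` and `self.alpha_optim` were prepared in the constructor.
# A minimal sketch of that setup; the helper name and the learning rate are
# illustrative assumptions, not part of the original code:
import torch


def make_entropy_tuner(action_dim, device, lr=3e-4):
    # -dim(A) target-entropy heuristic, a learnable log(alpha), and its optimizer
    target_entropy = -float(action_dim)
    log_alpha = torch.zeros(1, requires_grad=True, device=device)
    alpha_optim = torch.optim.Adam([log_alpha], lr=lr)
    return target_entropy, log_alpha, alpha_optim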
def train(self, total_step_counter, iter_fit=32):
    losses = []
    for i in range(iter_fit):
        data = self.buffer.sample(batch_size=self._config['batch_size'])
        s = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)
        s_next = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
        a = torch.FloatTensor(np.stack(data[:, 1])[:, None]).squeeze(dim=1).to(self.device)
        rew = torch.FloatTensor(np.stack(data[:, 2])[:, None]).squeeze(dim=1).to(self.device)
        done = torch.FloatTensor(np.stack(data[:, 4])[:, None]).squeeze(dim=1).to(self.device)  # done flag

        # current Q-estimate
        Q_current = self.critic(s, a).squeeze(dim=1)

        # bootstrapped target from the target networks (no gradients needed)
        with torch.no_grad():
            a_next = self.actor_target(s_next)
            Q_next = self.critic_target(s_next, a_next).squeeze(dim=1)
            targets = rew + self._config['discount'] * Q_next * (1.0 - done)

        # optimize critic
        critic_loss = self.critic.loss(Q_current.float(), targets.float())
        losses.append(critic_loss.item())

        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # optimize actor: maximize Q(s, pi(s))
        actions = self.actor(s)
        actor_loss = -self.critic(s, actions).mean()

        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # update target networks
        if total_step_counter % self._config['update_target_every'] == 0:
            soft_update(self.critic_target, self.critic, self._tau)
            soft_update(self.actor_target, self.actor, self._tau)

    return losses
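# Both train() variants above index the sampled batch as a 2-D object array with
# columns (state, action, reward, next_state, done). A minimal sketch of a buffer
# with that layout; the class and method names here are hypothetical and only
# illustrate the assumed interface:
import numpy as np


class ReplayBuffer:
    def __init__(self, max_size=100000):
        self.transitions = np.empty((0, 5), dtype=object)
        self.max_size = max_size

    def add_transition(self, state, action, reward, next_state, done):
        row = np.empty((1, 5), dtype=object)
        row[0, :] = [state, action, reward, next_state, done]
        self.transitions = np.concatenate([self.transitions, row], axis=0)[-self.max_size:]

    def sample(self, batch_size=128):
        idx = np.random.randint(0, len(self.transitions), size=batch_size)
        return self.transitions[idx, :]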
def step(self, state, action, reward, next_state, done):
    for i in range(self.num_agents):
        self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i])

    self.step_idx += 1
    is_learn_iteration = (self.step_idx % self.opts.learn_every) == 0
    is_update_iteration = (self.step_idx % self.opts.update_every) == 0

    if len(self.memory) > self.opts.batch_size:
        if is_learn_iteration:
            experiences = self.memory.sample()
            self.learn(experiences, self.opts.gamma)
        if is_update_iteration:
            soft_update(self.critic_local, self.critic_target, self.opts.tau)
            soft_update(self.actor_local, self.actor_target, self.opts.tau)
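# A sketch of how the multi-agent step() above is typically driven, assuming a
# Gym-style vectorized environment that returns per-agent arrays and an `agent.act`
# method; all names here are illustrative assumptions:
import numpy as np


def run_episode(env, agent, max_steps=1000):
    states = env.reset()                                   # (num_agents, obs_dim)
    for _ in range(max_steps):
        actions = agent.act(states)                        # (num_agents, act_dim)
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        if np.any(dones):
            break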
def train(self, model, env, memory, optimizer, logger=None, preprocess=T.ToTensor(),
          path=None, frompath=None, num_episodes=1000, epsend=0.05, epsstart=0.9,
          epsdecay=200, k=4, strategy='future', singlegoal=False):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []

        global rendering
        global use_cuda
        global MAX_STEPS

        # exploration counter :
        steps_done = [0]

        # Double network initialization :
        savemodel(model, path + '.save')
        model_ = copy.deepcopy(model)
        hard_update(model_, model)
        model_.eval()
        if use_cuda:
            model_ = model_.cuda()

        self.accumulateMemory(memory, env, model, preprocess,
                              epsstart=0.5, epsend=0.3, epsdecay=200,
                              k=k, strategy=strategy)

        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen

            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []

            # HER : sample initial goal :
            if not singlegoal:
                init_goal = sample_init_goal(memory)
            else:
                init_goal = torch.zeros(current_screen.size())

            showcount = 0
            for t in count():
                # model.eval()

                # HER : condition the policy on the current goal :
                stategoal = torch.cat([state, init_goal], dim=1)

                # action = select_action(model, state, steps_done=steps_done, epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                action = select_action(model, stategoal, steps_done=steps_done,
                                       epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)

                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward

                if rendering:
                    if showcount >= 10:
                        showcount = 0
                        render(current_screen)  # env.render()
                    else:
                        showcount += 1

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())

                episode_buffer.append(EXP(state, action, next_state, reward, done))

                state = next_state

                # OPTIMIZE MODEL :
                since = time.time()
                lossnp = self.optimize_model(model, model_, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)

                # SOFT UPDATE :
                soft_update(model_, model, self.TAU)

                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)

                if done or t > MAX_STEPS:
                    self.from_worker2model()
                    '''
                    nbrTrain = 200
                    for it in range(nbrTrain):
                        since = time.time()
                        lossnp = optimize_model(model, model_, memory, optimizer)
                        if lossnp is not None:
                            episode_loss_buffer.append(np.mean(lossnp))
                        else:
                            episode_loss_buffer.append(0)
                        elt = time.time() - since
                        f = 1.0 / elt
                        meanfreq = (meanfreq * (it + 1) + f) / (it + 2)
                        # print('{} Hz ; {} seconds.'.format(f, elt))
                    '''
                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)

                    log = ('Episode duration : {}'.format(t + 1)
                           + '---' + ' Reward : {} // Mean Loss : {}'.format(cumul_reward, meanloss)
                           + '---' + ' {}Hz'.format(meanfreq))
                    bashlogger.info(log)

                    if logger is not None:
                        new = {'episodes': [i],
                               'duration': [t + 1],
                               'reward': [cumul_reward],
                               'mean frequency': [meanfreq],
                               'loss': [meanloss]}
                        logger.append(new)

                    if path is not None:
                        # SAVE THE MAIN MODEL :
                        self.model.lock()
                        savemodel(self.model, path + '.save')
                        self.model.unlock()
                        bashlogger.info('Model saved : {}'.format(path))
                    # plot_durations()
                    break

            # Let us add this episode_buffer to the replayBuffer :
            for itexp in range(len(episode_buffer)):
                el = episode_buffer[itexp]

                # HER : reward with the episode's initial goal :
                HERreward = reward_function(el.state, init_goal)
                reward = HERreward + el.reward

                # store this transition with init_goal :
                init_el = EXP(state=torch.cat([el.state, init_goal], dim=1),
                              action=el.action,
                              next_state=torch.cat([el.next_state, init_goal], dim=1),
                              reward=reward,
                              done=el.done)
                init_priority = memory.priority(torch.abs(init_el.reward).numpy())
                memory.add(init_el, init_priority)

                # store for multiple goals :
                # 1: sample new goals :
                goals = []
                for j in range(k):
                    goal = None
                    if strategy == 'final':
                        goal = sample_goal(episode_buffer, strategy=strategy)
                    elif strategy == 'future':
                        # watch out for the empty set...
                        index = min(len(episode_buffer) - 3, itexp)
                        goal = sample_goal(episode_buffer, strategy=index)
                    goals.append(goal)

                # For each goal ...
                for goal in goals:
                    # 2: ... compute the relabelled reward :
                    goalreward = reward_function(el.state, goal) + el.reward
                    # 3: ... store this transition with the goal :
                    goalel = EXP(state=torch.cat([el.state, goal], dim=1),
                                 action=el.action,
                                 next_state=torch.cat([el.next_state, goal], dim=1),
                                 reward=goalreward,
                                 done=el.done)
                    init_priority = memory.priority(torch.abs(goalel.reward).numpy())
                    memory.add(goalel, init_priority)

                del el
                del goals
            del episode_buffer

        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path + '.save')
            bashlogger.info('Model saved : {}'.format(path))

        env.close()
    except Exception as e:
        bashlogger.exception(e)
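# The HER relabelling above relies on `reward_function(state, goal)` and
# `sample_goal(episode_buffer, strategy=...)`, which are defined elsewhere in the
# codebase. A minimal sketch under the assumption that goals are simply stored
# states and that the reward is the usual sparse HER reward; both definitions and
# the threshold are illustrative assumptions:
import numpy as np
import torch


def reward_function(state, goal, threshold=1e-2):
    # sparse HER reward: 0 when the achieved state matches the goal, -1 otherwise
    distance = torch.dist(state, goal)
    return torch.FloatTensor([0.0 if distance.item() < threshold else -1.0])


def sample_goal(episode_buffer, strategy='final'):
    # 'final'  : use the last achieved state of the episode as the goal.
    # integer  : sample a state achieved at or after that index ('future' strategy).
    if strategy == 'final':
        return episode_buffer[-1].state
    index = int(strategy)
    future_idx = np.random.randint(index, len(episode_buffer))
    return episode_buffer[future_idx].state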
def train(self, model, env, memory, optimizer, logger=None, preprocess=T.ToTensor(),
          path=None, frompath=None, num_episodes=1000, epsend=0.05, epsstart=0.9, epsdecay=10):
    try:
        episode_durations = []
        episode_reward = []
        episode_loss = []

        global rendering
        global use_cuda
        global MAX_STEPS

        # exploration counter :
        steps_done = [0]

        # Double network initialization :
        savemodel(model, path + '.save')
        # model_ = DuelingDQN(model.nbr_actions)
        model_ = copy.deepcopy(model)
        hard_update(model_, model)
        model_.eval()
        if use_cuda:
            model_ = model_.cuda()

        for i in range(num_episodes):
            bashlogger.info('Episode : {} : memory : {}/{}'.format(i, len(memory), memory.capacity))
            cumul_reward = 0.0
            last_screen = get_screen_reset(env, preprocess=preprocess)
            current_screen, reward, done, info = get_screen(env, env.action_space.sample(), preprocess=preprocess)
            state = current_screen - last_screen

            episode_buffer = []
            meanfreq = 0
            episode_loss_buffer = []
            episode_qsa_buffer = []

            showcount = 0
            for t in count():
                action, qsa = select_action(model, state, steps_done=steps_done,
                                            epsend=epsend, epsstart=epsstart, epsdecay=epsdecay)
                episode_qsa_buffer.append(qsa)

                last_screen = current_screen
                current_screen, reward, done, info = get_screen(env, action[0, 0], preprocess=preprocess)
                cumul_reward += reward

                if rendering:
                    if showcount >= 10:
                        showcount = 0
                        render(current_screen)  # env.render()
                    else:
                        showcount += 1

                reward = FloatTensor([reward])

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = torch.zeros(current_screen.size())

                episode_buffer.append(EXP(state, action, next_state, reward, done))

                state = next_state

                # OPTIMIZE MODEL :
                since = time.time()
                lossnp = self.optimize_model(model, model_, memory, optimizer)
                if lossnp is not None:
                    episode_loss_buffer.append(np.mean(lossnp))
                else:
                    episode_loss_buffer.append(0)

                # SOFT UPDATE :
                soft_update(model_, model, self.TAU)

                elt = time.time() - since
                f = 1.0 / elt
                meanfreq = (meanfreq * (t + 1) + f) / (t + 2)

                if done or t > MAX_STEPS:
                    self.from_worker2model()
                    '''
                    nbrTrain = 2
                    for tr in range(nbrTrain):
                        since = time.time()
                        lossnp = optimize_model(model, model_, memory, optimizer)
                        if lossnp is not None:
                            episode_loss_buffer.append(np.mean(lossnp))
                        else:
                            episode_loss_buffer.append(0)
                        elt = time.time() - since
                        f = 1.0 / elt
                        meanfreq = (meanfreq * (tr + 1) + f) / (tr + 2)
                        # print('{} Hz ; {} seconds.'.format(f, elt))
                    '''
                    episode_durations.append(t + 1)
                    episode_reward.append(cumul_reward)
                    meanloss = np.mean(episode_loss_buffer)
                    episode_loss.append(meanloss)
                    meanqsa = np.mean(episode_qsa_buffer)

                    log = ('Episode duration : {}'.format(t + 1)
                           + '---' + ' Reward : {} // Mean Loss : {} // QSA : {}'.format(cumul_reward, meanloss, meanqsa)
                           + '---' + ' {}Hz'.format(meanfreq))
                    bashlogger.info(log)

                    if logger is not None:
                        new = {'episodes': [i],
                               'duration': [t + 1],
                               'reward': [cumul_reward],
                               'mean frequency': [meanfreq],
                               'loss': [meanloss]}
                        logger.append(new)

                    if path is not None:
                        # SAVE THE MAIN MODEL :
                        self.model.lock()
                        savemodel(self.model, path + '.save')
                        self.model.unlock()
                        bashlogger.info('Model saved : {}'.format(path))
                    # plot_durations()
                    break

            # Let us add this episode_buffer to the replayBuffer :
            for el in episode_buffer:
                init_priority = memory.priority(torch.abs(el.reward).cpu().numpy())
                memory.add(el, init_priority)
            del episode_buffer

        bashlogger.info('Complete')
        if path is not None:
            savemodel(model, path + '.save')
            bashlogger.info('Model saved : {}'.format(path))

        env.close()
    except Exception as e:
        bashlogger.exception(e)
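# `select_action` above returns the chosen action together with its Q-value and
# advances the mutable `steps_done` counter. A minimal epsilon-greedy sketch
# consistent with that usage; the exponential decay schedule and the greedy
# tie-breaking are assumptions, not the original definition:
import math
import random

import torch


def select_action(model, state, steps_done, epsend=0.05, epsstart=0.9, epsdecay=200):
    eps_threshold = epsend + (epsstart - epsend) * math.exp(-1.0 * steps_done[0] / epsdecay)
    steps_done[0] += 1
    with torch.no_grad():
        qvalues = model(state)                      # (1, nbr_actions)
    qsa, greedy_action = qvalues.max(dim=1)
    if random.random() > eps_threshold:
        action = greedy_action.view(1, 1)
    else:
        action = torch.randint(0, qvalues.size(1), (1, 1))
    return action, qsa.item()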
def update_parameters(self, total_step):
    data = self.buffer.sample(self._config['batch_size'])

    # torch.FloatTensor does not accept a `device` keyword; build the tensors on
    # CPU and move them explicitly.
    state = torch.FloatTensor(np.stack(data[:, 0])).to(self.device)
    next_state = torch.FloatTensor(np.stack(data[:, 3])).to(self.device)
    action = torch.FloatTensor(np.stack(data[:, 1])[:, None]).squeeze(dim=1).to(self.device)
    reward = torch.FloatTensor(np.stack(data[:, 2])[:, None]).squeeze(dim=1).to(self.device)
    not_done = torch.FloatTensor(
        (~np.stack(data[:, 4])[:, None]).astype(np.int64)).squeeze(dim=1).to(self.device)

    with torch.no_grad():
        next_state_action, next_state_log_pi, _, _ = self.actor.sample(next_state)
        q1_next_targ, q2_next_targ = self.critic_target(next_state, next_state_action)
        min_qf_next_target = torch.min(q1_next_targ, q2_next_targ) - self.alpha * next_state_log_pi
        next_q_value = reward + not_done * self._config['gamma'] * min_qf_next_target.squeeze()

    qf1, qf2 = self.critic(state, action)
    qf1_loss = self.critic.loss(qf1.squeeze(), next_q_value)
    qf2_loss = self.critic.loss(qf2.squeeze(), next_q_value)
    qf_loss = qf1_loss + qf2_loss

    self.critic.optimizer.zero_grad()
    qf_loss.backward()
    self.critic.optimizer.step()

    pi, log_pi, _, _ = self.actor.sample(state)
    qf1_pi, qf2_pi = self.critic(state, pi)
    min_qf_pi = torch.min(qf1_pi, qf2_pi)
    policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

    self.actor.optimizer.zero_grad()
    policy_loss.backward()
    self.actor.optimizer.step()

    if self.automatic_entropy_tuning:
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha_scheduler.step()

        self.alpha = self.log_alpha.exp()
    else:
        alpha_loss = torch.tensor(0.).to(self.device)

    if total_step % self._config['update_target_every'] == 0:
        soft_update(self.critic_target, self.critic, self._config['soft_tau'])

    return (qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item())
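# Both SAC updates assume a squashed-Gaussian policy whose `sample` method returns
# the reparameterized action and its log-probability (plus extra values: the first
# variant unpacks three, the second four). A minimal sketch of such a method,
# assuming the network's `forward` produces mean and log-std heads; the exact
# return signature is an assumption:
import torch
from torch.distributions import Normal


def sample(self, state, epsilon=1e-6):
    mean, log_std = self.forward(state)
    std = log_std.exp()
    normal = Normal(mean, std)
    x_t = normal.rsample()                       # reparameterization trick
    action = torch.tanh(x_t)
    # change-of-variables correction for the tanh squashing
    log_prob = normal.log_prob(x_t) - torch.log(1 - action.pow(2) + epsilon)
    log_prob = log_prob.sum(dim=1, keepdim=True)
    return action, log_prob, mean, std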
def optimize(self, optimizer_critic, optimizer_actor):
    '''
    self.target_critic.eval()
    self.target_actor.eval()
    self.critic.train()
    self.actor.train()
    '''
    if self.algo == 'ddpg':
        try:
            if len(self.memory) < self.MIN_MEMORY:
                return

            # Create batch :
            self.mutex.acquire()
            if isinstance(self.memory, PrioritizedReplayBuffer):
                # with PR :
                prioritysum = self.memory.total()
                randexp = np.random.random(size=self.batch_size) * prioritysum
                batch = list()
                for i in range(self.batch_size):
                    try:
                        el = self.memory.get(randexp[i])
                        batch.append(el)
                    except TypeError as e:
                        continue
                        # print('REPLAY BUFFER EXCEPTION...')
            else:
                # with random RB :
                batch = self.memory.sample(self.batch_size)
            self.mutex.release()

            if len(batch) == 0:
                return

            # Create batch with replayMemory :
            batch = TransitionPR(*zip(*batch))
            next_state_batch = Variable(torch.cat(batch.next_state))  # , requires_grad=False)
            state_batch = Variable(torch.cat(batch.state))  # , requires_grad=False)
            action_batch = Variable(torch.cat(batch.action))  # , requires_grad=False)
            reward_batch = Variable(torch.cat(batch.reward))  # , requires_grad=False).view((-1))
            terminal_batch = Variable(torch.cat(batch.done))
            '''
            next_state_batch = Variable(torch.cat(batch.next_state))
            state_batch = Variable(torch.cat(batch.state))
            action_batch = Variable(torch.cat(batch.action))
            reward_batch = Variable(torch.cat(batch.reward)).view((-1, 1))
            '''
            if self.use_cuda:
                next_state_batch = next_state_batch.cuda()
                state_batch = state_batch.cuda()
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()
                terminal_batch = terminal_batch.cuda()

            # Critic :
            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            # sample action from next_state, without gradient repercussion :
            next_taction = self.target_actor(next_state_batch).detach()
            # evaluate the next state-action over the target, without repercussion (faster...) :
            next_tqsa = torch.squeeze(self.target_critic(next_state_batch, next_taction).detach()).view((-1))

            # Supervised loss :
            ## y_true :
            y_true = reward_batch + (1.0 - terminal_batch) * self.gamma * next_tqsa
            # print(torch.cat([y_true.view((-1, 1)), terminal_batch.view((-1, 1))], dim=1))
            ## y_pred :
            y_pred = torch.squeeze(self.critic(state_batch, action_batch))
            ## loss :
            critic_loss = F.smooth_l1_loss(y_pred, y_true)
            # criterion = nn.MSELoss()
            # critic_loss = criterion(y_pred, y_true)
            critic_loss.backward()

            # weight decay :
            decay_loss = 0.5 * sum([torch.mean(param * param) for param in self.critic.parameters()])
            decay_loss.backward()

            # clamping :
            # torch.nn.utils.clip_grad_norm(self.critic.parameters(), 50)
            optimizer_critic.step()
            ###################################
            '''
            # Actor :
            # predict action :
            pred_action = self.actor(state_batch)
            # predict associated qvalues :
            pred_qsa = self.critic(state_batch, pred_action)
            # pred_qsa = self.target_critic(state_batch, pred_action)
            # loss :
            actor_loss = -1.0 * torch.mean(torch.sum(pred_qsa))
            # actor_loss = F.smooth_l1_loss(pred_qsa, Variable(torch.zeros(pred_qsa.size())).cuda())
            # criterion = nn.MSELoss()
            # actor_loss = criterion(pred_qsa, Variable(torch.zeros(pred_qsa.size())).cuda())
            # before optimization :
            optimizer_actor.zero_grad()
            self.actor.zero_grad()
            actor_loss.backward()
            # clamping :
            # clampactor = 1e2  # np.max([0.25, 1.0/np.max([5e-1, np.abs(np.mean(critic_loss.cpu().data.numpy()))])])
            # torch.nn.utils.clip_grad_norm(self.actor.parameters(), clampactor)
            optimizer_actor.step()
            ###################################
            '''

            ###################################
            # Actor :
            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            # predict action :
            pred_action = self.actor(state_batch)
            var_action = Variable(pred_action.cpu().data, requires_grad=True)
            if self.use_cuda:
                var_action_c = var_action.cuda()
                pred_qsa = self.critic(state_batch, var_action_c)
            else:
                pred_qsa = self.critic(state_batch, var_action)

            # predict associated qvalues :
            gradout = torch.ones(pred_qsa.size())
            if self.use_cuda:
                gradout = gradout.cuda()
            pred_qsa.backward(gradout)

            if self.use_cuda:
                gradcritic = var_action.grad.data.cuda()
            else:
                gradcritic = var_action.grad.data
            pred_action.backward(-gradcritic)

            # weight decay :
            decay_loss = 0.5 * sum([torch.mean(param * param) for param in self.actor.parameters()])
            decay_loss.backward()

            # clamping :
            clampactor = 5e0  # np.max([0.25, 1.0/np.max([5e-1, np.abs(np.mean(critic_loss.cpu().data.numpy()))])])
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), clampactor)

            optimizer_actor.step()

            # loss :
            actor_loss = -1.0 * torch.mean(torch.sum(pred_qsa))
            ###################################

            '''
            critic_grad = 0.0
            for p in self.critic.parameters():
                critic_grad += np.mean(p.grad.cpu().data.numpy())
            print('Mean Critic Grad : {}'.format(critic_grad))
            '''
            actor_grad = 0.0
            for p in self.actor.parameters():
                actor_grad += np.max(np.abs(p.grad.cpu().data.numpy()))
            # print('Mean Actor Grad : {}'.format(actor_grad))

            # UPDATE THE PR :
            if isinstance(self.memory, PrioritizedReplayBuffer):
                self.mutex.acquire()
                loss = torch.abs(actor_loss) + torch.abs(critic_loss)
                # loss = torch.abs(actor_loss)  # + torch.abs(critic_loss)
                loss_np = loss.cpu().data.numpy()
                for (idx, new_error) in zip(batch.idx, loss_np):
                    new_priority = self.memory.priority(new_error)
                    # print('prior = {} / {}'.format(new_priority, self.rBuffer.total()))
                    self.memory.update(idx, new_priority)
                self.mutex.release()

        except Exception as e:
            bashlogger.debug('error : {}'.format(e))
            raise e

        # soft update :
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

        closs = critic_loss.cpu()
        aloss = actor_loss.cpu()

        return closs.data.numpy(), aloss.data.numpy(), actor_grad

    elif self.algo == 'pddpg':
        try:
            if len(self.memory) < self.MIN_MEMORY:
                return

            # Create batch :
            self.mutex.acquire()
            if isinstance(self.memory, PrioritizedReplayBuffer):
                # with PR :
                prioritysum = self.memory.total()
                randexp = np.random.random(size=self.batch_size) * prioritysum
                batch = list()
                for i in range(self.batch_size):
                    try:
                        el = self.memory.get(randexp[i])
                        batch.append(el)
                    except TypeError as e:
                        continue
                        # print('REPLAY BUFFER EXCEPTION...')
            else:
                # with random RB :
                batch = self.memory.sample(self.batch_size)
            self.mutex.release()

            if len(batch) == 0:
                return

            # Create batch with replayMemory :
            batch = TransitionPR(*zip(*batch))
            next_state_batch = Variable(torch.cat(batch.next_state))  # , requires_grad=False)
            state_batch = Variable(torch.cat(batch.state))  # , requires_grad=False)
            action_batch = Variable(torch.cat(batch.action))  # , requires_grad=False)
            reward_batch = Variable(torch.cat(batch.reward))  # , requires_grad=False).view((-1))
            '''
            next_state_batch = Variable(torch.cat(batch.next_state))
            state_batch = Variable(torch.cat(batch.state))
            action_batch = Variable(torch.cat(batch.action))
            reward_batch = Variable(torch.cat(batch.reward)).view((-1, 1))
            '''
            if self.use_cuda:
                next_state_batch = next_state_batch.cuda()
                state_batch = state_batch.cuda()
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()

            # Critic :
            # sample action from next_state, without gradient repercussion :
            next_taction = self.target_actor(next_state_batch).detach()
            # evaluate the next state-action over the target, without repercussion (faster...) :
            next_tqsa = torch.squeeze(self.target_critic(next_state_batch, next_taction).detach()).view((-1))

            # Supervised loss :
            ## y_true :
            y_true = reward_batch + self.gamma * next_tqsa
            ## y_pred :
            y_pred = torch.squeeze(self.critic(state_batch, action_batch))
            ## loss :
            critic_loss = F.smooth_l1_loss(y_pred, y_true)
            # criterion = nn.MSELoss()
            # critic_loss = criterion(y_pred, y_true)

            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            critic_loss.backward()

            # clamping :
            # torch.nn.utils.clip_grad_norm(self.critic.parameters(), 50)
            optimizer_critic.step()
            ###################################
            # Actor :
            # predict action with old weights :
            pred_old_action = self.previous_actor(state_batch)
            # predict action with current weights :
            pred_action = self.actor(state_batch)

            var_action = Variable(pred_action.cpu().data, requires_grad=True)
            var_old_action = Variable(pred_old_action.cpu().data, requires_grad=True)
            if self.use_cuda:
                var_action_c = var_action.cuda()
                var_old_action_c = var_old_action.cuda()
                # predict associated qvalues :
                pred_qsa = self.critic(state_batch, var_action_c)
                pred_old_qsa = self.critic(state_batch, var_old_action_c)
            else:
                # predict associated qvalues :
                pred_qsa = self.critic(state_batch, var_action)
                pred_old_qsa = self.critic(state_batch, var_old_action)

            # helper vars :
            clipped_m = (1.0 - self.epsilon)  # * torch.ones(ratio.size())
            clipped_p = (1.0 + self.epsilon)  # * torch.ones(ratio.size())
            gradout = torch.ones(pred_qsa.size())
            if self.use_cuda:
                gradout = gradout.cuda()

            # compute ratios :
            ratio = pred_qsa / pred_old_qsa
            clipped_ratio = ratio.clamp(clipped_m, clipped_p)
            p_actor_loss = torch.min(ratio, clipped_ratio)
            p_actor_loss.backward(gradout)

            # before optimization :
            self.critic.zero_grad()
            self.actor.zero_grad()
            optimizer_critic.zero_grad()
            optimizer_actor.zero_grad()

            if self.use_cuda:
                gradcritic = var_action.grad.data.cuda()
                pred_action.backward(-gradcritic)
            else:
                pred_action.backward(-var_action.grad.data)

            # clamping :
            # clampactor = 5e1  # np.max([0.25, 1.0/np.max([5e-1, np.abs(np.mean(critic_loss.cpu().data.numpy()))])])
            # torch.nn.utils.clip_grad_norm(self.actor.parameters(), clampactor)

            # proximal update, before the update of the weights :
            hard_update(self.previous_actor, self.actor)
            # update of the weights :
            optimizer_actor.step()

            # loss :
            actor_loss = -1.0 * torch.mean(pred_qsa)
            ###################################
            '''
            critic_grad = 0.0
            for p in self.critic.parameters():
                critic_grad += np.mean(p.grad.cpu().data.numpy())
            print('Mean Critic Grad : {}'.format(critic_grad))
            '''
            actor_grad = 0.0
            for p in self.actor.parameters():
                actor_grad += np.max(np.abs(p.grad.cpu().data.numpy()))
            # print('Mean Actor Grad : {}'.format(actor_grad))

            # UPDATE THE PR :
            if isinstance(self.memory, PrioritizedReplayBuffer):
                self.mutex.acquire()
                loss = torch.abs(actor_loss) + torch.abs(critic_loss)
                # loss = torch.abs(actor_loss)  # + torch.abs(critic_loss)
                loss_np = loss.cpu().data.numpy()
                for (idx, new_error) in zip(batch.idx, loss_np):
                    new_priority = self.memory.priority(new_error)
                    # print('prior = {} / {}'.format(new_priority, self.rBuffer.total()))
                    self.memory.update(idx, new_priority)
                self.mutex.release()

        except Exception as e:
            bashlogger.debug('error : {}'.format(e))
            raise e

        # soft update :
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

        closs = critic_loss.cpu()
        aloss = actor_loss.cpu()

        return closs.data.numpy(), aloss.data.numpy(), actor_grad

    else:
        raise NotImplementedError
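# The optimize() routines above talk to a `PrioritizedReplayBuffer` through
# total(), get(), priority(), add() and update(). A minimal proportional-priority
# sketch of that interface, assuming a sum-tree container named `SumTree` is
# available; the class body, alpha/epsilon constants and `SumTree` methods are
# illustrative assumptions:
class PrioritizedReplayBuffer:
    def __init__(self, capacity, alpha=0.6, epsilon=1e-3):
        self.tree = SumTree(capacity)   # assumed sum-tree container
        self.alpha = alpha
        self.epsilon = epsilon

    def priority(self, error):
        # proportional prioritization: p = (|error| + eps)^alpha
        return (abs(error) + self.epsilon) ** self.alpha

    def add(self, experience, priority):
        self.tree.add(priority, experience)

    def get(self, value):
        # fetch the leaf whose cumulative-priority interval contains `value`
        return self.tree.get(value)

    def update(self, idx, priority):
        self.tree.update(idx, priority)

    def total(self):
        return self.tree.total()

    def __len__(self):
        return len(self.tree)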
def optimize(self, MIN_MEMORY=1e3):
    if self.algo == 'ddpg':
        try:
            if len(self.memory) < MIN_MEMORY:
                return

            # Create batch with PR :
            prioritysum = self.memory.total()
            randexp = np.random.random(size=self.batch_size) * prioritysum
            batch = list()
            for i in range(self.batch_size):
                try:
                    el = self.memory.get(randexp[i])
                    batch.append(el)
                except TypeError as e:
                    continue
                    # print('REPLAY BUFFER EXCEPTION...')

            # Create batch with replayMemory :
            batch = TransitionPR(*zip(*batch))
            next_state_batch = Variable(torch.cat(batch.next_state), requires_grad=False)
            state_batch = Variable(torch.cat(batch.state), requires_grad=False)
            action_batch = Variable(torch.cat(batch.action), requires_grad=False)
            # keep the rewards flat so that y_true broadcasts element-wise against next_tqsa :
            reward_batch = Variable(torch.cat(batch.reward), requires_grad=False).view((-1))

            if self.use_cuda:
                next_state_batch = next_state_batch.cuda()
                state_batch = state_batch.cuda()
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()

            # before optimization :
            self.optimizer.zero_grad()

            # Critic :
            # sample action from next_state, without gradient repercussion :
            next_taction = self.target_NN.actor(next_state_batch).detach()
            # evaluate the next state-action over the target, without repercussion (faster...) :
            next_tqsa = torch.squeeze(self.target_NN.critic(next_state_batch, next_taction).detach())

            # Supervised loss :
            ## y_true :
            y_true = reward_batch + self.gamma * next_tqsa
            ## y_pred :
            y_pred = torch.squeeze(self.NN.critic(state_batch, action_batch))
            ## loss :
            critic_loss = F.smooth_l1_loss(y_pred, y_true)
            # critic_loss.backward()
            # self.optimizer.step()

            # Actor :
            pred_action = self.NN.actor(state_batch)
            pred_qsa = torch.squeeze(self.target_NN.critic(state_batch, pred_action))
            # loss :
            actor_loss = -1.0 * torch.sum(pred_qsa)
            # actor_loss.backward()
            # self.optimizer.step()

            # optimize both pathways with a single weighted objective :
            scalerA = 0.1
            scalerV = 10.0
            total_loss = scalerA * actor_loss + scalerV * critic_loss
            total_loss.backward()
            self.optimizer.step()

        except Exception as e:
            bashlogger.debug('error : {}'.format(e))

        # soft update :
        soft_update(self.target_NN, self.NN, self.tau)

        return critic_loss.cpu().data.numpy(), actor_loss.cpu().data.numpy()

    else:
        raise NotImplementedError
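# The batching code above unpacks sampled transitions through
# `TransitionPR(*zip(*batch))` and later reads `batch.idx` when refreshing
# priorities. A plausible namedtuple definition consistent with that usage; the
# exact field order is an assumption:
from collections import namedtuple

TransitionPR = namedtuple('TransitionPR',
                          ('idx', 'state', 'action', 'next_state', 'reward', 'done'))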
def optimize(self):
    '''
    self.target_critic.eval()
    self.target_actor.eval()
    self.critic.train()
    self.actor.train()
    '''
    if self.algo == 'ddpg':
        try:
            if len(self.memory) < self.MIN_MEMORY:
                return

            # Create batch with PR :
            prioritysum = self.memory.total()
            randexp = np.random.random(size=self.batch_size) * prioritysum
            batch = list()
            for i in range(self.batch_size):
                try:
                    el = self.memory.get(randexp[i])
                    batch.append(el)
                except TypeError as e:
                    continue
                    # print('REPLAY BUFFER EXCEPTION...')

            # Create batch with replayMemory :
            batch = TransitionPR(*zip(*batch))
            next_state_batch = Variable(torch.cat(batch.next_state), requires_grad=False)
            state_batch = Variable(torch.cat(batch.state), requires_grad=False)
            action_batch = Variable(torch.cat(batch.action), requires_grad=False)
            reward_batch = Variable(torch.cat(batch.reward), requires_grad=False).view((-1))
            '''
            next_state_batch = Variable(torch.cat(batch.next_state))
            state_batch = Variable(torch.cat(batch.state))
            action_batch = Variable(torch.cat(batch.action))
            reward_batch = Variable(torch.cat(batch.reward)).view((-1, 1))
            '''
            if self.use_cuda:
                next_state_batch = next_state_batch.cuda()
                state_batch = state_batch.cuda()
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()

            # Actor :
            # before optimization :
            self.optimizer_actor.zero_grad()

            # predict action :
            pred_action = self.actor(state_batch)
            # predict associated qvalues :
            pred_qsa = self.critic(state_batch, pred_action)
            # loss :
            actor_loss = -1.0 * torch.mean(torch.sum(pred_qsa))
            actor_loss.backward()

            # clamping :
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), 0.05)
            self.optimizer_actor.step()

            # Critic :
            # before optimization :
            self.optimizer_critic.zero_grad()

            # sample action from next_state, without gradient repercussion :
            next_taction = self.target_actor(next_state_batch).detach()
            # evaluate the next state-action over the target, without repercussion (faster...) :
            next_tqsa = torch.squeeze(self.target_critic(next_state_batch, next_taction).detach()).view((-1))

            # Supervised loss :
            ## y_true :
            y_true = reward_batch + self.gamma * next_tqsa
            ## y_pred :
            y_pred = torch.squeeze(self.critic(state_batch, action_batch))
            ## loss :
            # critic_loss = F.smooth_l1_loss(y_pred, y_true)
            criterion = nn.MSELoss()
            critic_loss = criterion(y_pred, y_true)
            critic_loss.backward()

            # clamping :
            torch.nn.utils.clip_grad_norm(self.critic.parameters(), 0.5)
            self.optimizer_critic.step()

            '''
            critic_grad = 0.0
            for p in self.critic.parameters():
                critic_grad += np.mean(p.grad.cpu().data.numpy())
            print('Mean Critic Grad : {}'.format(critic_grad))
            '''
            actor_grad = 0.0
            for p in self.actor.parameters():
                actor_grad += np.max(np.abs(p.grad.cpu().data.numpy()))
            # print('Mean Actor Grad : {}'.format(actor_grad))

            # UPDATE THE PR :
            loss = torch.abs(actor_loss) + torch.abs(critic_loss)
            # loss = torch.abs(actor_loss)  # + torch.abs(critic_loss)
            loss_np = loss.cpu().data.numpy()
            for (idx, new_error) in zip(batch.idx, loss_np):
                new_priority = self.memory.priority(new_error)
                # print('prior = {} / {}'.format(new_priority, self.rBuffer.total()))
                self.memory.update(idx, new_priority)

        except Exception as e:
            bashlogger.debug('error : {}'.format(e))

        # soft update :
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

        del batch
        del next_state_batch
        del state_batch
        del action_batch
        del reward_batch

        closs = critic_loss.cpu()
        aloss = actor_loss.cpu()
        del actor_loss
        del critic_loss

        return closs.data.numpy(), aloss.data.numpy(), actor_grad

    else:
        raise NotImplementedError
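# A sketch of how the optimize() variants above are typically driven from an
# outer loop, logging the returned (critic_loss, actor_loss, actor_grad) triple;
# the agent handle, update count and logging cadence are illustrative assumptions:
def train_offline(agent, nbr_updates=1000, log_every=100):
    for it in range(nbr_updates):
        out = agent.optimize()
        if out is None:          # not enough samples in the replay memory yet
            continue
        closs, aloss, actor_grad = out
        if it % log_every == 0:
            print('update {} : critic loss = {} / actor loss = {} / max |actor grad| = {}'
                  .format(it, closs, aloss, actor_grad))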