Example #1
    def __init__(self, env, num_inputs, action_space, args, running_state=None):
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.args = args
        self.env = env
        self.running_state = running_state
        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = args.device

        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
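
All of the examples in this listing rely on a hard_update helper (and several also call soft_update) to copy or Polyak-average parameters from a source network into a freshly created target network. The repositories above define their own versions; a minimal sketch of the usual PyTorch implementations, shown here only for reference, is:

def hard_update(target, source):
    # Copy every parameter of `source` into `target` (equivalent to a soft update with tau = 1).
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + source_param.data * tau)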
Example #2
	def __init__(self, actor, critic, memory, algo='ddpg',GAMMA=GAMMA,LR=LR,TAU=TAU,use_cuda=USE_CUDA,BATCH_SIZE=BATCH_SIZE,MIN_MEMORY=1e3 ) :
		self.actor = actor
		self.critic = critic
		self.target_actor = copy.deepcopy(actor)
		self.target_critic = copy.deepcopy(critic)

		self.use_cuda = use_cuda
		if self.use_cuda :
			self.actor = self.actor.cuda()
			self.target_actor = self.target_actor.cuda()
			self.critic = self.critic.cuda()
			self.target_critic = self.target_critic.cuda()


		self.memory = memory

		self.gamma = GAMMA
		self.lr = LR
		self.tau = TAU
		self.batch_size = BATCH_SIZE
		self.MIN_MEMORY = MIN_MEMORY

		self.mutex = Lock()
		
		self.noise = OrnsteinUhlenbeckNoise(self.actor.action_dim)

		hard_update(self.target_actor, self.actor)
		hard_update(self.target_critic, self.critic)

		self.algo = algo
		if self.algo == 'pddpg' :
			self.previous_actor = copy.deepcopy(self.actor)
			self.epsilon = 0.2
			if self.use_cuda :
				self.previous_actor = self.previous_actor.cuda()
Example #3
	def __init__(self, actor, critic, memory, algo='ddpg',GAMMA=GAMMA,LR=LR,TAU=TAU,use_cuda=USE_CUDA,BATCH_SIZE=BATCH_SIZE,MIN_MEMORY=1e3 ) :
		self.actor = actor
		self.critic = critic
		self.target_actor = copy.deepcopy(actor)
		self.target_critic = copy.deepcopy(critic)

		self.use_cuda = use_cuda
		if self.use_cuda :
			self.actor = self.actor.cuda()
			self.target_actor = self.target_actor.cuda()
			self.critic = self.critic.cuda()
			self.target_critic = self.target_critic.cuda()


		self.memory = memory

		self.gamma = GAMMA
		self.lr = LR
		self.tau = TAU
		self.batch_size = BATCH_SIZE
		self.MIN_MEMORY = MIN_MEMORY

		self.optimizer_actor = optim.Adam(self.actor.parameters(), self.lr)
		self.optimizer_critic = optim.Adam(self.critic.parameters(), self.lr*1e1)

		self.noise = OrnsteinUhlenbeckNoise(self.actor.action_dim)

		hard_update(self.target_actor, self.actor)
		hard_update(self.target_critic, self.critic)

		self.algo = algo
Example #4
	def __init__(self,index,model,env,memory,lr=1e-3,preprocess=T.ToTensor(),path=None,frompath=None,num_episodes=1000,epsend=0.05,epsstart=0.9,epsdecay=10,TAU=1e-3) :
		self.index = index
		self.model = model

		self.wmodel = copy.deepcopy(model)
		hard_update(self.wmodel,self.model)
		global use_cuda
		if use_cuda :
				self.wmodel = self.wmodel.cuda()
			
		self.envstr = env
		self.env = gym.make(self.envstr)
		self.env.reset()

		self.memory = memory
		self.lr = lr
		self.TAU = TAU

		self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr )
		bashlogger.info('Optimizer {}: ok.'.format(self.index) )

		self.preprocess = preprocess
		self.path = path
		self.frompath = frompath
		self.num_episodes = num_episodes
		self.epsend = epsend
		self.epsstart = epsstart
		self.epsdecay = epsdecay

		self.sl = statsLogger(path=self.path,filename='logs{}.csv'.format(self.index) )
		self.workerfn = lambda: self.train(model=self.wmodel,
										env=self.env,
										memory=self.memory,
										optimizer=self.optimizer,
										logger=self.sl,
										preprocess=self.preprocess,
										path=self.path,
										frompath=self.frompath,
										num_episodes=self.num_episodes,
										epsend=self.epsend,
										epsstart=self.epsstart,
										epsdecay=self.epsdecay)

		self.thread = threading.Thread(target=self.workerfn)
Example #5
    def from_worker2model(self):
        self.model.lock()

        self.optimizer.zero_grad()

        decay_loss = 0.5 * sum(
            [torch.mean(param * param) for param in self.model.parameters()])
        decay_loss.backward()

        for wparam, mparam in zip(self.wmodel.parameters(),
                                  self.model.parameters()):
            if mparam.grad is not None:
                mparam.grad = mparam.grad + wparam.grad

        self.optimizer.step()

        #update wmodel :
        hard_update(self.wmodel, self.model)

        #zero the working model gradients :
        self.wmodel.zero_grad()

        self.model.unlock()
Example #6
	def __init__(self, NN, memory, algo='ddpg',GAMMA=GAMMA,LR=LR,TAU=TAU,use_cuda=USE_CUDA,BATCH_SIZE=BATCH_SIZE ) :
		self.NN = NN
		self.target_NN = copy.deepcopy(NN)
		
		self.use_cuda = use_cuda
		if self.use_cuda :
			self.NN = self.NN.cuda()
			self.target_NN = self.target_NN.cuda()

		self.memory = memory

		self.gamma = GAMMA
		self.lr = LR
		self.tau = TAU
		self.batch_size = BATCH_SIZE

		self.optimizer = optim.Adam(self.NN.parameters(), self.lr)

		self.noise = OrnsteinUhlenbeckNoise(self.NN.action_dim)

		hard_update(self.target_NN,self.NN )

		self.algo = algo
Example #7
	def __init__(self, NN, memory, algo='ddpg',GAMMA=GAMMA,LR=LR,TAU=TAU,use_cuda=USE_CUDA,BATCH_SIZE=BATCH_SIZE,MIN_MEMORY=1e3 ) :
		self.NN = NN
		self.target_NN = copy.deepcopy(NN)
		
		self.use_cuda = use_cuda
		if self.use_cuda :
			self.NN = self.NN.cuda()
			self.target_NN = self.target_NN.cuda()

		self.memory = memory

		self.gamma = GAMMA
		self.lr = LR
		self.tau = TAU
		self.batch_size = BATCH_SIZE
		self.MIN_MEMORY = MIN_MEMORY

		self.mutex = Lock()
		
		self.noise = OrnsteinUhlenbeckNoise(self.NN.action_dim)

		hard_update(self.target_NN, self.NN )

		self.algo = algo
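
Examples #2, #3, #6 and #7 all construct an OrnsteinUhlenbeckNoise(action_dim) object for exploration without showing its definition. A minimal sketch of such a temporally correlated noise process is given below; the parameter names (mu, theta, sigma) and the sample()/reset() interface are assumptions, not the exact API of the repositories above.

import numpy as np

class OrnsteinUhlenbeckNoise:
    # Temporally correlated exploration noise commonly paired with DDPG actors.
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Restart the process at its mean.
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, I)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state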
Example #8
    def __init__(self, state_dim, action_dim, option_dim, max_action,
                 action_space):

        self.alpha = 0.2
        self.lr = 0.0003
        self.option_num = option_dim

        self.policy_type = "Gaussian"
        self.target_update_interval = 1
        self.automatic_entropy_tuning = True

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        """ critic network """
        self.critic = QNetwork(state_dim, action_dim,
                               400).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(state_dim, action_dim,
                                      400).to(self.device)
        hard_update(self.critic_target, self.critic)

        self.sampling_prob = torch.FloatTensor(state).to(self.device)
        # ===================================================================== #
        #                              Option Model                             #
        # ===================================================================== #
        self.option_state_input, self.option_action_input, self.option_input_concat, self.option_out_dec, \
                                self.option_out, self.option_out_noise, self.option_model = self.create_option_model()
        Advantage = tf.stop_gradient(self.target_q_value -
                                     self.predicted_v_value)
        Weight = np.divide(np.exp(Advantage - np.max(Advantage)),
                           self.sampling_prob)
        W_norm = Weight / K.mean(Weight)

        critic_conditional_entropy = weighted_entropy(self.option_out,
                                                      tf.stop_gradient(W_norm))
        p_weighted_ave = weighted_mean(self.option_out,
                                       tf.stop_gradient(W_norm))
        self.critic_entropy = critic_conditional_entropy - self.c_ent * entropy(
            p_weighted_ave)

        self.vat_loss = kl(self.option_out, self.option_out_noise)
        self.reg_loss = metrics.mean_absolute_error(self.option_input_concat,
                                                    self.option_out_dec)
        self.option_loss = self.reg_loss + self.entropy_coeff * (
            self.critic_entropy) + self.c_reg * self.vat_loss
        self.option_optimize = tf.train.AdamOptimizer(self.option_lr).minimize(
            self.option_loss)
        """ option network """
        self.it = 0

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(state_dim, action_dim, 400,
                                         max_action).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        elif self.policy_type == "Multi_Gaussian":
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(state_dim, action_dim, 400,
                                         max_action).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(state_dim, action_dim, 400,
                                              max_action).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
Example #9
    def train(self,
              model,
              env,
              memory,
              optimizer,
              logger=None,
              preprocess=T.ToTensor(),
              path=None,
              frompath=None,
              num_episodes=1000,
              epsend=0.05,
              epsstart=0.9,
              epsdecay=200,
              k=4,
              strategy='future',
              singlegoal=False):
        try:
            episode_durations = []
            episode_reward = []
            episode_loss = []
            global rendering
            global use_cuda
            global MAX_STEPS
            #exploration counter ;
            steps_done = [0]

            #Double Network initialization :
            savemodel(model, path + '.save')
            model_ = copy.deepcopy(model)
            hard_update(model_, model)
            model_.eval()

            if use_cuda:
                model_ = model_.cuda()

            self.accumulateMemory(memory,
                                  env,
                                  model,
                                  preprocess,
                                  epsstart=0.5,
                                  epsend=0.3,
                                  epsdecay=200,
                                  k=k,
                                  strategy=strategy)

            for i in range(num_episodes):
                bashlogger.info('Episode : {} : memory : {}/{}'.format(
                    i, len(memory), memory.capacity))
                cumul_reward = 0.0
                last_screen = get_screen_reset(env, preprocess=preprocess)
                current_screen, reward, done, info = get_screen(
                    env, env.action_space.sample(), preprocess=preprocess)
                state = current_screen - last_screen

                episode_buffer = []
                meanfreq = 0
                episode_loss_buffer = []

                #HER : sample initial goal :
                if not singlegoal:
                    init_goal = sample_init_goal(memory)
                else:
                    init_goal = torch.zeros(current_screen.size())

                showcount = 0

                for t in count():
                    #model.eval()

                    #HER :
                    stategoal = torch.cat([state, init_goal], dim=1)

                    #action = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
                    action = select_action(model,
                                           stategoal,
                                           steps_done=steps_done,
                                           epsend=epsend,
                                           epsstart=epsstart,
                                           epsdecay=epsdecay)

                    last_screen = current_screen
                    current_screen, reward, done, info = get_screen(
                        env, action[0, 0], preprocess=preprocess)
                    cumul_reward += reward

                    if rendering:
                        if showcount >= 10:
                            showcount = 0
                            render(current_screen)  #env.render()
                        else:
                            showcount += 1

                    if not done:
                        next_state = current_screen - last_screen
                    else:
                        next_state = torch.zeros(current_screen.size())

                    episode_buffer.append(
                        EXP(state, action, next_state, reward, done))

                    state = next_state

                    # OPTIMIZE MODEL :
                    since = time.time()
                    lossnp = self.optimize_model(model, model_, memory,
                                                 optimizer)
                    if lossnp is not None:
                        episode_loss_buffer.append(np.mean(lossnp))
                    else:
                        episode_loss_buffer.append(0)

                    # SOFT UPDATE :
                    soft_update(model_, model, self.TAU)

                    elt = time.time() - since
                    f = 1.0 / elt
                    meanfreq = (meanfreq * (t + 1) + f) / (t + 2)

                    if done or t > MAX_STEPS:
                        self.from_worker2model()
                        '''
						nbrTrain = 200
						for it in range(nbrTrain) :
							since = time.time()
							lossnp = optimize_model(model,model_,memory,optimizer)
							if lossnp is not None :
								episode_loss_buffer.append(  np.mean(lossnp) )
							else :
								episode_loss_buffer.append(0)
							
							elt = time.time() - since
							f = 1.0/elt
							meanfreq = (meanfreq*(it+1) + f)/(it+2)
							#print('{} Hz ; {} seconds.'.format(f,elt) )
						'''
                        episode_durations.append(t + 1)
                        episode_reward.append(cumul_reward)
                        meanloss = np.mean(episode_loss_buffer)
                        episode_loss.append(meanloss)

                        log = 'Episode duration : {}'.format(
                            t + 1
                        ) + '---' + ' Reward : {} // Mean Loss : {}'.format(
                            cumul_reward,
                            meanloss) + '---' + ' {}Hz'.format(meanfreq)
                        bashlogger.info(log)
                        if logger is not None:
                            new = {
                                'episodes': [i],
                                'duration': [t + 1],
                                'reward': [cumul_reward],
                                'mean frequency': [meanfreq],
                                'loss': [meanloss]
                            }
                            logger.append(new)

                        if path is not None:
                            # SAVE THE MAIN MODEL :
                            self.model.lock()
                            savemodel(self.model, path + '.save')
                            self.model.unlock()
                            bashlogger.info('Model saved : {}'.format(path))
                        #plot_durations()
                        break

                #Let us add this episode_buffer to the replayBuffer :
                for itexp in range(len(episode_buffer)):
                    el = episode_buffer[itexp]
                    #HER : reward with init_goal
                    HERreward = reward_function(el.state, init_goal)
                    reward = HERreward + el.reward

                    #store this transition with init_goal :
                    init_el = EXP(
                        state=torch.cat([el.state, init_goal], dim=1),
                        action=el.action,
                        next_state=torch.cat([el.next_state, init_goal],
                                             dim=1),
                        reward=reward,
                        done=el.done)

                    init_priority = memory.priority(
                        torch.abs(init_el.reward).numpy())

                    memory.add(init_el, init_priority)

                    #store for multiple goals :
                    #1: sample new goal :
                    goals = []
                    for j in range(k):
                        goal = None
                        if strategy == 'final':
                            goal = sample_goal(episode_buffer,
                                               strategy=strategy)
                        elif strategy == 'future':
                            # watch out for the empty set...
                            index = min(len(episode_buffer) - 3, itexp)
                            goal = sample_goal(episode_buffer, strategy=index)
                        goals.append(goal)

                    #For each goal ...
                    for goal in goals:
                        #2: .. compute reward :
                        goalreward = reward_function(el.state,
                                                     goal) + el.reward
                        #3: ... store this transition with goal :
                        goalel = EXP(state=torch.cat([el.state, goal], dim=1),
                                     action=el.action,
                                     next_state=torch.cat(
                                         [el.next_state, goal], dim=1),
                                     reward=goalreward,
                                     done=el.done)

                        init_priority = memory.priority(
                            torch.abs(goalel.reward).numpy())
                        memory.add(goalel, init_priority)

                    del el
                    del goals

                del episode_buffer

            bashlogger.info('Complete')
            if path is not None:
                savemodel(model, path + '.save')
                bashlogger.info('Model saved : {}'.format(path))

            env.close()
        except Exception as e:
            bashlogger.exception(e)
Example #10
	def train(self,model,env,memory,optimizer,logger=None,preprocess=T.ToTensor(),path=None,frompath=None,num_episodes=1000,epsend=0.05,epsstart=0.9,epsdecay=10): 
		try :
			episode_durations = []
			episode_reward = []
			episode_loss = []
			global rendering
			global use_cuda
			global MAX_STEPS
			#exploration counter ;
			steps_done = [0]
			
			#Double Network initialization :
			savemodel(model,path+'.save')
			#model_ = DuelingDQN(model.nbr_actions)
			model_ = copy.deepcopy(model)
			hard_update(model_,model)
			model_.eval()
			
			if use_cuda :
				model_ = model_.cuda()
				
			for i in range(num_episodes) :
				bashlogger.info('Episode : {} : memory : {}/{}'.format(i,len(memory),memory.capacity) )
				cumul_reward = 0.0
				last_screen = get_screen_reset(env,preprocess=preprocess)
				current_screen, reward, done, info = get_screen(env,env.action_space.sample(),preprocess=preprocess )
				state = current_screen - last_screen
				
				episode_buffer = []
				meanfreq = 0
				episode_loss_buffer = []
				episode_qsa_buffer = []

				
				showcount = 0
				for t in count() :
					
					action,qsa = select_action(model,state,steps_done=steps_done,epsend=epsend,epsstart=epsstart,epsdecay=epsdecay)
					episode_qsa_buffer.append(qsa)
					last_screen = current_screen
					current_screen, reward, done, info = get_screen(env,action[0,0],preprocess=preprocess)
					cumul_reward += reward

					if rendering :
						if showcount >= 10 :
							showcount = 0
							render(current_screen)#env.render()
						else :
							showcount +=1
					
					reward = FloatTensor([reward])

					if not done :
						next_state = current_screen -last_screen
					else :
						next_state = torch.zeros(current_screen.size())

					episode_buffer.append( EXP(state,action,next_state,reward,done) )

					state = next_state

					# OPTIMIZE MODEL :
					since = time.time()		
					lossnp = self.optimize_model(model,model_,memory,optimizer)
					if lossnp is not None :
						episode_loss_buffer.append(  np.mean(lossnp) )
					else :
						episode_loss_buffer.append(0)
						
					# SOFT UPDATE :
					soft_update(model_,model,self.TAU)
				
					elt = time.time() - since
					f = 1.0/elt
					meanfreq = (meanfreq*(t+1) + f)/(t+2)
					
					if done or t > MAX_STEPS:
						self.from_worker2model()

						'''
						nbrTrain = 2
						for tr in range(nbrTrain) :
							since = time.time()		
							lossnp = optimize_model(model,model_,memory,optimizer)
							if lossnp is not None :
								episode_loss_buffer.append(  np.mean(lossnp) )
							else :
								episode_loss_buffer.append(0)
								
							elt = time.time() - since
							f = 1.0/elt
							meanfreq = (meanfreq*(tr+1) + f)/(tr+2)
							#print('{} Hz ; {} seconds.'.format(f,elt) )
						'''	
						episode_durations.append(t+1)
						episode_reward.append(cumul_reward)
						meanloss = np.mean(episode_loss_buffer)
						episode_loss.append(meanloss)
						meanqsa = np.mean(episode_qsa_buffer)


						log = 'Episode duration : {}'.format(t+1) +'---' +' Reward : {} // Mean Loss : {} // QSA : {}'.format(cumul_reward,meanloss,meanqsa) +'---'+' {}Hz'.format(meanfreq)
						bashlogger.info(log)
						if logger is not None :
							new = {'episodes':[i],'duration':[t+1],'reward':[cumul_reward],'mean frequency':[meanfreq],'loss':[meanloss]}
							logger.append(new)

						if path is not None :
							# SAVE THE MAIN MODEL :
							self.model.lock()
							savemodel(self.model,path+'.save')
							self.model.unlock()
							bashlogger.info('Model saved : {}'.format(path) )
						#plot_durations()
						break


				#Let us add this episode_buffer to the replayBuffer :
				for el in episode_buffer :
					init_priority = memory.priority( torch.abs(el.reward).cpu().numpy() )
					memory.add(el,init_priority)
				del episode_buffer

			bashlogger.info('Complete')
			if path is not None :
				savemodel(model,path+'.save')
				bashlogger.info('Model saved : {}'.format(path) )
			
			env.close()
		except Exception as e :
			bashlogger.exception(e)
Example #11
    def __init__(self, logger, obs_dim, action_space, userconfig):
        super().__init__(logger=logger,
                         obs_dim=obs_dim,
                         action_dim=4,
                         userconfig=userconfig)
        self.action_space = action_space
        self.device = userconfig['device']
        self.alpha = userconfig['alpha']
        self.automatic_entropy_tuning = self._config[
            'automatic_entropy_tuning']
        self.eval_mode = False

        if self._config['lr_milestones'] is None:
            raise ValueError(
                'lr_milestones argument cannot be None!\nExample: --lr_milestones=100 200 300'
            )

        lr_milestones = [
            int(x) for x in (self._config['lr_milestones'][0]).split(' ')
        ]

        self.actor = ActorNetwork(input_dims=obs_dim,
                                  learning_rate=self._config['learning_rate'],
                                  action_space=self.action_space,
                                  hidden_sizes=[256, 256],
                                  lr_milestones=lr_milestones,
                                  lr_factor=self._config['lr_factor'],
                                  device=self._config['device'])

        self.critic = CriticNetwork(
            input_dim=obs_dim,
            n_actions=4,
            learning_rate=self._config['learning_rate'],
            hidden_sizes=[256, 256],
            lr_milestones=lr_milestones,
            lr_factor=self._config['lr_factor'],
            device=self._config['device'])

        self.critic_target = CriticNetwork(
            input_dim=obs_dim,
            n_actions=4,
            learning_rate=self._config['learning_rate'],
            hidden_sizes=[256, 256],
            lr_milestones=lr_milestones,
            device=self._config['device'])

        hard_update(self.critic_target, self.critic)

        if self.automatic_entropy_tuning:
            milestones = [
                int(x)
                for x in (self._config['alpha_milestones'][0]).split(' ')
            ]
            self.target_entropy = -torch.tensor(4).to(self.device)
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = torch.optim.Adam([self.log_alpha],
                                                lr=self._config['alpha_lr'])
            self.alpha_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.alpha_optim, milestones=milestones, gamma=0.5)
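
Examples #1, #8 and #11 each create log_alpha, target_entropy and alpha_optim for automatic entropy tuning, but none of them show the corresponding update. The sketch below illustrates the standard SAC temperature step those attributes are normally used for; `agent` and `log_pi` (the log-probability returned by the policy's sample method) are placeholder names for illustration, not identifiers from the repositories above.

def update_temperature(agent, log_pi):
    # Standard SAC temperature loss: adjust alpha so that the policy entropy
    # tracks agent.target_entropy; log_pi is treated as a constant here.
    alpha_loss = -(agent.log_alpha * (log_pi + agent.target_entropy).detach()).mean()
    agent.alpha_optim.zero_grad()
    alpha_loss.backward()
    agent.alpha_optim.step()
    # The updated temperature used in the next actor/critic update.
    return agent.log_alpha.exp()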
Example #12
	def optimize(self,optimizer_critic,optimizer_actor) :
		'''
		self.target_critic.eval()
		self.target_actor.eval()
		self.critic.train()
		self.actor.train()
		'''

		if self.algo == 'ddpg' :
			try :
				if len(self.memory) < self.MIN_MEMORY :
					return
				
				#Create Batch 
				self.mutex.acquire()
				
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					#with PR :
					prioritysum = self.memory.total()
					randexp = np.random.random(size=self.batch_size)*prioritysum
					batch = list()
					for i in range(self.batch_size):
						try :
							el = self.memory.get(randexp[i])
							batch.append(el)
						except TypeError as e :
							continue
							#print('REPLAY BUFFER EXCEPTION...')
				else :
					#with random RB :
					batch = self.memory.sample(self.batch_size)

				self.mutex.release()

				if len(batch) == 0 :
					return

				# Create Batch with replayMemory :
				batch = TransitionPR( *zip(*batch) )
				next_state_batch = Variable(torch.cat( batch.next_state))#, requires_grad=False)
				state_batch = Variable( torch.cat( batch.state) )#, requires_grad=False)
				action_batch = Variable( torch.cat( batch.action) )#, requires_grad=False)
				reward_batch = Variable( torch.cat( batch.reward ) )#, requires_grad=False ).view((-1))
				terminal_batch = Variable( torch.cat( batch.done ) )
				
				'''
				next_state_batch = Variable(torch.cat( batch.next_state) )
				state_batch = Variable( torch.cat( batch.state) )
				action_batch = Variable( torch.cat( batch.action) )
				reward_batch = Variable( torch.cat( batch.reward ) ).view((-1,1))
				'''
				
				if self.use_cuda :
					next_state_batch = next_state_batch.cuda()
					state_batch = state_batch.cuda()
					action_batch = action_batch.cuda()
					reward_batch = reward_batch.cuda()
					terminal_batch = terminal_batch.cuda()

				
				# Critic :
				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				# sample action from next_state, without gradient repercussion :
				next_taction = self.target_actor(next_state_batch).detach()
				# evaluate the next state action over the target, without repercussion (faster...) :
				next_tqsa = torch.squeeze( self.target_critic( next_state_batch, next_taction).detach() ).view((-1))
				# Supervised loss :
				## y_true :
				y_true = reward_batch + (1.0-terminal_batch)*self.gamma*next_tqsa
				#print(torch.cat([y_true.view((-1,1)),terminal_batch.view((-1,1))],dim=1) )

				## y_pred :
				y_pred = torch.squeeze( self.critic(state_batch,action_batch) )
				## loss :
				critic_loss = F.smooth_l1_loss(y_pred,y_true)
				#criterion = nn.MSELoss()
				#critic_loss = criterion(y_pred,y_true)
				critic_loss.backward()
				
				#weight decay :
				decay_loss = 0.5*sum( [torch.mean(param*param) for param in self.critic.parameters()])
				decay_loss.backward()

				#clamping :
				#torch.nn.utils.clip_grad_norm(self.critic.parameters(),50)				
				optimizer_critic.step()
				
				###################################
				'''
				
				# Actor :
				#predict action :
				pred_action = self.actor(state_batch)
				#predict associated qvalues :
				pred_qsa = self.critic(state_batch, pred_action)
				#pred_qsa = self.target_critic(state_batch, pred_action)
				
				# loss :
				actor_loss = -1.0*torch.mean(torch.sum( pred_qsa) )
				
				#actor_loss = F.smooth_l1_loss( pred_qsa, Variable(torch.zeros(pred_qsa.size() )).cuda() )
				
				#criterion = nn.MSELoss()
				#actor_loss = criterion( pred_qsa, Variable(torch.zeros(pred_qsa.size() )).cuda() )
				
				#before optimization :
				optimizer_actor.zero_grad()
				self.actor.zero_grad()
				actor_loss.backward()
				#clamping :
				#clampactor = 1e2#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
				#torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)				
				optimizer_actor.step()


				###################################
				'''
				###################################
				
				# Actor :
				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				
				#predict action :
				pred_action = self.actor(state_batch) 
				var_action = Variable( pred_action.cpu().data, requires_grad=True)
				if self.use_cuda :
					var_action_c = var_action.cuda()
					pred_qsa = self.critic(state_batch, var_action_c)
				else :
					pred_qsa = self.critic(state_batch, var_action)
				#predict associated qvalues :
				gradout = torch.ones(pred_qsa.size())
				if self.use_cuda:
					gradout = gradout.cuda()
				pred_qsa.backward( gradout )

				if self.use_cuda :
					gradcritic = var_action.grad.data.cuda()
				else :
					gradcritic = var_action.grad.data

				pred_action.backward( -gradcritic)
				
				#weight decay :
				decay_loss = 0.5*sum( [torch.mean(param*param) for param in self.actor.parameters()])
				decay_loss.backward()

				#clamping :
				clampactor = 5e0#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
				torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)				
				optimizer_actor.step()
				# loss :
				actor_loss = -1.0*torch.mean(torch.sum( pred_qsa) )
				
				###################################

				
				'''
				critic_grad = 0.0
				for p in self.critic.parameters() :
					critic_grad += np.mean(p.grad.cpu().data.numpy())
				print( 'Mean Critic Grad : {}'.format(critic_grad) )
				'''
				
				actor_grad = 0.0
				for p in self.actor.parameters() :
					actor_grad += np.max( np.abs(p.grad.cpu().data.numpy() ) )
				#print( 'Mean Actor Grad : {}'.format(actor_grad) )
				

				#UPDATE THE PR :
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					self.mutex.acquire()
					loss = torch.abs(actor_loss) + torch.abs(critic_loss)
					#loss = torch.abs(actor_loss) #+ torch.abs(critic_loss)
					loss_np = loss.cpu().data.numpy()
					for (idx, new_error) in zip(batch.idx,loss_np) :
						new_priority = self.memory.priority(new_error)
						#print( 'prior = {} / {}'.format(new_priority,self.rBuffer.total()) )
						self.memory.update(idx,new_priority)
					self.mutex.release()

			except Exception as e :
				bashlogger.debug('error : {}'.format(e))
				raise e

			# soft update :
			soft_update(self.target_critic, self.critic, self.tau)
			soft_update(self.target_actor, self.actor, self.tau)

			
			closs = critic_loss.cpu()
			aloss = actor_loss.cpu()
			
			return closs.data.numpy(), aloss.data.numpy(), actor_grad

		elif self.algo == 'pddpg' :
			try :
				if len(self.memory) < self.MIN_MEMORY :
					return
				
				#Create Batch 
				self.mutex.acquire()
				
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					#with PR :
					prioritysum = self.memory.total()
					randexp = np.random.random(size=self.batch_size)*prioritysum
					batch = list()
					for i in range(self.batch_size):
						try :
							el = self.memory.get(randexp[i])
							batch.append(el)
						except TypeError as e :
							continue
							#print('REPLAY BUFFER EXCEPTION...')
				else :
					#with random RB :
					batch = self.memory.sample(self.batch_size)

				self.mutex.release()

				if len(batch) == 0 :
					return

				# Create Batch with replayMemory :
				batch = TransitionPR( *zip(*batch) )
				next_state_batch = Variable(torch.cat( batch.next_state))#, requires_grad=False)
				state_batch = Variable( torch.cat( batch.state) )#, requires_grad=False)
				action_batch = Variable( torch.cat( batch.action) )#, requires_grad=False)
				reward_batch = Variable( torch.cat( batch.reward ) )#, requires_grad=False ).view((-1))
				'''
				next_state_batch = Variable(torch.cat( batch.next_state) )
				state_batch = Variable( torch.cat( batch.state) )
				action_batch = Variable( torch.cat( batch.action) )
				reward_batch = Variable( torch.cat( batch.reward ) ).view((-1,1))
				'''
				
				if self.use_cuda :
					next_state_batch = next_state_batch.cuda()
					state_batch = state_batch.cuda()
					action_batch = action_batch.cuda()
					reward_batch = reward_batch.cuda()

				
				# Critic :
				# sample action from next_state, without gradient repercussion :
				next_taction = self.target_actor(next_state_batch).detach()
				# evaluate the next state action over the target, without repercussion (faster...) :
				next_tqsa = torch.squeeze( self.target_critic( next_state_batch, next_taction).detach() ).view((-1))
				# Supervised loss :
				## y_true :
				y_true = reward_batch + self.gamma*next_tqsa 
				## y_pred :
				y_pred = torch.squeeze( self.critic(state_batch,action_batch) )
				## loss :
				critic_loss = F.smooth_l1_loss(y_pred,y_true)
				#criterion = nn.MSELoss()
				#critic_loss = criterion(y_pred,y_true)
				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				critic_loss.backward()
				#clamping :
				#torch.nn.utils.clip_grad_norm(self.critic.parameters(),50)				
				optimizer_critic.step()
				
				
				###################################
				
				# Actor :
				#predict action with old weights :
				pred_old_action = self.previous_actor(state_batch)
				#predict action with current weights :
				pred_action = self.actor(state_batch) 
				
				var_action = Variable( pred_action.cpu().data, requires_grad=True)
				var_old_action = Variable( pred_old_action.cpu().data, requires_grad=True)
				
				if self.use_cuda :
					var_action_c = var_action.cuda()
					var_old_action_c = var_old_action.cuda()
					#predict associated qvalues :
					pred_qsa = self.critic(state_batch, var_action_c)
					pred_old_qsa = self.critic(state_batch, var_old_action_c)
				else :
					#predict associated qvalues :
					pred_qsa = self.critic(state_batch, var_action)
					pred_old_qsa = self.critic(state_batch, var_old_action)
				
				#helper vars :
				clipped_m = (1.0-self.epsilon)#*torch.ones(ratio.size())
				clipped_p = (1.0+self.epsilon)#*torch.ones(ratio.size())
				gradout = torch.ones(pred_qsa.size())
				if self.use_cuda:
					gradout = gradout.cuda()
					
				#compute ratios :
				ratio = pred_qsa/pred_old_qsa
				clipped_ratio = ratio.clamp(clipped_m,clipped_p)
				p_actor_loss = torch.min(ratio,clipped_ratio)
				p_actor_loss.backward( gradout )

				#before optimization :
				self.critic.zero_grad()
				self.actor.zero_grad()
				optimizer_critic.zero_grad()
				optimizer_actor.zero_grad()
				if self.use_cuda :
					gradcritic = var_action.grad.data.cuda()
					pred_action.backward( -gradcritic)
				else :
					pred_action.backward( -var_action.grad.data)
				#clamping :
				#clampactor = 5e1#np.max( [ 0.25, 1.0/np.max( [ 5e-1, np.abs( np.mean(critic_loss.cpu().data.numpy() ) ) ] ) ] )
				#torch.nn.utils.clip_grad_norm(self.actor.parameters(),clampactor)				
				
				#proximal update, before the update of the weights :
				hard_update(self.previous_actor, self.actor)
				#update of the weights :
				optimizer_actor.step()
				# loss :
				actor_loss = -1.0*torch.mean( pred_qsa )
				
				###################################

				
				'''
				critic_grad = 0.0
				for p in self.critic.parameters() :
					critic_grad += np.mean(p.grad.cpu().data.numpy())
				print( 'Mean Critic Grad : {}'.format(critic_grad) )
				'''
				
				actor_grad = 0.0
				for p in self.actor.parameters() :
					actor_grad += np.max( np.abs(p.grad.cpu().data.numpy() ) )
				#print( 'Mean Actor Grad : {}'.format(actor_grad) )
				

				#UPDATE THE PR :
				if isinstance(self.memory, PrioritizedReplayBuffer) :
					self.mutex.acquire()
					loss = torch.abs(actor_loss) + torch.abs(critic_loss)
					#loss = torch.abs(actor_loss) #+ torch.abs(critic_loss)
					loss_np = loss.cpu().data.numpy()
					for (idx, new_error) in zip(batch.idx,loss_np) :
						new_priority = self.memory.priority(new_error)
						#print( 'prior = {} / {}'.format(new_priority,self.rBuffer.total()) )
						self.memory.update(idx,new_priority)
					self.mutex.release()

			except Exception as e :
				bashlogger.debug('error : {}'.format(e))
				raise e

			# soft update :
			soft_update(self.target_critic, self.critic, self.tau)
			soft_update(self.target_actor, self.actor, self.tau)

			
			closs = critic_loss.cpu()
			aloss = actor_loss.cpu()
			
			return closs.data.numpy(), aloss.data.numpy(), actor_grad

		else :
			raise NotImplementedError
Example #13
	def load(self, path) :
		self.actor.load_state_dict( torch.load(path+'.actor') )
		hard_update(self.target_actor, self.actor)
		self.critic.load_state_dict( torch.load(path+'.critic') )
		hard_update(self.target_critic, self.critic)
Example #14
	def load(self, path) :
		self.NN.load_state_dict( torch.load(path) )
		hard_update(self.target_NN, self.NN)
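
The two load methods above restore the online networks from disk and then rebuild the target networks with hard_update. For completeness, a matching save counterpart for Example #13 could look like the sketch below; only the '.actor'/'.critic' suffixes come from the original code, the rest is an assumption.

import torch

def save(agent, path):
    # Persist only the online networks; load() recreates the targets via hard_update().
    torch.save(agent.actor.state_dict(), path + '.actor')
    torch.save(agent.critic.state_dict(), path + '.critic')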